datacompose-0.2.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datacompose has been flagged as potentially problematic.
- datacompose-0.2.4/CHANGELOG.md +108 -0
- datacompose-0.2.4/LICENSE +21 -0
- datacompose-0.2.4/MANIFEST.in +38 -0
- datacompose-0.2.4/PKG-INFO +431 -0
- datacompose-0.2.4/README.md +384 -0
- datacompose-0.2.4/datacompose/__init__.py +1 -0
- datacompose-0.2.4/datacompose/cli/__init__.py +5 -0
- datacompose-0.2.4/datacompose/cli/colors.py +80 -0
- datacompose-0.2.4/datacompose/cli/commands/__init__.py +3 -0
- datacompose-0.2.4/datacompose/cli/commands/add.py +215 -0
- datacompose-0.2.4/datacompose/cli/commands/init.py +451 -0
- datacompose-0.2.4/datacompose/cli/commands/list.py +118 -0
- datacompose-0.2.4/datacompose/cli/commands/upgrade.py +7 -0
- datacompose-0.2.4/datacompose/cli/main.py +59 -0
- datacompose-0.2.4/datacompose/cli/validation.py +72 -0
- datacompose-0.2.4/datacompose/generators/__init__.py +3 -0
- datacompose-0.2.4/datacompose/generators/base.py +193 -0
- datacompose-0.2.4/datacompose/generators/pyspark/__init__.py +1 -0
- datacompose-0.2.4/datacompose/generators/pyspark/generator.py +51 -0
- datacompose-0.2.4/datacompose/operators/__init__.py +21 -0
- datacompose-0.2.4/datacompose/operators/primitives.py +595 -0
- datacompose-0.2.4/datacompose/transformers/__init__.py +0 -0
- datacompose-0.2.4/datacompose/transformers/discovery.py +186 -0
- datacompose-0.2.4/datacompose/transformers/text/__init__.py +1 -0
- datacompose-0.2.4/datacompose/transformers/text/clean_addresses/__init__.py +1 -0
- datacompose-0.2.4/datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
- datacompose-0.2.4/datacompose/transformers/text/clean_emails/__init__.py +1 -0
- datacompose-0.2.4/datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
- datacompose-0.2.4/datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- datacompose-0.2.4/datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
- datacompose-0.2.4/datacompose.egg-info/PKG-INFO +431 -0
- datacompose-0.2.4/datacompose.egg-info/SOURCES.txt +89 -0
- datacompose-0.2.4/datacompose.egg-info/dependency_links.txt +1 -0
- datacompose-0.2.4/datacompose.egg-info/entry_points.txt +2 -0
- datacompose-0.2.4/datacompose.egg-info/requires.txt +18 -0
- datacompose-0.2.4/datacompose.egg-info/top_level.txt +1 -0
- datacompose-0.2.4/pyproject.toml +89 -0
- datacompose-0.2.4/setup.cfg +5 -0
- datacompose-0.2.4/tests/__init__.py +0 -0
- datacompose-0.2.4/tests/integration/__init__.py +1 -0
- datacompose-0.2.4/tests/integration/test_end_to_end.py +182 -0
- datacompose-0.2.4/tests/integration/test_generated_imports.py +165 -0
- datacompose-0.2.4/tests/unit/cli/.venv/bin/activate_this.py +59 -0
- datacompose-0.2.4/tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py +101 -0
- datacompose-0.2.4/tests/unit/cli/__init__.py +1 -0
- datacompose-0.2.4/tests/unit/cli/build/__init__.py +0 -0
- datacompose-0.2.4/tests/unit/cli/build/postgres/__init__.py +0 -0
- datacompose-0.2.4/tests/unit/cli/build/postgres/clean_emails/__init__.py +0 -0
- datacompose-0.2.4/tests/unit/cli/build/postgres/clean_emails/email_cleaner_udf_spec.yaml +67 -0
- datacompose-0.2.4/tests/unit/cli/build/postgres/clean_emails/test_email_cleaner_udf.py +102 -0
- datacompose-0.2.4/tests/unit/cli/build/spark/__init__.py +0 -0
- datacompose-0.2.4/tests/unit/cli/build/spark/clean_emails/__init__.py +0 -0
- datacompose-0.2.4/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf.py +196 -0
- datacompose-0.2.4/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf_spec.yaml +67 -0
- datacompose-0.2.4/tests/unit/cli/build/spark/clean_emails/test_email_cleaner_udf.py +102 -0
- datacompose-0.2.4/tests/unit/cli/test_add_command.py +177 -0
- datacompose-0.2.4/tests/unit/cli/test_add_validation.py +96 -0
- datacompose-0.2.4/tests/unit/cli/test_init_command.py +123 -0
- datacompose-0.2.4/tests/unit/cli/test_list_command.py +239 -0
- datacompose-0.2.4/tests/unit/cli/test_main.py +51 -0
- datacompose-0.2.4/tests/unit/generators/__init__.py +1 -0
- datacompose-0.2.4/tests/unit/generators/test_base_generator.py +330 -0
- datacompose-0.2.4/tests/unit/generators/test_spark_generator.py +266 -0
- datacompose-0.2.4/tests/unit/operators/conditional_tests_common.py +26 -0
- datacompose-0.2.4/tests/unit/operators/conftest.py +61 -0
- datacompose-0.2.4/tests/unit/operators/test_conditional_complex_logic.py +200 -0
- datacompose-0.2.4/tests/unit/operators/test_conditional_data_driven.py +117 -0
- datacompose-0.2.4/tests/unit/operators/test_conditional_edge_cases.py +150 -0
- datacompose-0.2.4/tests/unit/operators/test_conditional_error_handling.py +67 -0
- datacompose-0.2.4/tests/unit/operators/test_conditional_parameters.py +94 -0
- datacompose-0.2.4/tests/unit/operators/test_conditional_performance.py +106 -0
- datacompose-0.2.4/tests/unit/operators/test_conditional_real_world.py +183 -0
- datacompose-0.2.4/tests/unit/operators/test_operators.py +951 -0
- datacompose-0.2.4/tests/unit/transformers/__init__.py +1 -0
- datacompose-0.2.4/tests/unit/transformers/test_discovery.py +171 -0
- datacompose-0.2.4/tests/unit/transformers/text/common/test_common.py +0 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_building_unit_extraction.py +791 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_city_state_extraction.py +831 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_clean_addresses.py +291 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_country_extraction.py +247 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_data_addresses.py +627 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_po_box_extraction.py +317 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_street_extraction.py +1038 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_zip_code_extraction.py +996 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_emails/test_debug_long_emails.py +46 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_emails/test_email_extraction.py +848 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_emails/test_email_optimized.py +219 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_phone_numbers/test_phone_extraction.py +611 -0
- datacompose-0.2.4/tests/unit/transformers/text/test_phone_numbers/test_phone_formatting.py +489 -0
- datacompose-0.2.4/tests/yaml_specs/__init__.py +1 -0
@@ -0,0 +1,108 @@ CHANGELOG.md

# Changelog

All notable changes to Datacompose will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [0.2.4] - 2024-08-12

### Added
- **Phone Number Primitives**: Complete set of 45+ phone number transformation functions
  - NANP validation and formatting (North American Numbering Plan)
  - International phone support with E.164 formatting
  - Extension handling and toll-free detection
  - Phone number extraction from text
  - Letter-to-number conversion (1-800-FLOWERS support)
- **Address Improvements**: Enhanced street extraction and standardization
  - Fixed numbered street extraction ("5th Avenue" correctly returns "5th")
  - Improved null handling in street extraction
  - Custom mapping support for street suffix standardization
- **Utils Export**: Generated code now includes `utils/primitives.py` for standalone deployment
  - `PrimitiveRegistry` class embedded with generated code
  - No runtime dependency on the datacompose package
  - Fallback imports for maximum compatibility

### Changed
- **BREAKING**: Renamed `PrimitiveNameSpace` to `PrimitiveRegistry` throughout the codebase
- **Major Architecture Shift**: Removed the YAML/spec file system entirely
  - No more YAML specifications or JSON replacements
  - Direct primitive file copying instead of template rendering
  - Simplified discovery system that works with transformer directories
  - Removed the `validate` command completely
- **Import Strategy**: Primitives now try a local utils import first, falling back to the datacompose package
- **File Naming**: Generated files use the plural form with a primitives suffix
  - `clean_emails` → `email_primitives.py`
  - `clean_addresses` → `address_primitives.py`
  - `clean_phone_numbers` → `phone_primitives.py`
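
A minimal sketch of the new fallback import strategy, as it might appear at the top of a generated primitives file (illustrative only, not the generator's literal output):

```python
# Prefer the utils/primitives.py copied alongside the generated file;
# fall back to the installed datacompose package when it is unavailable.
try:
    from utils.primitives import PrimitiveRegistry
except ImportError:
    from datacompose.operators.primitives import PrimitiveRegistry
```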

### Fixed
- Phone `normalize_separators` now correctly handles parentheses: `(555)123-4567` → `555-123-4567`
- Street extraction for numbered streets ("5th Avenue" issue)
- Compose decorator now requires the namespace to be passed explicitly for proper method resolution
- `standardize_street_suffix` applies both custom and default mappings correctly
- Test failures due to namespace resolution in the compose decorator

### Removed
- All YAML/spec file functionality
- PostgreSQL generator references
- Jinja2 template dependencies
- `validate` command from the CLI
- Old Spark integration tests (replaced with end-to-end tests)

## [0.2.0] - 2024-XX-XX

### Added
- **Primitive System**: New composable primitive architecture for building data pipelines
  - `SmartPrimitive` class for partial application and parameter binding
  - `PrimitiveRegistry` (originally `PrimitiveNameSpace`) for organizing related transformations
  - Support for conditional primitives (boolean-returning functions)
- **Conditional Compilation**: AST-based pipeline compilation with if/else support
  - `PipelineCompiler` for parsing and compiling conditional logic
  - `StablePipeline` for executing compiled pipelines
  - Full support for nested conditionals and complex branching
- **Comprehensive Testing**: 44+ tests covering conditional compilation scenarios
  - Edge cases and null handling
  - Complex nested logic
  - Data-driven conditions
  - Performance optimization tests
  - Real-world use cases
  - Parameter handling
  - Error handling
- **Improved Architecture**: Dual approach for different runtime constraints
  - Primitives for flexible runtimes (Python, Spark, Scala)
  - Templates for rigid targets (SQL, PostgreSQL)

### Changed
- Made PySpark an optional dependency
- Reorganized test structure with focused test files and shared fixtures
- Refined architecture to support both template-based and primitive-based approaches

### Fixed
- Import paths for pipeline compilation modules
- Missing return statements in pipeline execution
- Conditional logic to use accumulated results correctly

## [0.1.4] - 2024-XX-XX

### Added
- Initial release of Datacompose
- Core framework for generating data cleaning UDFs
- Support for Spark, PostgreSQL, and Pandas targets
- Built-in specifications for common data cleaning tasks:
  - Email address cleaning
  - Phone number normalization
  - Address standardization
  - Job title standardization
  - Date/time parsing
- CLI interface with commands:
  - `datacompose init` - Initialize a project
  - `datacompose add` - Generate UDFs from specs
  - `datacompose list` - List available targets and specs
  - `datacompose validate` - Validate specification files
- YAML-based specification format
- Jinja2 templating for code generation
- Comprehensive test suite
- Documentation with Sphinx and Furo theme
@@ -0,0 +1,21 @@ LICENSE

MIT License

Copyright (c) 2025 datacompose

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,38 @@ MANIFEST.in

# Include documentation
include README.md
include LICENSE
include CHANGELOG.md

# Include all YAML specifications
recursive-include datacompose/transformers *.yaml

# Include all Jinja2 templates
recursive-include datacompose/transformers *.j2
recursive-include datacompose/generators *.j2

# Include type hints
recursive-include datacompose py.typed

# Include test data (optional, remove if you don't want tests in distribution)
recursive-include tests *.py
recursive-include tests *.csv
recursive-include tests *.yaml

# Exclude unnecessary files
global-exclude *.pyc
global-exclude *.pyo
global-exclude __pycache__
global-exclude .DS_Store
global-exclude .git*
global-exclude *.swp
global-exclude *~

# Exclude development files
exclude .pre-commit-config.yaml
exclude .gitignore
exclude docker-compose.yml
exclude Dockerfile
exclude Makefile
prune docs/build
prune notebooks
prune scripts
@@ -0,0 +1,431 @@ PKG-INFO

Metadata-Version: 2.4
Name: datacompose
Version: 0.2.4
Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
Author: Datacompose Contributors
Maintainer: Datacompose Contributors
License: MIT
Project-URL: Homepage, https://github.com/datacompose/datacompose
Project-URL: Documentation, https://github.com/datacompose/datacompose/tree/main/docs
Project-URL: Repository, https://github.com/datacompose/datacompose.git
Project-URL: Issues, https://github.com/datacompose/datacompose/issues
Project-URL: Changelog, https://github.com/datacompose/datacompose/blob/main/CHANGELOG.md
Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Code Generators
Classifier: Topic :: Database
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Operating System :: OS Independent
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: jinja2>=3.0.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: click>=8.0.0
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: mypy>=1.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Provides-Extra: docs
Requires-Dist: sphinx>=7.2.0; extra == "docs"
Requires-Dist: furo>=2024.1.0; extra == "docs"
Requires-Dist: myst-parser>=2.0.0; extra == "docs"
Requires-Dist: sphinx-autodoc-typehints>=1.25.0; extra == "docs"
Requires-Dist: sphinx-copybutton>=0.5.2; extra == "docs"
Requires-Dist: sphinx-tabs>=3.4.0; extra == "docs"
Requires-Dist: sphinx-click>=5.1.0; extra == "docs"
Dynamic: license-file

# Datacompose

A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.

## Overview

Datacompose provides a declarative way to build data transformation pipelines using composable primitives. It generates optimized, standalone PySpark code that can be deployed without runtime dependencies.

## Key Features

- **Composable Primitives**: Build complex transformations from simple, reusable functions
- **Smart Partial Application**: Configure transformations with parameters for reuse
- **Pipeline Compilation**: Convert declarative pipeline definitions into optimized Spark operations
- **Code Generation**: Generate standalone PySpark code with embedded dependencies
- **Comprehensive Libraries**: Pre-built primitives for emails, addresses, and phone numbers
- **Conditional Logic**: Support for if/else branching in pipelines
- **Type-Safe Operations**: All transformations maintain Spark column type safety

## Installation

```bash
pip install datacompose
```

## Quick Start

### 1. Initialize a Project

```bash
datacompose init
```

This creates a `datacompose.json` configuration file with default settings.

### 2. Generate Transformation Code

```bash
# Generate email cleaning primitives
datacompose add clean_emails --target pyspark

# Generate address standardization primitives
datacompose add clean_addresses --target pyspark

# Generate phone number validation primitives
datacompose add clean_phone_numbers --target pyspark
```

### 3. Use the Generated Code

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Import the generated primitives
from build.pyspark.clean_emails.email_primitives import emails

# Create Spark session
spark = SparkSession.builder.appName("DataCleaning").getOrCreate()

# Load your data
df = spark.read.csv("data.csv", header=True)

# Apply email transformations
cleaned_df = df.withColumn(
    "email_clean",
    emails.standardize_email(F.col("email"))
).withColumn(
    "email_domain",
    emails.extract_domain(F.col("email_clean"))
).withColumn(
    "is_valid",
    emails.is_valid_email(F.col("email_clean"))
)

# Filter to valid emails only
valid_emails = cleaned_df.filter(F.col("is_valid"))
```

## Core Concepts

### PrimitiveRegistry

A container for organizing related transformation functions:

```python
from datacompose.operators.primitives import PrimitiveRegistry

# Create a registry for text operations
text = PrimitiveRegistry("text")

# Register transformation functions
@text.register()
def lowercase(col):
    return F.lower(col)

@text.register()
def remove_spaces(col):
    return F.regexp_replace(col, r'\s+', '')

# Use the transformations
df = df.withColumn("clean_text", text.lowercase(F.col("input")))
```

### SmartPrimitive

Enables partial application of transformations:

```python
@text.register()
def trim(col, chars=' '):
    return F.trim(col, chars)

# Direct usage
df = df.withColumn("trimmed", text.trim(F.col("input")))

# Pre-configured usage
trim_tabs = text.trim(chars='\t')
df = df.withColumn("no_tabs", trim_tabs(F.col("input")))
```

### Pipeline Composition

Build complex pipelines from simple primitives:

```python
@text.compose(text=text)
def clean_pipeline():
    text.trim()
    text.lowercase()
    text.remove_spaces()

# Apply the entire pipeline
df = df.withColumn("cleaned", clean_pipeline(F.col("input")))
```

### Conditional Pipelines

Add conditional logic to your transformations:

```python
@text.register(is_conditional=True)
def is_valid_length(col):
    return F.length(col) > 5

@text.register()
def truncate(col):
    return F.substring(col, 1, 5)

@text.compose(text=text)
def smart_truncate():
    if text.is_valid_length():
        text.truncate()
```

## Available Primitives

### Email Primitives

```python
from build.pyspark.clean_emails.email_primitives import emails

# Validation
emails.is_valid_email(col)
emails.is_business_email(col)
emails.is_disposable_email(col)

# Extraction
emails.extract_domain(col)
emails.extract_username(col)
emails.extract_tld(col)

# Standardization
emails.standardize_email(col)
emails.normalize_gmail(col)
emails.fix_common_typos(col)

# Filtering
emails.filter_valid_emails(col)
emails.filter_business_emails(col)
```

### Address Primitives

```python
from build.pyspark.clean_addresses.address_primitives import addresses

# Extraction
addresses.extract_street_number(col)
addresses.extract_street_name(col)
addresses.extract_city(col)
addresses.extract_state(col)
addresses.extract_zip_code(col)

# Standardization
addresses.standardize_state(col)
addresses.standardize_street_suffix(col)
addresses.standardize_direction(col)

# Validation
addresses.is_valid_zip_code(col)
addresses.is_valid_state(col)
addresses.is_po_box(col)
```
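
These compose like any other column expressions. A brief usage sketch (illustrative; assumes a DataFrame `df` with a free-form `address` column):

```python
from pyspark.sql import functions as F

# Split a raw address into standardized components, keeping valid ZIPs
df = (
    df.withColumn("street_number", addresses.extract_street_number(F.col("address")))
      .withColumn("city", addresses.extract_city(F.col("address")))
      .withColumn("state", addresses.standardize_state(addresses.extract_state(F.col("address"))))
      .withColumn("zip", addresses.extract_zip_code(F.col("address")))
      .filter(addresses.is_valid_zip_code(F.col("zip")))
)
```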

### Phone Number Primitives

```python
from build.pyspark.clean_phone_numbers.phone_primitives import phones

# Validation
phones.is_valid_nanp(col)
phones.is_valid_international(col)
phones.is_toll_free(col)

# Extraction
phones.extract_country_code(col)
phones.extract_area_code(col)
phones.extract_exchange(col)
phones.extract_subscriber(col)

# Formatting
phones.format_nanp(col)
phones.format_e164(col)
phones.format_international(col)

# Standardization
phones.standardize_phone(col)
phones.clean_phone(col)
```
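
For example, to normalize a raw `phone` column to E.164 (illustrative; assumes the column names shown):

```python
from pyspark.sql import functions as F

# Clean, validate, and format a raw phone column
df = (
    df.withColumn("phone_clean", phones.clean_phone(F.col("phone")))
      .withColumn("phone_e164", phones.format_e164(F.col("phone_clean")))
      .filter(phones.is_valid_nanp(F.col("phone_clean")))
)
```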

## Advanced Usage

### Creating Custom Primitives

```python
from datacompose.operators.primitives import PrimitiveRegistry

# Create your own registry
custom = PrimitiveRegistry("custom")

@custom.register()
def remove_special_chars(col):
    return F.regexp_replace(col, r'[^a-zA-Z0-9\s]', '')

@custom.register()
def capitalize_words(col):
    return F.initcap(col)

@custom.register(is_conditional=True)
def contains_numbers(col):
    return col.rlike(r'\d+')

# Create a pipeline with your custom primitives
@custom.compose(custom=custom)
def clean_text():
    custom.remove_special_chars()
    if custom.contains_numbers():
        custom.capitalize_words()
```

### Working with Parameters

```python
@custom.register()
def pad_string(col, length=10, fill_char='0'):
    return F.lpad(col, length, fill_char)

# Use with different parameters
df = df.withColumn("padded_10", custom.pad_string(F.col("id")))
df = df.withColumn("padded_5", custom.pad_string(length=5)(F.col("id")))
df = df.withColumn("padded_x", custom.pad_string(length=8, fill_char='X')(F.col("id")))
```

### Combining Multiple Registries

```python
from build.pyspark.clean_emails.email_primitives import emails
from build.pyspark.clean_phone_numbers.phone_primitives import phones

# Create a combined validation pipeline
validation = PrimitiveRegistry("validation")

@validation.compose(emails=emails, phones=phones)
def validate_contact_info():
    # Check email
    if emails.is_valid_email():
        emails.standardize_email()

    # Check phone
    if phones.is_valid_phone():
        phones.standardize_phone()
```

## CLI Commands

### Initialize a Project
```bash
datacompose init [--yes]
```

### Add Transformers
```bash
datacompose add <transformer> [--target TARGET] [--output OUTPUT] [--verbose]

# Examples
datacompose add clean_emails --target pyspark
datacompose add clean_addresses --target pyspark --output ./custom/path
datacompose add clean_phone_numbers --target pyspark --verbose
```

### List Available Transformers
```bash
datacompose list transformers
datacompose list generators
```

## Project Structure

After running `datacompose add`, your project will have the following structure:

```
project/
├── datacompose.json                   # Configuration file
├── build/
│   └── pyspark/
│       ├── clean_emails/
│       │   ├── email_primitives.py    # Generated email primitives
│       │   └── utils/
│       │       └── primitives.py      # Core framework (embedded)
│       ├── clean_addresses/
│       │   ├── address_primitives.py
│       │   └── utils/
│       │       └── primitives.py
│       └── clean_phone_numbers/
│           ├── phone_primitives.py
│           └── utils/
│               └── primitives.py
```

## Configuration

The `datacompose.json` file configures default settings:

```json
{
  "version": "1.0.0",
  "targets": {
    "pyspark": {
      "output": "./build/pyspark",
      "generator": "SparkPandasUDFGenerator"
    }
  },
  "templates": {
    "directory": "src/transformers/templates"
  }
}
```

## Performance Considerations

- Primitives are designed to be efficient Spark operations
- Pipelines are compiled to minimize intermediate columns
- Conditional logic uses Spark's `when/otherwise` for vectorized operations; see the sketch below
- Generated code has no runtime dependencies beyond PySpark
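
To illustrate the third point, a conditional pipeline like `smart_truncate` above may compile down to a single column expression along these lines (a sketch of the general shape, not the generator's literal output):

```python
from pyspark.sql import functions as F

col = F.col("input")

# `if is_valid_length(col): truncate(col)` expressed as one vectorized
# when/otherwise column expression instead of row-by-row Python branching
smart_truncate_expr = F.when(F.length(col) > 5, F.substring(col, 1, 5)).otherwise(col)

df = df.withColumn("truncated", smart_truncate_expr)
```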

## Philosophy & Inspiration

Datacompose is inspired by [shadcn-svelte](https://www.shadcn-svelte.com/) and [huntabyte](https://github.com/huntabyte)'s approach to component libraries. Just as shadcn-svelte provides "copy and paste" components rather than npm packages, Datacompose generates data transformation code that becomes part of YOUR codebase.

**Why we believe in this approach:**

- **You Own Your Code**: No external dependencies to manage, no breaking changes to worry about
- **Full Transparency**: Every transformation is readable, debuggable PySpark code you can understand
- **Customization First**: Need to adjust a transformation? Just edit the code
- **Learn by Reading**: The generated code serves as documentation and learning material

This is NOT a traditional library - it's a code generator that gives you production-ready data transformation primitives that you can modify to fit your exact needs.

## License

MIT License - see LICENSE file for details