datacompose-0.2.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datacompose might be problematic.

Files changed (90)
  1. datacompose-0.2.4/CHANGELOG.md +108 -0
  2. datacompose-0.2.4/LICENSE +21 -0
  3. datacompose-0.2.4/MANIFEST.in +38 -0
  4. datacompose-0.2.4/PKG-INFO +431 -0
  5. datacompose-0.2.4/README.md +384 -0
  6. datacompose-0.2.4/datacompose/__init__.py +1 -0
  7. datacompose-0.2.4/datacompose/cli/__init__.py +5 -0
  8. datacompose-0.2.4/datacompose/cli/colors.py +80 -0
  9. datacompose-0.2.4/datacompose/cli/commands/__init__.py +3 -0
  10. datacompose-0.2.4/datacompose/cli/commands/add.py +215 -0
  11. datacompose-0.2.4/datacompose/cli/commands/init.py +451 -0
  12. datacompose-0.2.4/datacompose/cli/commands/list.py +118 -0
  13. datacompose-0.2.4/datacompose/cli/commands/upgrade.py +7 -0
  14. datacompose-0.2.4/datacompose/cli/main.py +59 -0
  15. datacompose-0.2.4/datacompose/cli/validation.py +72 -0
  16. datacompose-0.2.4/datacompose/generators/__init__.py +3 -0
  17. datacompose-0.2.4/datacompose/generators/base.py +193 -0
  18. datacompose-0.2.4/datacompose/generators/pyspark/__init__.py +1 -0
  19. datacompose-0.2.4/datacompose/generators/pyspark/generator.py +51 -0
  20. datacompose-0.2.4/datacompose/operators/__init__.py +21 -0
  21. datacompose-0.2.4/datacompose/operators/primitives.py +595 -0
  22. datacompose-0.2.4/datacompose/transformers/__init__.py +0 -0
  23. datacompose-0.2.4/datacompose/transformers/discovery.py +186 -0
  24. datacompose-0.2.4/datacompose/transformers/text/__init__.py +1 -0
  25. datacompose-0.2.4/datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  26. datacompose-0.2.4/datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  27. datacompose-0.2.4/datacompose/transformers/text/clean_emails/__init__.py +1 -0
  28. datacompose-0.2.4/datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  29. datacompose-0.2.4/datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  30. datacompose-0.2.4/datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  31. datacompose-0.2.4/datacompose.egg-info/PKG-INFO +431 -0
  32. datacompose-0.2.4/datacompose.egg-info/SOURCES.txt +89 -0
  33. datacompose-0.2.4/datacompose.egg-info/dependency_links.txt +1 -0
  34. datacompose-0.2.4/datacompose.egg-info/entry_points.txt +2 -0
  35. datacompose-0.2.4/datacompose.egg-info/requires.txt +18 -0
  36. datacompose-0.2.4/datacompose.egg-info/top_level.txt +1 -0
  37. datacompose-0.2.4/pyproject.toml +89 -0
  38. datacompose-0.2.4/setup.cfg +5 -0
  39. datacompose-0.2.4/tests/__init__.py +0 -0
  40. datacompose-0.2.4/tests/integration/__init__.py +1 -0
  41. datacompose-0.2.4/tests/integration/test_end_to_end.py +182 -0
  42. datacompose-0.2.4/tests/integration/test_generated_imports.py +165 -0
  43. datacompose-0.2.4/tests/unit/cli/.venv/bin/activate_this.py +59 -0
  44. datacompose-0.2.4/tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py +101 -0
  45. datacompose-0.2.4/tests/unit/cli/__init__.py +1 -0
  46. datacompose-0.2.4/tests/unit/cli/build/__init__.py +0 -0
  47. datacompose-0.2.4/tests/unit/cli/build/postgres/__init__.py +0 -0
  48. datacompose-0.2.4/tests/unit/cli/build/postgres/clean_emails/__init__.py +0 -0
  49. datacompose-0.2.4/tests/unit/cli/build/postgres/clean_emails/email_cleaner_udf_spec.yaml +67 -0
  50. datacompose-0.2.4/tests/unit/cli/build/postgres/clean_emails/test_email_cleaner_udf.py +102 -0
  51. datacompose-0.2.4/tests/unit/cli/build/spark/__init__.py +0 -0
  52. datacompose-0.2.4/tests/unit/cli/build/spark/clean_emails/__init__.py +0 -0
  53. datacompose-0.2.4/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf.py +196 -0
  54. datacompose-0.2.4/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf_spec.yaml +67 -0
  55. datacompose-0.2.4/tests/unit/cli/build/spark/clean_emails/test_email_cleaner_udf.py +102 -0
  56. datacompose-0.2.4/tests/unit/cli/test_add_command.py +177 -0
  57. datacompose-0.2.4/tests/unit/cli/test_add_validation.py +96 -0
  58. datacompose-0.2.4/tests/unit/cli/test_init_command.py +123 -0
  59. datacompose-0.2.4/tests/unit/cli/test_list_command.py +239 -0
  60. datacompose-0.2.4/tests/unit/cli/test_main.py +51 -0
  61. datacompose-0.2.4/tests/unit/generators/__init__.py +1 -0
  62. datacompose-0.2.4/tests/unit/generators/test_base_generator.py +330 -0
  63. datacompose-0.2.4/tests/unit/generators/test_spark_generator.py +266 -0
  64. datacompose-0.2.4/tests/unit/operators/conditional_tests_common.py +26 -0
  65. datacompose-0.2.4/tests/unit/operators/conftest.py +61 -0
  66. datacompose-0.2.4/tests/unit/operators/test_conditional_complex_logic.py +200 -0
  67. datacompose-0.2.4/tests/unit/operators/test_conditional_data_driven.py +117 -0
  68. datacompose-0.2.4/tests/unit/operators/test_conditional_edge_cases.py +150 -0
  69. datacompose-0.2.4/tests/unit/operators/test_conditional_error_handling.py +67 -0
  70. datacompose-0.2.4/tests/unit/operators/test_conditional_parameters.py +94 -0
  71. datacompose-0.2.4/tests/unit/operators/test_conditional_performance.py +106 -0
  72. datacompose-0.2.4/tests/unit/operators/test_conditional_real_world.py +183 -0
  73. datacompose-0.2.4/tests/unit/operators/test_operators.py +951 -0
  74. datacompose-0.2.4/tests/unit/transformers/__init__.py +1 -0
  75. datacompose-0.2.4/tests/unit/transformers/test_discovery.py +171 -0
  76. datacompose-0.2.4/tests/unit/transformers/text/common/test_common.py +0 -0
  77. datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_building_unit_extraction.py +791 -0
  78. datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_city_state_extraction.py +831 -0
  79. datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_clean_addresses.py +291 -0
  80. datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_country_extraction.py +247 -0
  81. datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_data_addresses.py +627 -0
  82. datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_po_box_extraction.py +317 -0
  83. datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_street_extraction.py +1038 -0
  84. datacompose-0.2.4/tests/unit/transformers/text/test_addresses/test_zip_code_extraction.py +996 -0
  85. datacompose-0.2.4/tests/unit/transformers/text/test_emails/test_debug_long_emails.py +46 -0
  86. datacompose-0.2.4/tests/unit/transformers/text/test_emails/test_email_extraction.py +848 -0
  87. datacompose-0.2.4/tests/unit/transformers/text/test_emails/test_email_optimized.py +219 -0
  88. datacompose-0.2.4/tests/unit/transformers/text/test_phone_numbers/test_phone_extraction.py +611 -0
  89. datacompose-0.2.4/tests/unit/transformers/text/test_phone_numbers/test_phone_formatting.py +489 -0
  90. datacompose-0.2.4/tests/yaml_specs/__init__.py +1 -0
datacompose-0.2.4/CHANGELOG.md
@@ -0,0 +1,108 @@
+ # Changelog
+
+ All notable changes to Datacompose will be documented in this file.
+
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+ ## [Unreleased]
+
+ ## [0.2.4] - 2024-08-12
+
+ ### Added
+ - **Phone Number Primitives**: Complete set of 45+ phone number transformation functions
+   - NANP validation and formatting (North American Numbering Plan)
+   - International phone support with E.164 formatting
+   - Extension handling and toll-free detection
+   - Phone number extraction from text
+   - Letter-to-number conversion (1-800-FLOWERS support)
+ - **Address Improvements**: Enhanced street extraction and standardization
+   - Fixed numbered street extraction ("5th Avenue" correctly returns "5th")
+   - Improved null handling in street extraction
+   - Custom mapping support for street suffix standardization
+ - **Utils Export**: Generated code now includes `utils/primitives.py` for standalone deployment
+   - PrimitiveRegistry class embedded with generated code
+   - No runtime dependency on datacompose package
+   - Fallback imports for maximum compatibility
+
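The "Utils Export" entries above imply a guarded import in the generated files. A minimal sketch of that fallback pattern follows; the module path of the embedded copy is an assumption for illustration, not taken from the package:

```python
# Sketch only: prefer the embedded copy of the framework shipped alongside the
# generated primitives, then fall back to the installed datacompose package.
try:
    from utils.primitives import PrimitiveRegistry  # embedded copy (assumed path)
except ImportError:
    from datacompose.operators.primitives import PrimitiveRegistry
```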
+ ### Changed
+ - **BREAKING**: Renamed `PrimitiveNameSpace` to `PrimitiveRegistry` throughout codebase
+ - **Major Architecture Shift**: Removed YAML/spec file system entirely
+   - No more YAML specifications or JSON replacements
+   - Direct primitive file copying instead of template rendering
+   - Simplified discovery system works with transformer directories
+   - Removed `validate` command completely
+ - **Import Strategy**: Primitives now try local utils import first, fall back to datacompose package
+ - **File Naming**: Generated files use plural form with primitives suffix
+   - `clean_emails` → `email_primitives.py`
+   - `clean_addresses` → `address_primitives.py`
+   - `clean_phone_numbers` → `phone_primitives.py`
+
+ ### Fixed
+ - Phone `normalize_separators` now correctly handles parentheses: `(555)123-4567` → `555-123-4567`
+ - Street extraction for numbered streets ("5th Avenue" issue)
+ - Compose decorator now requires namespace to be passed explicitly for proper method resolution
+ - `standardize_street_suffix` applies both custom and default mappings correctly
+ - Test failures due to namespace resolution in compose decorator
+
+ ### Removed
+ - All YAML/spec file functionality
+ - PostgreSQL generator references
+ - Jinja2 template dependencies
+ - `validate` command from CLI
+ - Old Spark integration tests (replaced with end-to-end tests)
+
+ ## [0.2.0] - 2024-XX-XX
+
+ ### Added
+ - **Primitive System**: New composable primitive architecture for building data pipelines
+   - `SmartPrimitive` class for partial application and parameter binding
+   - `PrimitiveRegistry` (originally PrimitiveNameSpace) for organizing related transformations
+   - Support for conditional primitives (boolean-returning functions)
+ - **Conditional Compilation**: AST-based pipeline compilation with if/else support
+   - `PipelineCompiler` for parsing and compiling conditional logic
+   - `StablePipeline` for executing compiled pipelines
+   - Full support for nested conditionals and complex branching
+ - **Comprehensive Testing**: 44+ tests covering conditional compilation scenarios
+   - Edge cases and null handling
+   - Complex nested logic
+   - Data-driven conditions
+   - Performance optimization tests
+   - Real-world use cases
+   - Parameter handling
+   - Error handling
+ - **Improved Architecture**: Dual approach for different runtime constraints
+   - Primitives for flexible runtimes (Python, Spark, Scala)
+   - Templates for rigid targets (SQL, PostgreSQL)
+
+ ### Changed
+ - Made PySpark an optional dependency
+ - Reorganized test structure with focused test files and shared fixtures
+ - Refined architecture to support both template-based and primitive-based approaches
+
+ ### Fixed
+ - Import paths for pipeline compilation modules
+ - Missing return statements in pipeline execution
+ - Conditional logic to use accumulated results correctly
+
+ ## [0.1.4] - 2024-XX-XX
+
+ ### Added
+ - Initial release of Datacompose
+ - Core framework for generating data cleaning UDFs
+ - Support for Spark, PostgreSQL, and Pandas targets
+ - Built-in specifications for common data cleaning tasks:
+   - Email address cleaning
+   - Phone number normalization
+   - Address standardization
+   - Job title standardization
+   - Date/time parsing
+ - CLI interface with commands:
+   - `datacompose init` - Initialize project
+   - `datacompose add` - Generate UDFs from specs
+   - `datacompose list` - List available targets and specs
+   - `datacompose validate` - Validate specification files
+ - YAML-based specification format
+ - Jinja2 templating for code generation
+ - Comprehensive test suite
+ - Documentation with Sphinx and Furo theme
datacompose-0.2.4/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 datacompose
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
datacompose-0.2.4/MANIFEST.in
@@ -0,0 +1,38 @@
+ # Include documentation
+ include README.md
+ include LICENSE
+ include CHANGELOG.md
+
+ # Include all YAML specifications
+ recursive-include datacompose/transformers *.yaml
+
+ # Include all Jinja2 templates
+ recursive-include datacompose/transformers *.j2
+ recursive-include datacompose/generators *.j2
+
+ # Include type hints
+ recursive-include datacompose py.typed
+
+ # Include test data (optional, remove if you don't want tests in distribution)
+ recursive-include tests *.py
+ recursive-include tests *.csv
+ recursive-include tests *.yaml
+
+ # Exclude unnecessary files
+ global-exclude *.pyc
+ global-exclude *.pyo
+ global-exclude __pycache__
+ global-exclude .DS_Store
+ global-exclude .git*
+ global-exclude *.swp
+ global-exclude *~
+
+ # Exclude development files
+ exclude .pre-commit-config.yaml
+ exclude .gitignore
+ exclude docker-compose.yml
+ exclude Dockerfile
+ exclude Makefile
+ prune docs/build
+ prune notebooks
+ prune scripts
datacompose-0.2.4/PKG-INFO
@@ -0,0 +1,431 @@
+ Metadata-Version: 2.4
+ Name: datacompose
+ Version: 0.2.4
+ Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
+ Author: Datacompose Contributors
+ Maintainer: Datacompose Contributors
+ License: MIT
+ Project-URL: Homepage, https://github.com/datacompose/datacompose
+ Project-URL: Documentation, https://github.com/datacompose/datacompose/tree/main/docs
+ Project-URL: Repository, https://github.com/datacompose/datacompose.git
+ Project-URL: Issues, https://github.com/datacompose/datacompose/issues
+ Project-URL: Changelog, https://github.com/datacompose/datacompose/blob/main/CHANGELOG.md
+ Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Code Generators
+ Classifier: Topic :: Database
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: jinja2>=3.0.0
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: click>=8.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
+ Provides-Extra: docs
+ Requires-Dist: sphinx>=7.2.0; extra == "docs"
+ Requires-Dist: furo>=2024.1.0; extra == "docs"
+ Requires-Dist: myst-parser>=2.0.0; extra == "docs"
+ Requires-Dist: sphinx-autodoc-typehints>=1.25.0; extra == "docs"
+ Requires-Dist: sphinx-copybutton>=0.5.2; extra == "docs"
+ Requires-Dist: sphinx-tabs>=3.4.0; extra == "docs"
+ Requires-Dist: sphinx-click>=5.1.0; extra == "docs"
+ Dynamic: license-file
+
+ # Datacompose
+
+ A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
+
+ ## Overview
+
+ Datacompose provides a declarative way to build data transformation pipelines using composable primitives. It generates optimized, standalone PySpark code that can be deployed without runtime dependencies.
+
+ ## Key Features
+
+ - **Composable Primitives**: Build complex transformations from simple, reusable functions
+ - **Smart Partial Application**: Configure transformations with parameters for reuse
+ - **Pipeline Compilation**: Convert declarative pipeline definitions into optimized Spark operations
+ - **Code Generation**: Generate standalone PySpark code with embedded dependencies
+ - **Comprehensive Libraries**: Pre-built primitives for emails, addresses, and phone numbers
+ - **Conditional Logic**: Support for if/else branching in pipelines
+ - **Type-Safe Operations**: All transformations maintain Spark column type safety
+
+ ## Installation
+
+ ```bash
+ pip install datacompose
+ ```
+
+ ## Quick Start
+
+ ### 1. Initialize a Project
+
+ ```bash
+ datacompose init
+ ```
+
+ This creates a `datacompose.json` configuration file with default settings.
+
+ ### 2. Generate Transformation Code
+
+ ```bash
+ # Generate email cleaning primitives
+ datacompose add clean_emails --target pyspark
+
+ # Generate address standardization primitives
+ datacompose add clean_addresses --target pyspark
+
+ # Generate phone number validation primitives
+ datacompose add clean_phone_numbers --target pyspark
+ ```
+
+ ### 3. Use the Generated Code
+
+ ```python
+ from pyspark.sql import SparkSession
+ from pyspark.sql import functions as F
+
+ # Import the generated primitives
+ from build.pyspark.clean_emails.email_primitives import emails
+
+ # Create Spark session
+ spark = SparkSession.builder.appName("DataCleaning").getOrCreate()
+
+ # Load your data
+ df = spark.read.csv("data.csv", header=True)
+
+ # Apply email transformations
+ cleaned_df = df.withColumn(
+     "email_clean",
+     emails.standardize_email(F.col("email"))
+ ).withColumn(
+     "email_domain",
+     emails.extract_domain(F.col("email_clean"))
+ ).withColumn(
+     "is_valid",
+     emails.is_valid_email(F.col("email_clean"))
+ )
+
+ # Filter to valid emails only
+ valid_emails = cleaned_df.filter(F.col("is_valid"))
+ ```
+
+ ## Core Concepts
+
+ ### PrimitiveRegistry
+
+ A container for organizing related transformation functions:
+
+ ```python
+ from datacompose.operators.primitives import PrimitiveRegistry
+
+ # Create a registry for text operations
+ text = PrimitiveRegistry("text")
+
+ # Register transformation functions
+ @text.register()
+ def lowercase(col):
+     return F.lower(col)
+
+ @text.register()
+ def remove_spaces(col):
+     return F.regexp_replace(col, r'\s+', '')
+
+ # Use the transformations
+ df = df.withColumn("clean_text", text.lowercase(F.col("input")))
+ ```
+
+ ### SmartPrimitive
+
+ Enables partial application of transformations:
+
+ ```python
+ @text.register()
+ def trim(col, chars=' '):
+     return F.trim(col, chars)
+
+ # Direct usage
+ df = df.withColumn("trimmed", text.trim(F.col("input")))
+
+ # Pre-configured usage
+ trim_tabs = text.trim(chars='\t')
+ df = df.withColumn("no_tabs", trim_tabs(F.col("input")))
+ ```
+
+ ### Pipeline Composition
+
+ Build complex pipelines from simple primitives:
+
+ ```python
+ @text.compose(text=text)
+ def clean_pipeline():
+     text.trim()
+     text.lowercase()
+     text.remove_spaces()
+
+ # Apply the entire pipeline
+ df = df.withColumn("cleaned", clean_pipeline(F.col("input")))
+ ```
+
+ ### Conditional Pipelines
+
+ Add conditional logic to your transformations:
+
+ ```python
+ @text.register(is_conditional=True)
+ def is_valid_length(col):
+     return F.length(col) > 5
+
+ @text.register()
+ def truncate(col):
+     return F.substring(col, 1, 5)
+
+ @text.compose(text=text)
+ def smart_truncate():
+     if text.is_valid_length():
+         text.truncate()
+ ```
+
+ ## Available Primitives
+
+ ### Email Primitives
+
+ ```python
+ from build.pyspark.clean_emails.email_primitives import emails
+
+ # Validation
+ emails.is_valid_email(col)
+ emails.is_business_email(col)
+ emails.is_disposable_email(col)
+
+ # Extraction
+ emails.extract_domain(col)
+ emails.extract_username(col)
+ emails.extract_tld(col)
+
+ # Standardization
+ emails.standardize_email(col)
+ emails.normalize_gmail(col)
+ emails.fix_common_typos(col)
+
+ # Filtering
+ emails.filter_valid_emails(col)
+ emails.filter_business_emails(col)
+ ```
+
+ ### Address Primitives
+
+ ```python
+ from build.pyspark.clean_addresses.address_primitives import addresses
+
+ # Extraction
+ addresses.extract_street_number(col)
+ addresses.extract_street_name(col)
+ addresses.extract_city(col)
+ addresses.extract_state(col)
+ addresses.extract_zip_code(col)
+
+ # Standardization
+ addresses.standardize_state(col)
+ addresses.standardize_street_suffix(col)
+ addresses.standardize_direction(col)
+
+ # Validation
+ addresses.is_valid_zip_code(col)
+ addresses.is_valid_state(col)
+ addresses.is_po_box(col)
+ ```
+
+ ### Phone Number Primitives
+
+ ```python
+ from build.pyspark.clean_phone_numbers.phone_primitives import phones
+
+ # Validation
+ phones.is_valid_nanp(col)
+ phones.is_valid_international(col)
+ phones.is_toll_free(col)
+
+ # Extraction
+ phones.extract_country_code(col)
+ phones.extract_area_code(col)
+ phones.extract_exchange(col)
+ phones.extract_subscriber(col)
+
+ # Formatting
+ phones.format_nanp(col)
+ phones.format_e164(col)
+ phones.format_international(col)
+
+ # Standardization
+ phones.standardize_phone(col)
+ phones.clean_phone(col)
+ ```
+
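As a usage sketch (not part of the generated module), the calls listed above chain the same way as the email example in the Quick Start; the column names are illustrative:

```python
from pyspark.sql import functions as F
from build.pyspark.clean_phone_numbers.phone_primitives import phones

# df is assumed to be a DataFrame with a raw "phone" string column.
# Standardize raw phone strings, add an E.164 rendering, and keep NANP-valid rows.
df_clean = (
    df.withColumn("phone_std", phones.standardize_phone(F.col("phone")))
      .withColumn("phone_e164", phones.format_e164(F.col("phone_std")))
      .filter(phones.is_valid_nanp(F.col("phone_std")))
)
```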
+ ## Advanced Usage
+
+ ### Creating Custom Primitives
+
+ ```python
+ from datacompose.operators.primitives import PrimitiveRegistry
+
+ # Create your own registry
+ custom = PrimitiveRegistry("custom")
+
+ @custom.register()
+ def remove_special_chars(col):
+     return F.regexp_replace(col, r'[^a-zA-Z0-9\s]', '')
+
+ @custom.register()
+ def capitalize_words(col):
+     return F.initcap(col)
+
+ @custom.register(is_conditional=True)
+ def contains_numbers(col):
+     return col.rlike(r'\d+')
+
+ # Create a pipeline with your custom primitives
+ @custom.compose(custom=custom)
+ def clean_text():
+     custom.remove_special_chars()
+     if custom.contains_numbers():
+         custom.capitalize_words()
+ ```
+
+ ### Working with Parameters
+
+ ```python
+ @custom.register()
+ def pad_string(col, length=10, fill_char='0'):
+     return F.lpad(col, length, fill_char)
+
+ # Use with different parameters
+ df = df.withColumn("padded_10", custom.pad_string(F.col("id")))
+ df = df.withColumn("padded_5", custom.pad_string(length=5)(F.col("id")))
+ df = df.withColumn("padded_x", custom.pad_string(length=8, fill_char='X')(F.col("id")))
+ ```
+
+ ### Combining Multiple Registries
+
+ ```python
+ from build.pyspark.clean_emails.email_primitives import emails
+ from build.pyspark.clean_phone_numbers.phone_primitives import phones
+
+ # Create a combined validation pipeline
+ validation = PrimitiveRegistry("validation")
+
+ @validation.compose(emails=emails, phones=phones)
+ def validate_contact_info():
+     # Check email
+     if emails.is_valid_email():
+         emails.standardize_email()
+
+     # Check phone
+     if phones.is_valid_phone():
+         phones.standardize_phone()
+ ```
+
+ ## CLI Commands
+
+ ### Initialize a Project
+ ```bash
+ datacompose init [--yes]
+ ```
+
+ ### Add Transformers
+ ```bash
+ datacompose add <transformer> [--target TARGET] [--output OUTPUT] [--verbose]
+
+ # Examples
+ datacompose add clean_emails --target pyspark
+ datacompose add clean_addresses --target pyspark --output ./custom/path
+ datacompose add clean_phone_numbers --target pyspark --verbose
+ ```
+
+ ### List Available Transformers
+ ```bash
+ datacompose list transformers
+ datacompose list generators
+ ```
+
+ ## Project Structure
+
+ After running `datacompose add`, your project will have the following structure:
+
+ ```
+ project/
+ ├── datacompose.json              # Configuration file
+ ├── build/
+ │   └── pyspark/
+ │       ├── clean_emails/
+ │       │   ├── email_primitives.py    # Generated email primitives
+ │       │   └── utils/
+ │       │       └── primitives.py      # Core framework (embedded)
+ │       ├── clean_addresses/
+ │       │   ├── address_primitives.py
+ │       │   └── utils/
+ │       │       └── primitives.py
+ │       └── clean_phone_numbers/
+ │           ├── phone_primitives.py
+ │           └── utils/
+ │               └── primitives.py
+ ```
+
+ ## Configuration
+
+ The `datacompose.json` file configures default settings:
+
+ ```json
+ {
+   "version": "1.0.0",
+   "targets": {
+     "pyspark": {
+       "output": "./build/pyspark",
+       "generator": "SparkPandasUDFGenerator"
+     }
+   },
+   "templates": {
+     "directory": "src/transformers/templates"
+   }
+ }
+ ```
+
+ ## Performance Considerations
+
+ - Primitives are designed to be efficient Spark operations
+ - Pipelines are compiled to minimize intermediate columns
+ - Conditional logic uses Spark's `when/otherwise` for vectorized operations
+ - Generated code has no runtime dependencies beyond PySpark
+
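To make the `when/otherwise` point concrete, here is a hand-written sketch of the kind of column expression the `smart_truncate` pipeline from the Conditional Pipelines section boils down to; it illustrates the technique, and is not the generator's literal output:

```python
from pyspark.sql import functions as F

# Condition and branch reuse the earlier README primitives:
# is_valid_length -> F.length(col) > 5, truncate -> F.substring(col, 1, 5).
smart_truncate_expr = (
    F.when(F.length(F.col("input")) > 5, F.substring(F.col("input"), 1, 5))
     .otherwise(F.col("input"))  # pass the value through unchanged otherwise
)

df = df.withColumn("truncated", smart_truncate_expr)
```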
+ ## Philosophy & Inspiration
+
+ Datacompose is inspired by [shadcn-svelte](https://www.shadcn-svelte.com/) and [huntabyte](https://github.com/huntabyte)'s approach to component libraries. Just as shadcn-svelte provides "copy and paste" components rather than npm packages, Datacompose generates data transformation code that becomes part of YOUR codebase.
+
+ **Why we believe in this approach:**
+
+ - **You Own Your Code**: No external dependencies to manage and no breaking changes to worry about
+ - **Full Transparency**: Every transformation is readable, debuggable PySpark code you can understand
+ - **Customization First**: Need to adjust a transformation? Just edit the code
+ - **Learn by Reading**: The generated code serves as documentation and learning material
+
+ This is NOT a traditional library - it's a code generator that gives you production-ready data transformation primitives that you can modify to fit your exact needs.
+
+ ## License
+
+ MIT License - see LICENSE file for details