datacompose 0.2.4__tar.gz → 0.2.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- {datacompose-0.2.4 → datacompose-0.2.4.1}/CHANGELOG.md +13 -1
- {datacompose-0.2.4/datacompose.egg-info → datacompose-0.2.4.1}/PKG-INFO +24 -6
- {datacompose-0.2.4 → datacompose-0.2.4.1}/README.md +18 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/cli/commands/add.py +12 -27
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/cli/commands/init.py +2 -2
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/generators/base.py +23 -36
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/generators/pyspark/generator.py +7 -7
- {datacompose-0.2.4 → datacompose-0.2.4.1/datacompose.egg-info}/PKG-INFO +24 -6
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose.egg-info/SOURCES.txt +6 -1
- {datacompose-0.2.4 → datacompose-0.2.4.1}/pyproject.toml +6 -6
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/integration/test_end_to_end.py +15 -10
- datacompose-0.2.4.1/tests/integration/test_full_workflow.py +297 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/integration/test_generated_imports.py +93 -39
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/test_add_command.py +1 -1
- datacompose-0.2.4.1/tests/unit/cli/test_add_command_complete.py +432 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/test_init_command.py +1 -1
- datacompose-0.2.4.1/tests/unit/cli/test_init_command_complete.py +654 -0
- datacompose-0.2.4.1/tests/unit/cli/test_main_complete.py +377 -0
- datacompose-0.2.4.1/tests/unit/cli/test_validation_complete.py +400 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/generators/test_base_generator.py +6 -133
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/generators/test_spark_generator.py +18 -19
- datacompose-0.2.4.1/tests/unit/operators/test_primitives_complete.py +338 -0
- datacompose-0.2.4/datacompose/cli/commands/upgrade.py +0 -7
- {datacompose-0.2.4 → datacompose-0.2.4.1}/LICENSE +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/MANIFEST.in +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/cli/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/cli/colors.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/cli/commands/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/cli/commands/list.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/cli/main.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/cli/validation.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/generators/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/generators/pyspark/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/operators/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/operators/primitives.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/discovery.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/text/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/text/clean_addresses/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/text/clean_emails/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose.egg-info/dependency_links.txt +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose.egg-info/entry_points.txt +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose.egg-info/requires.txt +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/datacompose.egg-info/top_level.txt +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/setup.cfg +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/integration/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/.venv/bin/activate_this.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/postgres/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/postgres/clean_emails/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/postgres/clean_emails/email_cleaner_udf_spec.yaml +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/postgres/clean_emails/test_email_cleaner_udf.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/spark/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/spark/clean_emails/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf_spec.yaml +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/build/spark/clean_emails/test_email_cleaner_udf.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/test_add_validation.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/test_list_command.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/cli/test_main.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/generators/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/conditional_tests_common.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/conftest.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/test_conditional_complex_logic.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/test_conditional_data_driven.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/test_conditional_edge_cases.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/test_conditional_error_handling.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/test_conditional_parameters.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/test_conditional_performance.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/test_conditional_real_world.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/operators/test_operators.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/__init__.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/test_discovery.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/common/test_common.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_addresses/test_building_unit_extraction.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_addresses/test_city_state_extraction.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_addresses/test_clean_addresses.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_addresses/test_country_extraction.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_addresses/test_data_addresses.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_addresses/test_po_box_extraction.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_addresses/test_street_extraction.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_addresses/test_zip_code_extraction.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_emails/test_debug_long_emails.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_emails/test_email_extraction.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_emails/test_email_optimized.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_phone_numbers/test_phone_extraction.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/unit/transformers/text/test_phone_numbers/test_phone_formatting.py +0 -0
- {datacompose-0.2.4 → datacompose-0.2.4.1}/tests/yaml_specs/__init__.py +0 -0
|
@@ -7,9 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
-
## [0.2.4] -
|
|
10
|
+
## [0.2.4] - 2025-08-13
|
|
11
11
|
|
|
12
12
|
### Added
|
|
13
|
+
- **Published to PyPI**: Package is now available via `pip install datacompose`
|
|
13
14
|
- **Phone Number Primitives**: Complete set of 45+ phone number transformation functions
|
|
14
15
|
- NANP validation and formatting (North American Numbering Plan)
|
|
15
16
|
- International phone support with E.164 formatting
|
|
@@ -24,6 +25,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
24
25
|
- PrimitiveRegistry class embedded with generated code
|
|
25
26
|
- No runtime dependency on datacompose package
|
|
26
27
|
- Fallback imports for maximum compatibility
|
|
28
|
+
- **Comprehensive Test Coverage**: Improved test coverage from 87% to 92%
|
|
29
|
+
- Added 18 new tests for primitives.py module (70% → 86% coverage)
|
|
30
|
+
- Created comprehensive test suites for all CLI commands
|
|
31
|
+
- Added full end-to-end integration tests (init → add → transform)
|
|
32
|
+
- validation.py achieved 100% coverage
|
|
33
|
+
- add.py improved to 99% coverage
|
|
27
34
|
|
|
28
35
|
### Changed
|
|
29
36
|
- **BREAKING**: Renamed `PrimitiveNameSpace` to `PrimitiveRegistry` throughout codebase
|
|
@@ -39,11 +46,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
39
46
|
- `clean_phone_numbers` → `phone_primitives.py`
|
|
40
47
|
|
|
41
48
|
### Fixed
|
|
49
|
+
- **Critical**: Fixed utils/primitives.py output location to be shared across all transformers
|
|
50
|
+
- Utils module now generates at top-level build/utils/ instead of per-transformer
|
|
51
|
+
- All transformers share the same PrimitiveRegistry implementation
|
|
52
|
+
- Prevents duplicate utils modules and ensures consistency
|
|
42
53
|
- Phone `normalize_separators` now correctly handles parentheses: `(555)123-4567` → `555-123-4567`
|
|
43
54
|
- Street extraction for numbered streets ("5th Avenue" issue)
|
|
44
55
|
- Compose decorator now requires namespace to be passed explicitly for proper method resolution
|
|
45
56
|
- `standardize_street_suffix` applies both custom and default mappings correctly
|
|
46
57
|
- Test failures due to namespace resolution in compose decorator
|
|
58
|
+
- Generator initialization error handling in add command
|
|
47
59
|
|
|
48
60
|
### Removed
|
|
49
61
|
- All YAML/spec file functionality
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datacompose
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.4.1
|
|
4
4
|
Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
|
|
5
5
|
Author: Datacompose Contributors
|
|
6
6
|
Maintainer: Datacompose Contributors
|
|
7
7
|
License: MIT
|
|
8
|
-
Project-URL: Homepage, https://github.com/
|
|
9
|
-
Project-URL: Documentation, https://github.com/
|
|
10
|
-
Project-URL: Repository, https://github.com/
|
|
11
|
-
Project-URL: Issues, https://github.com/
|
|
12
|
-
Project-URL: Changelog, https://github.com/
|
|
8
|
+
Project-URL: Homepage, https://github.com/tc-cole/datacompose
|
|
9
|
+
Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
|
|
10
|
+
Project-URL: Repository, https://github.com/tc-cole/datacompose.git
|
|
11
|
+
Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
|
|
13
13
|
Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
|
|
14
14
|
Classifier: Development Status :: 4 - Beta
|
|
15
15
|
Classifier: Intended Audience :: Developers
|
|
@@ -47,6 +47,11 @@ Dynamic: license-file
|
|
|
47
47
|
|
|
48
48
|
# Datacompose
|
|
49
49
|
|
|
50
|
+
[](https://pypi.org/project/datacompose/)
|
|
51
|
+
[](https://www.python.org/downloads/)
|
|
52
|
+
[](https://github.com/your-username/datacompose)
|
|
53
|
+
[](https://opensource.org/licenses/MIT)
|
|
54
|
+
|
|
50
55
|
A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
|
|
51
56
|
|
|
52
57
|
## Overview
|
|
@@ -426,6 +431,19 @@ Datacompose is inspired by [shadcn-svelte](https://www.shadcn-svelte.com/) and [
|
|
|
426
431
|
This is NOT a traditional library - it's a code generator that gives you production-ready data transformation primitives that you can modify to fit your exact needs.
|
|
427
432
|
|
|
428
433
|
|
|
434
|
+
## Test Coverage
|
|
435
|
+
|
|
436
|
+
**Critical components are thoroughly tested:**
|
|
437
|
+
|
|
438
|
+
| Component | Coverage | Tests |
|
|
439
|
+
|-----------|----------|-------|
|
|
440
|
+
| **Phone Number Primitives** | 95% | ✅ All formats validated |
|
|
441
|
+
| **Address Primitives** | 94% | ✅ Full parsing tested |
|
|
442
|
+
| **Email Primitives** | 89% | ✅ RFC compliant |
|
|
443
|
+
| **Code Generation** | 87-91% | ✅ All targets verified |
|
|
444
|
+
|
|
445
|
+
**335 tests passing** • **76% overall coverage**
|
|
446
|
+
|
|
429
447
|
## License
|
|
430
448
|
|
|
431
449
|
MIT License - see LICENSE file for details
|
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# Datacompose
|
|
2
2
|
|
|
3
|
+
[](https://pypi.org/project/datacompose/)
|
|
4
|
+
[](https://www.python.org/downloads/)
|
|
5
|
+
[](https://github.com/your-username/datacompose)
|
|
6
|
+
[](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
3
8
|
A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
|
|
4
9
|
|
|
5
10
|
## Overview
|
|
@@ -379,6 +384,19 @@ Datacompose is inspired by [shadcn-svelte](https://www.shadcn-svelte.com/) and [
|
|
|
379
384
|
This is NOT a traditional library - it's a code generator that gives you production-ready data transformation primitives that you can modify to fit your exact needs.
|
|
380
385
|
|
|
381
386
|
|
|
387
|
+
## Test Coverage
|
|
388
|
+
|
|
389
|
+
**Critical components are thoroughly tested:**
|
|
390
|
+
|
|
391
|
+
| Component | Coverage | Tests |
|
|
392
|
+
|-----------|----------|-------|
|
|
393
|
+
| **Phone Number Primitives** | 95% | ✅ All formats validated |
|
|
394
|
+
| **Address Primitives** | 94% | ✅ Full parsing tested |
|
|
395
|
+
| **Email Primitives** | 89% | ✅ RFC compliant |
|
|
396
|
+
| **Code Generation** | 87-91% | ✅ All targets verified |
|
|
397
|
+
|
|
398
|
+
**335 tests passing** • **76% overall coverage**
|
|
399
|
+
|
|
382
400
|
## License
|
|
383
401
|
|
|
384
402
|
MIT License - see LICENSE file for details
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
Add command for generating UDFs.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import json
|
|
6
5
|
from pathlib import Path
|
|
7
6
|
|
|
8
7
|
import click
|
|
@@ -155,21 +154,18 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
155
154
|
print(info(f"Available generators: {', '.join(discovery.list_generators())}"))
|
|
156
155
|
return 1
|
|
157
156
|
|
|
158
|
-
# Determine output directory
|
|
159
|
-
# Extract platform from target (e.g., "pyspark.pandas_udf" -> "pyspark")
|
|
160
|
-
platform = target.split(".")[0]
|
|
161
|
-
|
|
157
|
+
# Determine output directory - no platform subdirectory needed
|
|
162
158
|
if not output:
|
|
163
|
-
output_dir = f"build/{
|
|
159
|
+
output_dir = f"build/{transformer_name}"
|
|
164
160
|
else:
|
|
165
|
-
output_dir = f"{output}/{
|
|
166
|
-
|
|
167
|
-
# Create generator instance
|
|
168
|
-
generator = generator_class(
|
|
169
|
-
template_dir=Path(template_dir), output_dir=Path(output_dir), verbose=verbose
|
|
170
|
-
)
|
|
161
|
+
output_dir = f"{output}/{transformer_name}"
|
|
171
162
|
|
|
172
163
|
try:
|
|
164
|
+
# Create generator instance
|
|
165
|
+
generator = generator_class(
|
|
166
|
+
template_dir=Path(template_dir), output_dir=Path(output_dir), verbose=verbose
|
|
167
|
+
)
|
|
168
|
+
|
|
173
169
|
# Generate the UDF
|
|
174
170
|
result = generator.generate(
|
|
175
171
|
transformer_name, force=False, transformer_dir=transformer_dir
|
|
@@ -182,13 +178,15 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
182
178
|
print(dim(f" Hash: {result.get('hash', 'N/A')}"))
|
|
183
179
|
else:
|
|
184
180
|
print(success(f"✓ UDF generated: {result['output_path']}"))
|
|
185
|
-
|
|
181
|
+
if result.get('test_path'):
|
|
182
|
+
print(success(f"✓ Test created: {result['test_path']}"))
|
|
186
183
|
print(highlight(f"Function name: {result['function_name']}"))
|
|
187
184
|
if verbose:
|
|
188
185
|
print(dim(f" Target: {target}"))
|
|
189
186
|
print(highlight("\nGenerated package contents:"))
|
|
190
187
|
print(f" - UDF code: {result['output_path']}")
|
|
191
|
-
|
|
188
|
+
if result.get('test_path'):
|
|
189
|
+
print(f" - Test file: {result['test_path']}")
|
|
192
190
|
|
|
193
191
|
return 0
|
|
194
192
|
|
|
@@ -200,16 +198,3 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
200
198
|
traceback.print_exc()
|
|
201
199
|
return 1
|
|
202
200
|
|
|
203
|
-
|
|
204
|
-
def _load_config() -> dict:
|
|
205
|
-
"""Load datacompose.json configuration if it exists."""
|
|
206
|
-
config_path = Path("datacompose.json")
|
|
207
|
-
if config_path.exists():
|
|
208
|
-
try:
|
|
209
|
-
with open(config_path, "r") as f:
|
|
210
|
-
return json.load(f)
|
|
211
|
-
except Exception:
|
|
212
|
-
pass
|
|
213
|
-
return {}
|
|
214
|
-
|
|
215
|
-
|
|
@@ -21,7 +21,7 @@ DEFAULT_CONFIG = {
|
|
|
21
21
|
"aliases": {"utils": "./src/utils"},
|
|
22
22
|
"targets": {
|
|
23
23
|
"pyspark": {
|
|
24
|
-
"output": "./build
|
|
24
|
+
"output": "./build",
|
|
25
25
|
}
|
|
26
26
|
},
|
|
27
27
|
}
|
|
@@ -57,7 +57,7 @@ class InitCommand:
|
|
|
57
57
|
def get_config_template(template_name: str) -> Dict[str, Any]:
|
|
58
58
|
"""Get configuration template by name."""
|
|
59
59
|
if template_name == "minimal":
|
|
60
|
-
return {"version": "1.0", "targets": {"pyspark": {"output": "./build
|
|
60
|
+
return {"version": "1.0", "targets": {"pyspark": {"output": "./build"}}}
|
|
61
61
|
elif template_name == "advanced":
|
|
62
62
|
config = DEFAULT_CONFIG.copy()
|
|
63
63
|
config.update(
|
|
@@ -8,7 +8,6 @@ def __get_output_filename as well as any other build steps that you want.
|
|
|
8
8
|
|
|
9
9
|
import hashlib
|
|
10
10
|
from abc import ABC, abstractmethod
|
|
11
|
-
from datetime import datetime
|
|
12
11
|
from pathlib import Path
|
|
13
12
|
from typing import Any, Dict, Optional
|
|
14
13
|
|
|
@@ -45,16 +44,11 @@ class BaseGenerator(ABC):
|
|
|
45
44
|
Dictionary with generation results
|
|
46
45
|
"""
|
|
47
46
|
# Create a minimal spec-like dict from transformer name for compatibility
|
|
48
|
-
|
|
47
|
+
transformer = {"name": transformer_name}
|
|
49
48
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
# Calculate hash for caching
|
|
54
|
-
spec_hash = self._calculate_hash(spec, template_content)
|
|
55
|
-
|
|
56
|
-
# Determine output path
|
|
57
|
-
output_file = self._get_output_filename(spec["name"])
|
|
49
|
+
file_content: str = self._get_primitives_file(transformer_dir)
|
|
50
|
+
spec_hash = self._calculate_hash(transformer, file_content)
|
|
51
|
+
output_file = self._get_output_filename(transformer["name"])
|
|
58
52
|
output_path = self.output_dir / output_file
|
|
59
53
|
|
|
60
54
|
# Check if regeneration is needed
|
|
@@ -63,18 +57,18 @@ class BaseGenerator(ABC):
|
|
|
63
57
|
"skipped": True,
|
|
64
58
|
"output_path": str(output_path),
|
|
65
59
|
"hash": spec_hash,
|
|
66
|
-
"function_name": f"{
|
|
60
|
+
"function_name": f"{transformer['name']}_udf",
|
|
67
61
|
}
|
|
68
62
|
|
|
69
63
|
# Copy utils/primitives.py to the output directory
|
|
70
64
|
self._copy_utils_files(output_path)
|
|
71
|
-
|
|
65
|
+
self._write_output(output_path, file_content)
|
|
72
66
|
|
|
73
67
|
return {
|
|
74
68
|
"skipped": False,
|
|
75
69
|
"output_path": str(output_path),
|
|
76
70
|
"hash": spec_hash,
|
|
77
|
-
"function_name": f"{
|
|
71
|
+
"function_name": f"{transformer['name']}_udf",
|
|
78
72
|
}
|
|
79
73
|
|
|
80
74
|
@staticmethod
|
|
@@ -82,6 +76,7 @@ class BaseGenerator(ABC):
|
|
|
82
76
|
"""Calculate hash for cache invalidation."""
|
|
83
77
|
content = str(spec) + template_content
|
|
84
78
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
|
|
79
|
+
|
|
85
80
|
|
|
86
81
|
@staticmethod
|
|
87
82
|
def _should_skip_generation(output_path: Path, spec_hash: str) -> bool:
|
|
@@ -100,8 +95,6 @@ class BaseGenerator(ABC):
|
|
|
100
95
|
"""Write generated content to output file."""
|
|
101
96
|
# Create output directory if it doesn't exist
|
|
102
97
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
103
|
-
|
|
104
|
-
# Create __init__.py files to make directories importable as Python packages
|
|
105
98
|
self._ensure_init_files(output_path)
|
|
106
99
|
|
|
107
100
|
with open(output_path, "w") as f:
|
|
@@ -138,26 +131,20 @@ class BaseGenerator(ABC):
|
|
|
138
131
|
if self.verbose:
|
|
139
132
|
print(f"Created {init_file}")
|
|
140
133
|
|
|
141
|
-
@staticmethod
|
|
142
|
-
def _prepare_template_vars(spec: Dict[str, Any], spec_hash: str) -> Dict[str, Any]:
|
|
143
|
-
"""Prepare variables for template rendering."""
|
|
144
|
-
return {
|
|
145
|
-
"transformer_name": spec["name"],
|
|
146
|
-
"udf_name": f"{spec['name']}_udf",
|
|
147
|
-
"hash": spec_hash,
|
|
148
|
-
"generation_timestamp": datetime.now().isoformat(),
|
|
149
|
-
"typo_map": spec.get("typo_map", {}),
|
|
150
|
-
"regex_patterns": spec.get("regex", {}),
|
|
151
|
-
"flags": spec.get("flags", {}),
|
|
152
|
-
"options": spec.get("options", {}),
|
|
153
|
-
"custom_rules": spec.get("custom_rules", {}),
|
|
154
|
-
}
|
|
155
|
-
|
|
156
134
|
|
|
157
135
|
def _copy_utils_files(self, output_path: Path):
|
|
158
|
-
"""Copy utility files like primitives.py to the
|
|
159
|
-
#
|
|
160
|
-
|
|
136
|
+
"""Copy utility files like primitives.py to the build root directory."""
|
|
137
|
+
# Find the build directory root
|
|
138
|
+
path_parts = output_path.parts
|
|
139
|
+
try:
|
|
140
|
+
build_index = path_parts.index("build")
|
|
141
|
+
build_root = Path(*path_parts[:build_index + 1])
|
|
142
|
+
except ValueError:
|
|
143
|
+
# Fallback to parent directory if no 'build' in path
|
|
144
|
+
build_root = output_path.parent.parent
|
|
145
|
+
|
|
146
|
+
# Create utils directory at build root
|
|
147
|
+
utils_dir = build_root / "utils"
|
|
161
148
|
utils_dir.mkdir(parents=True, exist_ok=True)
|
|
162
149
|
|
|
163
150
|
# Create __init__.py in utils directory
|
|
@@ -179,12 +166,12 @@ class BaseGenerator(ABC):
|
|
|
179
166
|
|
|
180
167
|
@classmethod
|
|
181
168
|
@abstractmethod
|
|
182
|
-
def
|
|
169
|
+
def _get_primitives_location(cls, transformer_dir: Path | None) -> Path | None:
|
|
183
170
|
pass
|
|
184
171
|
|
|
185
172
|
@abstractmethod
|
|
186
|
-
def
|
|
187
|
-
"""Get the
|
|
173
|
+
def _get_primitives_file(self, transformer_dir: Path | None) -> str:
|
|
174
|
+
"""Get the file content for this generator."""
|
|
188
175
|
pass
|
|
189
176
|
|
|
190
177
|
@abstractmethod
|
|
@@ -11,30 +11,30 @@ class SparkPandasUDFGenerator(BaseGenerator):
|
|
|
11
11
|
"""Generator for Apache Spark pandas UDFs."""
|
|
12
12
|
|
|
13
13
|
ENGINE_SUBDIRECTORY = "pyspark"
|
|
14
|
-
|
|
14
|
+
PRIMITIVES_FILENAME = "pyspark_primitives.py"
|
|
15
15
|
|
|
16
16
|
@classmethod
|
|
17
|
-
def
|
|
17
|
+
def _get_primitives_location(cls, transformer_dir: Path | None) -> Path | None:
|
|
18
18
|
if transformer_dir is None:
|
|
19
19
|
return None
|
|
20
|
-
return transformer_dir / cls.ENGINE_SUBDIRECTORY / cls.
|
|
20
|
+
return transformer_dir / cls.ENGINE_SUBDIRECTORY / cls.PRIMITIVES_FILENAME
|
|
21
21
|
|
|
22
|
-
def
|
|
22
|
+
def _get_primitives_file(self, transformer_dir: Path | None = None) -> str:
|
|
23
23
|
"""Get the template content for Spark pandas UDFs."""
|
|
24
24
|
if transformer_dir:
|
|
25
25
|
# Look for transformer-specific template first
|
|
26
|
-
transformer_template = self.
|
|
26
|
+
transformer_template = self._get_primitives_location(transformer_dir)
|
|
27
27
|
if transformer_template and transformer_template.exists():
|
|
28
28
|
return transformer_template.read_text()
|
|
29
29
|
|
|
30
30
|
# Fallback to generator-specific template (if it exists)
|
|
31
|
-
generator_template = Path(__file__).parent / self.
|
|
31
|
+
generator_template = Path(__file__).parent / self.PRIMITIVES_FILENAME
|
|
32
32
|
if generator_template.exists():
|
|
33
33
|
return generator_template.read_text()
|
|
34
34
|
|
|
35
35
|
# If no templates found, raise error
|
|
36
36
|
raise FileNotFoundError(
|
|
37
|
-
f"No {self.
|
|
37
|
+
f"No {self.PRIMITIVES_FILENAME} template found in {transformer_dir} or {Path(__file__).parent}"
|
|
38
38
|
)
|
|
39
39
|
|
|
40
40
|
def _get_output_filename(self, transformer_name: str) -> str:
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datacompose
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.4.1
|
|
4
4
|
Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
|
|
5
5
|
Author: Datacompose Contributors
|
|
6
6
|
Maintainer: Datacompose Contributors
|
|
7
7
|
License: MIT
|
|
8
|
-
Project-URL: Homepage, https://github.com/
|
|
9
|
-
Project-URL: Documentation, https://github.com/
|
|
10
|
-
Project-URL: Repository, https://github.com/
|
|
11
|
-
Project-URL: Issues, https://github.com/
|
|
12
|
-
Project-URL: Changelog, https://github.com/
|
|
8
|
+
Project-URL: Homepage, https://github.com/tc-cole/datacompose
|
|
9
|
+
Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
|
|
10
|
+
Project-URL: Repository, https://github.com/tc-cole/datacompose.git
|
|
11
|
+
Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
|
|
13
13
|
Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
|
|
14
14
|
Classifier: Development Status :: 4 - Beta
|
|
15
15
|
Classifier: Intended Audience :: Developers
|
|
@@ -47,6 +47,11 @@ Dynamic: license-file
|
|
|
47
47
|
|
|
48
48
|
# Datacompose
|
|
49
49
|
|
|
50
|
+
[](https://pypi.org/project/datacompose/)
|
|
51
|
+
[](https://www.python.org/downloads/)
|
|
52
|
+
[](https://github.com/your-username/datacompose)
|
|
53
|
+
[](https://opensource.org/licenses/MIT)
|
|
54
|
+
|
|
50
55
|
A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
|
|
51
56
|
|
|
52
57
|
## Overview
|
|
@@ -426,6 +431,19 @@ Datacompose is inspired by [shadcn-svelte](https://www.shadcn-svelte.com/) and [
|
|
|
426
431
|
This is NOT a traditional library - it's a code generator that gives you production-ready data transformation primitives that you can modify to fit your exact needs.
|
|
427
432
|
|
|
428
433
|
|
|
434
|
+
## Test Coverage
|
|
435
|
+
|
|
436
|
+
**Critical components are thoroughly tested:**
|
|
437
|
+
|
|
438
|
+
| Component | Coverage | Tests |
|
|
439
|
+
|-----------|----------|-------|
|
|
440
|
+
| **Phone Number Primitives** | 95% | ✅ All formats validated |
|
|
441
|
+
| **Address Primitives** | 94% | ✅ Full parsing tested |
|
|
442
|
+
| **Email Primitives** | 89% | ✅ RFC compliant |
|
|
443
|
+
| **Code Generation** | 87-91% | ✅ All targets verified |
|
|
444
|
+
|
|
445
|
+
**335 tests passing** • **76% overall coverage**
|
|
446
|
+
|
|
429
447
|
## License
|
|
430
448
|
|
|
431
449
|
MIT License - see LICENSE file for details
|
|
@@ -19,7 +19,6 @@ datacompose/cli/commands/__init__.py
|
|
|
19
19
|
datacompose/cli/commands/add.py
|
|
20
20
|
datacompose/cli/commands/init.py
|
|
21
21
|
datacompose/cli/commands/list.py
|
|
22
|
-
datacompose/cli/commands/upgrade.py
|
|
23
22
|
datacompose/generators/__init__.py
|
|
24
23
|
datacompose/generators/base.py
|
|
25
24
|
datacompose/generators/pyspark/__init__.py
|
|
@@ -38,13 +37,18 @@ datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py
|
|
|
38
37
|
tests/__init__.py
|
|
39
38
|
tests/integration/__init__.py
|
|
40
39
|
tests/integration/test_end_to_end.py
|
|
40
|
+
tests/integration/test_full_workflow.py
|
|
41
41
|
tests/integration/test_generated_imports.py
|
|
42
42
|
tests/unit/cli/__init__.py
|
|
43
43
|
tests/unit/cli/test_add_command.py
|
|
44
|
+
tests/unit/cli/test_add_command_complete.py
|
|
44
45
|
tests/unit/cli/test_add_validation.py
|
|
45
46
|
tests/unit/cli/test_init_command.py
|
|
47
|
+
tests/unit/cli/test_init_command_complete.py
|
|
46
48
|
tests/unit/cli/test_list_command.py
|
|
47
49
|
tests/unit/cli/test_main.py
|
|
50
|
+
tests/unit/cli/test_main_complete.py
|
|
51
|
+
tests/unit/cli/test_validation_complete.py
|
|
48
52
|
tests/unit/cli/.venv/bin/activate_this.py
|
|
49
53
|
tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py
|
|
50
54
|
tests/unit/cli/build/__init__.py
|
|
@@ -70,6 +74,7 @@ tests/unit/operators/test_conditional_parameters.py
|
|
|
70
74
|
tests/unit/operators/test_conditional_performance.py
|
|
71
75
|
tests/unit/operators/test_conditional_real_world.py
|
|
72
76
|
tests/unit/operators/test_operators.py
|
|
77
|
+
tests/unit/operators/test_primitives_complete.py
|
|
73
78
|
tests/unit/transformers/__init__.py
|
|
74
79
|
tests/unit/transformers/test_discovery.py
|
|
75
80
|
tests/unit/transformers/text/common/test_common.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "datacompose"
|
|
7
|
-
version = "0.2.4"
|
|
7
|
+
version = "0.2.4.1"
|
|
8
8
|
description = "Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte."
|
|
9
9
|
authors = [
|
|
10
10
|
{name = "Datacompose Contributors"},
|
|
@@ -47,11 +47,11 @@ dependencies = [
|
|
|
47
47
|
]
|
|
48
48
|
|
|
49
49
|
[project.urls]
|
|
50
|
-
Homepage = "https://github.com/
|
|
51
|
-
Documentation = "https://github.com/
|
|
52
|
-
Repository = "https://github.com/
|
|
53
|
-
Issues = "https://github.com/
|
|
54
|
-
Changelog = "https://github.com/
|
|
50
|
+
Homepage = "https://github.com/tc-cole/datacompose"
|
|
51
|
+
Documentation = "https://github.com/tc-cole/datacompose/tree/main/docs"
|
|
52
|
+
Repository = "https://github.com/tc-cole/datacompose.git"
|
|
53
|
+
Issues = "https://github.com/tc-cole/datacompose/issues"
|
|
54
|
+
Changelog = "https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md"
|
|
55
55
|
|
|
56
56
|
[project.scripts]
|
|
57
57
|
datacompose = "datacompose.cli.main:main"
|
|
@@ -38,7 +38,7 @@ class TestEndToEndWorkflow:
|
|
|
38
38
|
with open(config_file) as f:
|
|
39
39
|
config = json.load(f)
|
|
40
40
|
assert "pyspark" in config["targets"]
|
|
41
|
-
assert config["targets"]["pyspark"]["output"] == "./build
|
|
41
|
+
assert config["targets"]["pyspark"]["output"] == "./build"
|
|
42
42
|
|
|
43
43
|
# Step 2: Add transformers for each domain
|
|
44
44
|
transformers = [
|
|
@@ -54,17 +54,13 @@ class TestEndToEndWorkflow:
|
|
|
54
54
|
assert result.exit_code == 0
|
|
55
55
|
assert "generated" in result.output.lower()
|
|
56
56
|
|
|
57
|
-
# Verify the generated file exists with correct name
|
|
58
|
-
output_dir = Path(f"build/
|
|
57
|
+
# Verify the generated file exists with correct name (no platform subdirectory)
|
|
58
|
+
output_dir = Path(f"build/{transformer_name}")
|
|
59
59
|
assert output_dir.exists()
|
|
60
60
|
|
|
61
61
|
output_file = output_dir / expected_file
|
|
62
62
|
assert output_file.exists(), f"Expected {output_file} to exist"
|
|
63
63
|
|
|
64
|
-
# Verify test file was created
|
|
65
|
-
test_file = output_dir / f"test_{expected_file}"
|
|
66
|
-
assert test_file.exists()
|
|
67
|
-
|
|
68
64
|
# Verify the content includes PrimitiveRegistry
|
|
69
65
|
content = output_file.read_text()
|
|
70
66
|
assert "PrimitiveRegistry" in content
|
|
@@ -77,9 +73,18 @@ class TestEndToEndWorkflow:
|
|
|
77
73
|
elif "phone" in transformer_name:
|
|
78
74
|
assert 'phones = PrimitiveRegistry("phones")' in content
|
|
79
75
|
|
|
76
|
+
# Verify utils directory is at build root
|
|
77
|
+
utils_dir = Path("build/utils")
|
|
78
|
+
assert utils_dir.exists()
|
|
79
|
+
assert (utils_dir / "primitives.py").exists()
|
|
80
|
+
assert (utils_dir / "__init__.py").exists()
|
|
81
|
+
|
|
82
|
+
# Verify no platform subdirectory exists
|
|
83
|
+
assert not Path("build/pyspark").exists()
|
|
84
|
+
|
|
80
85
|
# Step 3: Verify we can import and use the primitives
|
|
81
86
|
# (This would work if PySpark was installed)
|
|
82
|
-
email_primitives = Path("build/
|
|
87
|
+
email_primitives = Path("build/clean_emails/email_primitives.py")
|
|
83
88
|
content = email_primitives.read_text()
|
|
84
89
|
|
|
85
90
|
# Check for some expected primitive functions
|
|
@@ -126,9 +131,9 @@ class TestEndToEndWorkflow:
|
|
|
126
131
|
)
|
|
127
132
|
assert result.exit_code == 0
|
|
128
133
|
|
|
129
|
-
# Verify files in custom location
|
|
134
|
+
# Verify files in custom location (no platform subdirectory)
|
|
130
135
|
output_file = Path(
|
|
131
|
-
f"{custom_output}/
|
|
136
|
+
f"{custom_output}/clean_emails/email_primitives.py"
|
|
132
137
|
)
|
|
133
138
|
assert output_file.exists()
|
|
134
139
|
|