datacompose 0.2.4.tar.gz → 0.2.5.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (105):
  1. {datacompose-0.2.4 → datacompose-0.2.5.2}/CHANGELOG.md +58 -4
  2. datacompose-0.2.5.2/PKG-INFO +94 -0
  3. datacompose-0.2.5.2/README.md +44 -0
  4. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/cli/commands/add.py +53 -40
  5. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/cli/commands/init.py +35 -9
  6. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/cli/commands/list.py +2 -2
  7. datacompose-0.2.5.2/datacompose/cli/config.py +80 -0
  8. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/cli/main.py +3 -3
  9. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/generators/base.py +29 -41
  10. datacompose-0.2.5.2/datacompose/generators/pyspark/generator.py +46 -0
  11. {datacompose-0.2.4/datacompose/transformers/text/clean_addresses → datacompose-0.2.5.2/datacompose/transformers/text/addresses}/pyspark/pyspark_primitives.py +68 -13
  12. {datacompose-0.2.4/datacompose/transformers/text/clean_emails → datacompose-0.2.5.2/datacompose/transformers/text/emails}/pyspark/pyspark_primitives.py +53 -1
  13. {datacompose-0.2.4/datacompose/transformers/text/clean_phone_numbers → datacompose-0.2.5.2/datacompose/transformers/text/phone_numbers}/pyspark/pyspark_primitives.py +377 -327
  14. datacompose-0.2.5.2/datacompose.egg-info/PKG-INFO +94 -0
  15. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose.egg-info/SOURCES.txt +15 -7
  16. datacompose-0.2.5.2/datacompose.egg-info/requires.txt +21 -0
  17. {datacompose-0.2.4 → datacompose-0.2.5.2}/pyproject.toml +16 -13
  18. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/integration/test_end_to_end.py +30 -23
  19. datacompose-0.2.5.2/tests/integration/test_full_workflow.py +298 -0
  20. datacompose-0.2.5.2/tests/integration/test_generated_imports.py +149 -0
  21. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/test_add_command.py +15 -5
  22. datacompose-0.2.5.2/tests/unit/cli/test_add_command_complete.py +451 -0
  23. datacompose-0.2.5.2/tests/unit/cli/test_add_default_target.py +327 -0
  24. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/test_add_validation.py +5 -5
  25. datacompose-0.2.5.2/tests/unit/cli/test_config.py +250 -0
  26. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/test_init_command.py +1 -1
  27. datacompose-0.2.5.2/tests/unit/cli/test_init_command_complete.py +654 -0
  28. datacompose-0.2.5.2/tests/unit/cli/test_main_complete.py +377 -0
  29. datacompose-0.2.5.2/tests/unit/cli/test_validation_complete.py +400 -0
  30. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/generators/test_base_generator.py +13 -140
  31. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/generators/test_spark_generator.py +36 -37
  32. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/test_operators.py +4 -4
  33. datacompose-0.2.5.2/tests/unit/operators/test_primitives_complete.py +338 -0
  34. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/test_discovery.py +6 -6
  35. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_addresses/test_building_unit_extraction.py +2 -2
  36. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_addresses/test_city_state_extraction.py +13 -13
  37. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_addresses/test_clean_addresses.py +4 -4
  38. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_addresses/test_country_extraction.py +1 -1
  39. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_addresses/test_data_addresses.py +1 -1
  40. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_addresses/test_po_box_extraction.py +1 -1
  41. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_addresses/test_street_extraction.py +1 -1
  42. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_addresses/test_zip_code_extraction.py +28 -28
  43. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_emails/test_debug_long_emails.py +1 -1
  44. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_emails/test_email_extraction.py +1 -1
  45. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_emails/test_email_optimized.py +1 -1
  46. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_phone_numbers/test_phone_extraction.py +30 -30
  47. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/test_phone_numbers/test_phone_formatting.py +31 -31
  48. datacompose-0.2.4/PKG-INFO +0 -431
  49. datacompose-0.2.4/README.md +0 -384
  50. datacompose-0.2.4/datacompose/cli/commands/upgrade.py +0 -7
  51. datacompose-0.2.4/datacompose/generators/pyspark/generator.py +0 -51
  52. datacompose-0.2.4/datacompose.egg-info/PKG-INFO +0 -431
  53. datacompose-0.2.4/datacompose.egg-info/requires.txt +0 -18
  54. datacompose-0.2.4/tests/integration/test_generated_imports.py +0 -165
  55. {datacompose-0.2.4 → datacompose-0.2.5.2}/LICENSE +0 -0
  56. {datacompose-0.2.4 → datacompose-0.2.5.2}/MANIFEST.in +0 -0
  57. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/__init__.py +0 -0
  58. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/cli/__init__.py +0 -0
  59. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/cli/colors.py +0 -0
  60. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/cli/commands/__init__.py +0 -0
  61. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/cli/validation.py +0 -0
  62. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/generators/__init__.py +0 -0
  63. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/generators/pyspark/__init__.py +0 -0
  64. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/operators/__init__.py +0 -0
  65. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/operators/primitives.py +0 -0
  66. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/transformers/__init__.py +0 -0
  67. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/transformers/discovery.py +0 -0
  68. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose/transformers/text/__init__.py +0 -0
  69. {datacompose-0.2.4/datacompose/transformers/text/clean_addresses → datacompose-0.2.5.2/datacompose/transformers/text/addresses}/__init__.py +0 -0
  70. {datacompose-0.2.4/datacompose/transformers/text/clean_emails → datacompose-0.2.5.2/datacompose/transformers/text/emails}/__init__.py +0 -0
  71. {datacompose-0.2.4/datacompose/transformers/text/clean_phone_numbers → datacompose-0.2.5.2/datacompose/transformers/text/phone_numbers}/__init__.py +0 -0
  72. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose.egg-info/dependency_links.txt +0 -0
  73. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose.egg-info/entry_points.txt +0 -0
  74. {datacompose-0.2.4 → datacompose-0.2.5.2}/datacompose.egg-info/top_level.txt +0 -0
  75. {datacompose-0.2.4 → datacompose-0.2.5.2}/setup.cfg +0 -0
  76. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/__init__.py +0 -0
  77. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/integration/__init__.py +0 -0
  78. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/.venv/bin/activate_this.py +0 -0
  79. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py +0 -0
  80. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/__init__.py +0 -0
  81. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/__init__.py +0 -0
  82. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/postgres/__init__.py +0 -0
  83. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/postgres/clean_emails/__init__.py +0 -0
  84. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/postgres/clean_emails/email_cleaner_udf_spec.yaml +0 -0
  85. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/postgres/clean_emails/test_email_cleaner_udf.py +0 -0
  86. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/spark/__init__.py +0 -0
  87. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/spark/clean_emails/__init__.py +0 -0
  88. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf.py +0 -0
  89. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf_spec.yaml +0 -0
  90. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/build/spark/clean_emails/test_email_cleaner_udf.py +0 -0
  91. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/test_list_command.py +0 -0
  92. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/cli/test_main.py +0 -0
  93. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/generators/__init__.py +0 -0
  94. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/conditional_tests_common.py +0 -0
  95. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/conftest.py +0 -0
  96. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/test_conditional_complex_logic.py +0 -0
  97. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/test_conditional_data_driven.py +0 -0
  98. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/test_conditional_edge_cases.py +0 -0
  99. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/test_conditional_error_handling.py +0 -0
  100. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/test_conditional_parameters.py +0 -0
  101. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/test_conditional_performance.py +0 -0
  102. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/operators/test_conditional_real_world.py +0 -0
  103. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/__init__.py +0 -0
  104. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/unit/transformers/text/common/test_common.py +0 -0
  105. {datacompose-0.2.4 → datacompose-0.2.5.2}/tests/yaml_specs/__init__.py +0 -0
CHANGELOG.md
@@ -7,9 +7,52 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

  ## [Unreleased]

- ## [0.2.4] - 2024-08-12
+ ## [0.2.5.2] - 2025-08-22
+
+ ### Fixed
+ - **Import Paths**: Updated import paths in phone_numbers pyspark primitives for clarity and consistency
+ - **Documentation**: Improved docstring documentation across primitives
+
+ ## [0.2.5.1] - 2025-08-22
+
+ ### Changed
+ - **Import Paths**: Renamed imports to be more transparent and clear
+
+ ### Added
+ - **Documentation**: Added clear module-level docstrings throughout the codebase
+ - **Unit Tests**: Added comprehensive unit tests for default initialization and datacompose.json configuration
+   - Tests for default target auto-selection with single target
+   - Tests for explicit target override behavior
+   - Tests for configuration file validation
+   - Tests for output path resolution from config
+
+ ### Fixed
+ - **CLI Tests**: Fixed all failing default target configuration tests
+   - Added proper validation mocks for non-existent platforms in tests
+   - Fixed error message assertion for invalid platform validation
+   - Properly mocked generator class hierarchy for output path testing
+   - All 13 CLI default target tests now passing (100% pass rate)
+
+ ## [0.2.5] - 2025-08-21
+
+ ### Changed
+ - **Documentation**: Streamlined README to be more concise
+   - Removed extensive code examples (now on website)
+   - Reduced from 390 lines to 44 lines
+   - Focused on core features and philosophy
+   - Added link to datacompose.io for detailed documentation
+
+ ### Fixed
+ - **Test Suite**: Fixed failing CLI tests for `add` command
+   - Tests now properly mock ConfigLoader for isolated filesystem environments
+   - `test_add_invalid_transformer` correctly validates transformer not found error
+   - `test_complete_transformer_success` updated to match actual transformer names
+   - All CLI command tests passing with proper configuration mocking
+
+ ## [0.2.4] - 2025-08-13

  ### Added
+ - **Published to PyPI**: Package is now available via `pip install datacompose`
  - **Phone Number Primitives**: Complete set of 45+ phone number transformation functions
    - NANP validation and formatting (North American Numbering Plan)
    - International phone support with E.164 formatting
@@ -24,6 +67,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
    - PrimitiveRegistry class embedded with generated code
    - No runtime dependency on datacompose package
    - Fallback imports for maximum compatibility
+ - **Comprehensive Test Coverage**: Improved test coverage from 87% to 92%
+   - Added 18 new tests for primitives.py module (70% → 86% coverage)
+   - Created comprehensive test suites for all CLI commands
+   - Added full end-to-end integration tests (init → add → transform)
+   - validation.py achieved 100% coverage
+   - add.py improved to 99% coverage

  ### Changed
  - **BREAKING**: Renamed `PrimitiveNameSpace` to `PrimitiveRegistry` throughout codebase
@@ -34,16 +83,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  - Removed `validate` command completely
  - **Import Strategy**: Primitives now try local utils import first, fall back to datacompose package
  - **File Naming**: Generated files use plural form with primitives suffix
-   - `clean_emails` → `email_primitives.py`
-   - `clean_addresses` → `address_primitives.py`
-   - `clean_phone_numbers` → `phone_primitives.py`
+   - `emails` → `email_primitives.py`
+   - `addresses` → `address_primitives.py`
+   - `phone_numbers` → `phone_primitives.py`

  ### Fixed
+ - **Critical**: Fixed utils/primitives.py output location to be shared across all transformers
+   - Utils module now generates at top-level build/utils/ instead of per-transformer
+   - All transformers share the same PrimitiveRegistry implementation
+   - Prevents duplicate utils modules and ensures consistency
  - Phone `normalize_separators` now correctly handles parentheses: `(555)123-4567` → `555-123-4567`
  - Street extraction for numbered streets ("5th Avenue" issue)
  - Compose decorator now requires namespace to be passed explicitly for proper method resolution
  - `standardize_street_suffix` applies both custom and default mappings correctly
  - Test failures due to namespace resolution in compose decorator
+ - Generator initialization error handling in add command

  ### Removed
  - All YAML/spec file functionality
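
A note on the `normalize_separators` fix recorded above: the changelog documents the mapping `(555)123-4567` → `555-123-4567`. A minimal PySpark sketch that reproduces that documented behavior (an illustration only, not the package's actual primitive, whose signature is not shown in this diff):

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("(555)123-4567",), ("555.123.4567",)], ["phone"])

# Rewrite a leading "(ddd)" area code as "ddd-", then collapse any remaining
# dot/space separator runs into single hyphens, as in the changelog example.
df = df.withColumn("phone", F.regexp_replace("phone", r"^\((\d{3})\)\s*", "$1-"))
df = df.withColumn("phone", F.regexp_replace("phone", r"[.\s]+", "-"))
df.show()  # both rows normalize to 555-123-4567
```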
datacompose-0.2.5.2/PKG-INFO (new file)
@@ -0,0 +1,94 @@
+ Metadata-Version: 2.4
+ Name: datacompose
+ Version: 0.2.5.2
+ Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
+ Author: Datacompose Contributors
+ Maintainer: Datacompose Contributors
+ License: MIT
+ Project-URL: Homepage, https://github.com/tc-cole/datacompose
+ Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
+ Project-URL: Repository, https://github.com/tc-cole/datacompose.git
+ Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
+ Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
+ Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Code Generators
+ Classifier: Topic :: Database
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: jinja2>=3.0.0
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: click>=8.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
+ Provides-Extra: docs
+ Requires-Dist: mkdocs>=1.5.3; extra == "docs"
+ Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
+ Requires-Dist: mkdocs-material-extensions>=1.3; extra == "docs"
+ Requires-Dist: mkdocs-minify-plugin>=0.7.1; extra == "docs"
+ Requires-Dist: mkdocs-redirects>=1.2.1; extra == "docs"
+ Requires-Dist: mike>=2.0.0; extra == "docs"
+ Requires-Dist: pymdown-extensions>=10.5; extra == "docs"
+ Requires-Dist: pygments>=2.17.0; extra == "docs"
+ Requires-Dist: mkdocs-git-revision-date-localized-plugin>=1.2.2; extra == "docs"
+ Requires-Dist: mkdocs-glightbox>=0.3.5; extra == "docs"
+ Dynamic: license-file
+
+ # Datacompose
+
+ [![PyPI version](https://badge.fury.io/py/datacompose.svg)](https://pypi.org/project/datacompose/)
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+ [![Coverage](https://img.shields.io/badge/coverage-92%25-brightgreen.svg)](https://github.com/your-username/datacompose)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
+
+ ## Installation
+
+ ```bash
+ pip install datacompose
+ ```
+
+ ## What is Datacompose?
+
+ Datacompose provides production-ready PySpark data transformation primitives that become part of YOUR codebase. Inspired by [shadcn](https://ui.shadcn.com/)'s approach to components, we believe in giving you full ownership and control over your code.
+
+ ### Key Features
+
+ - **No Runtime Dependencies**: Standalone PySpark code that runs without Datacompose
+ - **Composable Primitives**: Build complex transformations from simple, reusable functions
+ - **Smart Partial Application**: Pre-configure transformations with parameters for reuse
+ - **Optimized Operations**: Efficient Spark transformations with minimal overhead
+ - **Comprehensive Libraries**: Pre-built primitives for emails, addresses, and phone numbers
+
+ ### Available Transformers
+
+ - **Emails**: Validation, extraction, standardization, typo correction
+ - **Addresses**: Street parsing, state/zip validation, PO Box detection
+ - **Phone Numbers**: NANP/international validation, formatting, toll-free detection
+
+ ## Documentation
+
+ For detailed documentation, examples, and API reference, visit [datacompose.io](https://datacompose.io).
+
+ ## Philosophy
+
+ This is NOT a traditional library - it gives you production-ready data transformation primitives that you can modify to fit your exact needs. You own the code, with no external dependencies to manage or worry about breaking changes.
+
+ ## License
+
+ MIT License - see LICENSE file for details
datacompose-0.2.5.2/README.md (new file)
@@ -0,0 +1,44 @@
+ # Datacompose
+
+ [![PyPI version](https://badge.fury.io/py/datacompose.svg)](https://pypi.org/project/datacompose/)
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+ [![Coverage](https://img.shields.io/badge/coverage-92%25-brightgreen.svg)](https://github.com/your-username/datacompose)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
+
+ ## Installation
+
+ ```bash
+ pip install datacompose
+ ```
+
+ ## What is Datacompose?
+
+ Datacompose provides production-ready PySpark data transformation primitives that become part of YOUR codebase. Inspired by [shadcn](https://ui.shadcn.com/)'s approach to components, we believe in giving you full ownership and control over your code.
+
+ ### Key Features
+
+ - **No Runtime Dependencies**: Standalone PySpark code that runs without Datacompose
+ - **Composable Primitives**: Build complex transformations from simple, reusable functions
+ - **Smart Partial Application**: Pre-configure transformations with parameters for reuse
+ - **Optimized Operations**: Efficient Spark transformations with minimal overhead
+ - **Comprehensive Libraries**: Pre-built primitives for emails, addresses, and phone numbers
+
+ ### Available Transformers
+
+ - **Emails**: Validation, extraction, standardization, typo correction
+ - **Addresses**: Street parsing, state/zip validation, PO Box detection
+ - **Phone Numbers**: NANP/international validation, formatting, toll-free detection
+
+ ## Documentation
+
+ For detailed documentation, examples, and API reference, visit [datacompose.io](https://datacompose.io).
+
+ ## Philosophy
+
+ This is NOT a traditional library - it gives you production-ready data transformation primitives that you can modify to fit your exact needs. You own the code, with no external dependencies to manage or worry about breaking changes.
+
+ ## License
+
+ MIT License - see LICENSE file for details
datacompose/cli/commands/add.py
@@ -2,12 +2,12 @@
  Add command for generating UDFs.
  """

- import json
  from pathlib import Path

  import click

  from datacompose.cli.colors import dim, error, highlight, info, success
+ from datacompose.cli.config import ConfigLoader
  from datacompose.cli.validation import validate_platform, validate_type_for_platform
  from datacompose.transformers.discovery import TransformerDiscovery

@@ -86,28 +86,48 @@ _MODULE_DIR = Path(__file__).parent
  @click.option(
      "--target",
      "-t",
-     default="pyspark",
+     default=None,
      shell_complete=complete_target,
-     help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Default: pyspark",
+     help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Uses default from datacompose.json if not specified",
  )
  @click.option(
      "--type",
      shell_complete=complete_type,
      help="UDF type for the platform (e.g., 'pandas_udf', 'sql_udf'). Uses platform default if not specified",
  )
- @click.option("--output", "-o", help="Output directory (default: build/{target})")
  @click.option(
-     "--template-dir",
-     default="src/transformers/templates",
-     help="Directory containing templates (default: src/transformers/templates)",
+     "--output",
+     "-o",
+     help="Output directory (default: from config or transformers/{target})",
  )
  @click.option("--verbose", "-v", is_flag=True, help="Verbose output")
  @click.pass_context
- def add(ctx, transformer, target, type, output, template_dir, verbose):
+ def add(ctx, transformer, target, type, output, verbose):
      """Add UDFs for transformers.

-     TRANSFORMER: Transformer to add UDF for (e.g., 'clean_emails')
+     TRANSFORMER: Transformer to add UDF for (e.g., 'emails')
      """
+     # Load config to get default target if not specified
+     config = ConfigLoader.load_config()
+
+     if target is None:
+         # Try to get default target from config
+         target = ConfigLoader.get_default_target(config)
+         if target is None:
+             print(
+                 error(
+                     "Error: No target specified and no default target found in datacompose.json"
+                 )
+             )
+             print(
+                 info(
+                     "Please specify a target with --target or run 'datacompose init' to set up defaults"
+                 )
+             )
+             ctx.exit(1)
+         elif verbose:
+             print(dim(f"Using default target from config: {target}"))
+
      # Initialize discovery for validation
      discovery = TransformerDiscovery()

@@ -120,12 +140,12 @@ def add(ctx, transformer, target, type, output, template_dir, verbose):
          ctx.exit(1)

      # Combine target and type into generator reference
-     exit_code = _run_add(transformer, target, output, template_dir, verbose)
+     exit_code = _run_add(transformer, target, output, verbose)
      if exit_code != 0:
          ctx.exit(exit_code)


- def _run_add(transformer, target, output, template_dir, verbose) -> int:
+ def _run_add(transformer, target, output, verbose) -> int:
      """Execute the add command."""
      # Initialize discovery
      discovery = TransformerDiscovery()
@@ -136,9 +156,7 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
      if not transformer_path:
          print(error(f"Error: Transformer not found: {transformer}"))
          print(
-             info(
-                 f"Available transformers: {', '.join(discovery.list_transformers())}"
-             )
+             info(f"Available transformers: {', '.join(discovery.list_transformers())}")
          )
          return 1
      else:
@@ -156,20 +174,27 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
          return 1

      # Determine output directory
-     # Extract platform from target (e.g., "pyspark.pandas_udf" -> "pyspark")
-     platform = target.split(".")[0]
-
      if not output:
-         output_dir = f"build/{platform}/{transformer_name}"
+         # Try to get output from config first
+         config = ConfigLoader.load_config()
+         config_output = ConfigLoader.get_target_output(config, target)
+         if config_output:
+             # Config output already includes 'transformers/pyspark', so use it directly
+             output_dir = config_output
+         else:
+             output_dir = f"transformers/{target}"
      else:
-         output_dir = f"{output}/{platform}/{transformer_name}"
-
-     # Create generator instance
-     generator = generator_class(
-         template_dir=Path(template_dir), output_dir=Path(output_dir), verbose=verbose
-     )
+         output_dir = output

      try:
+         # Create generator instance
+         # Note: template_dir is required by base class but not used by current generators
+         generator = generator_class(
+             template_dir=Path("."),  # Placeholder - not actually used
+             output_dir=Path(output_dir),
+             verbose=verbose,
+         )
+
          # Generate the UDF
          result = generator.generate(
              transformer_name, force=False, transformer_dir=transformer_dir
@@ -182,13 +207,15 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
              print(dim(f" Hash: {result.get('hash', 'N/A')}"))
          else:
              print(success(f"✓ UDF generated: {result['output_path']}"))
-             print(success(f"✓ Test created: {result['test_path']}"))
+             if result.get("test_path"):
+                 print(success(f"✓ Test created: {result['test_path']}"))
              print(highlight(f"Function name: {result['function_name']}"))
              if verbose:
                  print(dim(f" Target: {target}"))
                  print(highlight("\nGenerated package contents:"))
                  print(f" - UDF code: {result['output_path']}")
-                 print(f" - Test file: {result['test_path']}")
+                 if result.get("test_path"):
+                     print(f" - Test file: {result['test_path']}")

          return 0

@@ -199,17 +226,3 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:

          traceback.print_exc()
          return 1
-
-
- def _load_config() -> dict:
-     """Load datacompose.json configuration if it exists."""
-     config_path = Path("datacompose.json")
-     if config_path.exists():
-         try:
-             with open(config_path, "r") as f:
-                 return json.load(f)
-         except Exception:
-             pass
-     return {}
-
-
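
The net effect of these add.py changes is a two-step target resolution: an explicit `--target` wins, otherwise the command falls back to the config's default and errors out if neither exists. A standalone sketch of that fallback chain, using the `ConfigLoader` introduced in config.py below (the real command prints colored messages and calls `ctx.exit(1)` rather than raising):

```python
from typing import Optional

from datacompose.cli.config import ConfigLoader

def resolve_target(cli_target: Optional[str]) -> str:
    """Illustrative mirror of the add command's target fallback chain."""
    if cli_target is not None:
        return cli_target  # explicit --target always wins
    config = ConfigLoader.load_config()  # reads ./datacompose.json if present
    target = ConfigLoader.get_default_target(config)
    if target is None:
        raise SystemExit(
            "No target specified and no default target found in datacompose.json"
        )
    return target
```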
datacompose/cli/commands/init.py
@@ -18,10 +18,11 @@ from datacompose.cli.colors import dim, error, highlight, info, success

  DEFAULT_CONFIG = {
      "version": "1.0",
+     "default_target": "pyspark",
      "aliases": {"utils": "./src/utils"},
      "targets": {
          "pyspark": {
-             "output": "./build/pyspark",
+             "output": "./transformers/pyspark",
          }
      },
  }
@@ -57,7 +58,7 @@ class InitCommand:
      def get_config_template(template_name: str) -> Dict[str, Any]:
          """Get configuration template by name."""
          if template_name == "minimal":
-             return {"version": "1.0", "targets": {"pyspark": {"output": "./build/pyspark"}}}
+             return {"version": "1.0", "default_target": "pyspark", "targets": {"pyspark": {"output": "./transformers/pyspark"}}}
          elif template_name == "advanced":
              config = DEFAULT_CONFIG.copy()
              config.update(
@@ -65,10 +66,10 @@ class InitCommand:
                  "style": "custom",
                  "aliases": {
                      "utils": "./src/utils",
-                     "build": "./build",
+                     "transformers": "./transformers",
                  },
                  "include": ["src/**/*"],
-                 "exclude": ["__pycache__", "build", "*.pyc", ".pytest_cache"],
+                 "exclude": ["__pycache__", "transformers", "*.pyc", ".pytest_cache"],
                  "testing": {"framework": "pytest", "test_dir": "./tests"},
              }
          )
@@ -184,7 +185,7 @@ class InitCommand:

          # Select targets with multi-select
          available_targets = {
-             "pyspark": {"output": "./build/pyspark", "name": "PySpark (Apache Spark)"},
+             "pyspark": {"output": "./transformers/pyspark", "name": "PySpark (Apache Spark)"},
          }

          selected_targets = InitCommand.prompt_for_targets(available_targets)
@@ -199,6 +200,31 @@ class InitCommand:

          # Update targets with user selections
          config["targets"] = selected_targets
+
+         # Set default target to the first selected target (or only target if single)
+         target_keys = list(selected_targets.keys())
+         if len(target_keys) == 1:
+             config["default_target"] = target_keys[0]
+         elif len(target_keys) > 1:
+             # Ask user to select default target
+             print(highlight("\nSelect Default Target"))
+             print(dim("Which platform should be used by default when running 'datacompose add'?\n"))
+             for i, key in enumerate(target_keys, 1):
+                 print(f" {i}. {key}")
+             print()
+
+             while True:
+                 choice = input(f"Select default target (1-{len(target_keys)}): ").strip()
+                 try:
+                     choice_idx = int(choice) - 1
+                     if 0 <= choice_idx < len(target_keys):
+                         config["default_target"] = target_keys[choice_idx]
+                         print(dim(f"Default target set to: {target_keys[choice_idx]}\n"))
+                         break
+                     else:
+                         print(error("Invalid selection. Please try again."))
+                 except ValueError:
+                     print(error("Please enter a number."))

          print()  # Add spacing
          return config
@@ -403,11 +429,11 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
              "2. Source your shell config or restart terminal for tab completion"
          )
          print(
-             "3. Add your first transformer: datacompose add clean_emails --target pyspark"
+             "3. Add your first transformer: datacompose add emails"
          )
      else:
          print(
-             "2. Add your first transformer: datacompose add clean_emails --target pyspark"
+             "2. Add your first transformer: datacompose add emails"
          )
      if not skip_completion:
          print(
@@ -419,7 +445,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
          print(success("✓ Tab completion configured"))
          print(
              highlight(
-                 "\nRun 'datacompose add clean_emails --target pyspark' to get started"
+                 "\nRun 'datacompose add emails' to get started"
              )
          )
          print(
@@ -430,7 +456,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
      else:
          print(
              highlight(
-                 "\nRun 'datacompose add clean_emails --target pyspark' to get started"
+                 "\nRun 'datacompose add emails' to get started"
              )
          )
      if not skip_completion and not yes:
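
For reference, a default `datacompose init` run with the single pyspark target now produces a config equivalent to `DEFAULT_CONFIG` above. A sketch of writing it out, reconstructed from this diff rather than copied from the package:

```python
import json

# Mirrors DEFAULT_CONFIG from init.py; `datacompose init` writes the
# equivalent datacompose.json at the project root.
config = {
    "version": "1.0",
    "default_target": "pyspark",  # consumed by `add` when --target is omitted
    "aliases": {"utils": "./src/utils"},
    "targets": {"pyspark": {"output": "./transformers/pyspark"}},
}

with open("datacompose.json", "w") as f:
    json.dump(config, f, indent=2)
```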
datacompose/cli/commands/list.py
@@ -95,7 +95,7 @@ class ListCommand:
          print(f" • {transformer_name}")

          print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
-         print("Example: datacompose add clean_emails --target pyspark")
+         print("Example: datacompose add emails --target pyspark")
          return 0

      @staticmethod
@@ -114,5 +114,5 @@ class ListCommand:
          print(f" • {gen_type} ({gen_class.__name__})")

          print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
-         print("Example: datacompose add clean_emails --target pyspark")
+         print("Example: datacompose add emails --target pyspark")
          return 0
datacompose/cli/config.py (new file)
@@ -0,0 +1,80 @@
+ """
+ Configuration management for Datacompose CLI.
+ """
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+
+ class ConfigLoader:
+     """Load and manage Datacompose configuration."""
+
+     DEFAULT_CONFIG_FILE = "datacompose.json"
+
+     @staticmethod
+     def load_config(config_path: Optional[Path] = None) -> Optional[Dict[str, Any]]:
+         """Load configuration from datacompose.json.
+
+         Args:
+             config_path: Optional path to config file. Defaults to ./datacompose.json
+
+         Returns:
+             Config dictionary or None if not found
+         """
+         if config_path is None:
+             config_path = Path(ConfigLoader.DEFAULT_CONFIG_FILE)
+
+         if not config_path.exists():
+             return None
+
+         try:
+             with open(config_path, 'r') as f:
+                 return json.load(f)
+         except (json.JSONDecodeError, IOError):
+             return None
+
+     @staticmethod
+     def get_default_target(config: Optional[Dict[str, Any]] = None) -> Optional[str]:
+         """Get the default target from config.
+
+         Args:
+             config: Optional config dict. If None, will load from file.
+
+         Returns:
+             Default target name or None
+         """
+         if config is None:
+             config = ConfigLoader.load_config()
+
+         if not config:
+             return None
+
+         # Check for explicit default_target setting
+         if "default_target" in config:
+             return config["default_target"]
+
+         # Otherwise use the first target if only one exists
+         targets = config.get("targets", {})
+         if len(targets) == 1:
+             return list(targets.keys())[0]
+
+         return None
+
+     @staticmethod
+     def get_target_output(config: Optional[Dict[str, Any]], target: str) -> Optional[str]:
+         """Get the output directory for a specific target.
+
+         Args:
+             config: Config dictionary
+             target: Target name
+
+         Returns:
+             Output directory path or None
+         """
+         if not config:
+             return None
+
+         targets = config.get("targets", {})
+         target_config = targets.get(target, {})
+         return target_config.get("output")
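
Taken together, the three `ConfigLoader` helpers cover everything the reworked add command needs. A quick usage sketch, assuming a datacompose.json like the one `init` now generates:

```python
from pathlib import Path

from datacompose.cli.config import ConfigLoader

config = ConfigLoader.load_config(Path("datacompose.json"))  # None if missing or invalid JSON
target = ConfigLoader.get_default_target(config)             # "pyspark" via default_target
output = ConfigLoader.get_target_output(config, target)      # "./transformers/pyspark"
print(target, output)
```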
datacompose/cli/main.py
@@ -25,9 +25,9 @@ def cli(ctx):
      """Generate data cleaning UDFs for various platforms.

      Examples:
-         datacompose init
-         datacompose add clean_emails --target pyspark
-         datacompose add clean_emails --target snowflake --output sql/udfs/
+         datacompose init            # Set up project with default target
+         datacompose add emails      # Uses default target from config
+         datacompose add emails --target snowflake --output sql/udfs/
          datacompose list targets
      """
      pass