datacompose 0.2.4.1__py3-none-any.whl → 0.2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/cli/__init__.py +1 -1
- datacompose/cli/commands/add.py +49 -21
- datacompose/cli/commands/init.py +35 -9
- datacompose/cli/commands/list.py +2 -2
- datacompose/cli/config.py +80 -0
- datacompose/cli/main.py +3 -3
- datacompose/generators/base.py +15 -14
- datacompose/generators/pyspark/generator.py +5 -10
- datacompose/operators/__init__.py +1 -1
- datacompose/operators/primitives.py +57 -19
- datacompose/transformers/text/{clean_addresses → addresses}/pyspark/pyspark_primitives.py +68 -13
- datacompose/transformers/text/{clean_emails → emails}/pyspark/pyspark_primitives.py +53 -1
- datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/pyspark/pyspark_primitives.py +416 -366
- datacompose-0.2.6.0.dist-info/METADATA +94 -0
- datacompose-0.2.6.0.dist-info/RECORD +31 -0
- datacompose-0.2.4.1.dist-info/METADATA +0 -449
- datacompose-0.2.4.1.dist-info/RECORD +0 -30
- /datacompose/transformers/text/{clean_addresses → addresses}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_emails → emails}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/__init__.py +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/WHEEL +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datacompose
|
|
3
|
+
Version: 0.2.6.0
|
|
4
|
+
Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
|
|
5
|
+
Author: Datacompose Contributors
|
|
6
|
+
Maintainer: Datacompose Contributors
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/tc-cole/datacompose
|
|
9
|
+
Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
|
|
10
|
+
Project-URL: Repository, https://github.com/tc-cole/datacompose.git
|
|
11
|
+
Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
|
|
13
|
+
Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Topic :: Software Development :: Code Generators
|
|
17
|
+
Classifier: Topic :: Database
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
26
|
+
Classifier: Operating System :: OS Independent
|
|
27
|
+
Requires-Python: >=3.8
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Dist: jinja2>=3.0.0
|
|
31
|
+
Requires-Dist: pyyaml>=6.0
|
|
32
|
+
Requires-Dist: click>=8.0.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
38
|
+
Provides-Extra: docs
|
|
39
|
+
Requires-Dist: mkdocs>=1.5.3; extra == "docs"
|
|
40
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
|
|
41
|
+
Requires-Dist: mkdocs-material-extensions>=1.3; extra == "docs"
|
|
42
|
+
Requires-Dist: mkdocs-minify-plugin>=0.7.1; extra == "docs"
|
|
43
|
+
Requires-Dist: mkdocs-redirects>=1.2.1; extra == "docs"
|
|
44
|
+
Requires-Dist: mike>=2.0.0; extra == "docs"
|
|
45
|
+
Requires-Dist: pymdown-extensions>=10.5; extra == "docs"
|
|
46
|
+
Requires-Dist: pygments>=2.17.0; extra == "docs"
|
|
47
|
+
Requires-Dist: mkdocs-git-revision-date-localized-plugin>=1.2.2; extra == "docs"
|
|
48
|
+
Requires-Dist: mkdocs-glightbox>=0.3.5; extra == "docs"
|
|
49
|
+
Dynamic: license-file
|
|
50
|
+
|
|
51
|
+
# Datacompose
|
|
52
|
+
|
|
53
|
+
[](https://pypi.org/project/datacompose/)
|
|
54
|
+
[](https://www.python.org/downloads/)
|
|
55
|
+
[](https://github.com/your-username/datacompose)
|
|
56
|
+
[](https://opensource.org/licenses/MIT)
|
|
57
|
+
|
|
58
|
+
A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install datacompose
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## What is Datacompose?
|
|
67
|
+
|
|
68
|
+
Datacompose provides production-ready PySpark data transformation primitives that become part of YOUR codebase. Inspired by [shadcn](https://ui.shadcn.com/)'s approach to components, we believe in giving you full ownership and control over your code.
|
|
69
|
+
|
|
70
|
+
### Key Features
|
|
71
|
+
|
|
72
|
+
- **No Runtime Dependencies**: Standalone PySpark code that runs without Datacompose
|
|
73
|
+
- **Composable Primitives**: Build complex transformations from simple, reusable functions
|
|
74
|
+
- **Smart Partial Application**: Pre-configure transformations with parameters for reuse
|
|
75
|
+
- **Optimized Operations**: Efficient Spark transformations with minimal overhead
|
|
76
|
+
- **Comprehensive Libraries**: Pre-built primitives for emails, addresses, and phone numbers
|
|
77
|
+
|
|
78
|
+
### Available Transformers
|
|
79
|
+
|
|
80
|
+
- **Emails**: Validation, extraction, standardization, typo correction
|
|
81
|
+
- **Addresses**: Street parsing, state/zip validation, PO Box detection
|
|
82
|
+
- **Phone Numbers**: NANP/international validation, formatting, toll-free detection
|
|
83
|
+
|
|
84
|
+
## Documentation
|
|
85
|
+
|
|
86
|
+
For detailed documentation, examples, and API reference, visit [datacompose.io](https://datacompose.io).
|
|
87
|
+
|
|
88
|
+
## Philosophy
|
|
89
|
+
|
|
90
|
+
This is NOT a traditional library - it gives you production-ready data transformation primitives that you can modify to fit your exact needs. You own the code, with no external dependencies to manage and no breaking changes to worry about.
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
MIT License - see LICENSE file for details
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
datacompose/__init__.py,sha256=kbQEzEvheAMsm3-MT4nWSuX42fjrfgDWoR8WZmC_WX4,34
|
|
2
|
+
datacompose/cli/__init__.py,sha256=UuHpU3QypWZC9NEjI89BNUW_PcRfMUyJbZDKJvAm1hQ,109
|
|
3
|
+
datacompose/cli/colors.py,sha256=Ax7jHhdAIuq5x3663gJ7_MzFCBOJv38DqNXts5t4SLs,1756
|
|
4
|
+
datacompose/cli/config.py,sha256=vvY6xGgIdybUuujdPrDI_CsSUUD9CEfODG8Kem4jqVQ,2353
|
|
5
|
+
datacompose/cli/main.py,sha256=d87hG1nxDVNTRZVi7ctNQQ06lwe5KbLGJMChlHtR3Kc,1343
|
|
6
|
+
datacompose/cli/validation.py,sha256=8WMZ9wtPgFk9eBgMS_wtkncFz_-BmH4E8V57tjp3YoI,2526
|
|
7
|
+
datacompose/cli/commands/__init__.py,sha256=Bu58UsnkGRbVFS92U2Px_KxlUPrdlbSY6wlvP6tet2o,38
|
|
8
|
+
datacompose/cli/commands/add.py,sha256=N4tqtOEXmaP6vpO4ZfhFH6qBznmFOYRU3y9TEvc2da0,7707
|
|
9
|
+
datacompose/cli/commands/init.py,sha256=prLBWsvuAxRWiT8UhqqhaS9tWc9sUToM6MrGWCdjakM,17986
|
|
10
|
+
datacompose/cli/commands/list.py,sha256=mXihUMrnwLUoIG-FpNb8-XJ0VZfh0v3exHq1m_Mrprg,3855
|
|
11
|
+
datacompose/generators/__init__.py,sha256=dFJWJScu8mkP0ZKIQtVlJ36PQW-LwCYBijuNwLSevZw,48
|
|
12
|
+
datacompose/generators/base.py,sha256=EgpHwaaSxAP1Ygq5Wtyq4ez-wG0oPwDEbiKgLsEilD0,6761
|
|
13
|
+
datacompose/generators/pyspark/__init__.py,sha256=ayoKDGtbt2KwFcNt2QxHKt8z83Kzy4ySw9Gg7j9ZMTY,33
|
|
14
|
+
datacompose/generators/pyspark/generator.py,sha256=be4GATA8rmLAg4_wZ3Ox3vC3up_OXMOajjIUJQrDQ10,1735
|
|
15
|
+
datacompose/operators/__init__.py,sha256=Eacc0JDCzeuTeLGO_N9Nz9pOc1D3_6BxEcpCibzrpz8,588
|
|
16
|
+
datacompose/operators/primitives.py,sha256=FxhtgP7aizKsnNBgh5oTqwc9m8QSjLTpRoG5zu6rFns,23615
|
|
17
|
+
datacompose/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
datacompose/transformers/discovery.py,sha256=JgMtmd8PkhmwLqS17NeKSv9MRneY9tdOsOK32A6luOQ,7048
|
|
19
|
+
datacompose/transformers/text/__init__.py,sha256=Mq0UmgYlBV8T18IkvHAS1TImEzWyGciCqxaCv324hFQ,36
|
|
20
|
+
datacompose/transformers/text/addresses/__init__.py,sha256=l5TItGrGBn69Mlq0CaRGJa-SwpyuUEYWvG5N26s3Pco,39
|
|
21
|
+
datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py,sha256=gf_-UbCglfsT3HoNdw8ZAXUT7zS4h9uZ2BevG05GwpY,62389
|
|
22
|
+
datacompose/transformers/text/emails/__init__.py,sha256=snZLOJsxrPDOi8gIISlRxc6YlskKxUyu0NnOZCE5cIU,34
|
|
23
|
+
datacompose/transformers/text/emails/pyspark/pyspark_primitives.py,sha256=zQpntW4-RsDv5C1TWp0put10UyUEamP1BxvVYbr2Q58,23785
|
|
24
|
+
datacompose/transformers/text/phone_numbers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
+
datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py,sha256=PhyC4GEHJiUVHNulRDEjLUoZgnlBnxN9VKzLr802QrI,28856
|
|
26
|
+
datacompose-0.2.6.0.dist-info/licenses/LICENSE,sha256=SCPOqmPhMikiyYDlKZ877fGHaE2O45cDBoJIomrlpDU,1067
|
|
27
|
+
datacompose-0.2.6.0.dist-info/METADATA,sha256=n5GlJ73W6LwLojmtIMLX_rqX7qCBd60lzcQo-iEzUEM,4352
|
|
28
|
+
datacompose-0.2.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
29
|
+
datacompose-0.2.6.0.dist-info/entry_points.txt,sha256=oeG9oGgDwajk4v0C1awdUTBx2GmhLpuNHCTAV-jurUc,58
|
|
30
|
+
datacompose-0.2.6.0.dist-info/top_level.txt,sha256=AX1qGkuJMD2YJLZKo40h-w4MeFxDZL6W1vbKKuTpW8I,12
|
|
31
|
+
datacompose-0.2.6.0.dist-info/RECORD,,
|
|
@@ -1,449 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: datacompose
|
|
3
|
-
Version: 0.2.4.1
|
|
4
|
-
Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
|
|
5
|
-
Author: Datacompose Contributors
|
|
6
|
-
Maintainer: Datacompose Contributors
|
|
7
|
-
License: MIT
|
|
8
|
-
Project-URL: Homepage, https://github.com/tc-cole/datacompose
|
|
9
|
-
Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
|
|
10
|
-
Project-URL: Repository, https://github.com/tc-cole/datacompose.git
|
|
11
|
-
Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
|
|
12
|
-
Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
|
|
13
|
-
Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
|
|
14
|
-
Classifier: Development Status :: 4 - Beta
|
|
15
|
-
Classifier: Intended Audience :: Developers
|
|
16
|
-
Classifier: Topic :: Software Development :: Code Generators
|
|
17
|
-
Classifier: Topic :: Database
|
|
18
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
20
|
-
Classifier: Programming Language :: Python :: 3
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
22
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
23
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
24
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
25
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
26
|
-
Classifier: Operating System :: OS Independent
|
|
27
|
-
Requires-Python: >=3.8
|
|
28
|
-
Description-Content-Type: text/markdown
|
|
29
|
-
License-File: LICENSE
|
|
30
|
-
Requires-Dist: jinja2>=3.0.0
|
|
31
|
-
Requires-Dist: pyyaml>=6.0
|
|
32
|
-
Requires-Dist: click>=8.0.0
|
|
33
|
-
Provides-Extra: dev
|
|
34
|
-
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
35
|
-
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
36
|
-
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
37
|
-
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
38
|
-
Provides-Extra: docs
|
|
39
|
-
Requires-Dist: sphinx>=7.2.0; extra == "docs"
|
|
40
|
-
Requires-Dist: furo>=2024.1.0; extra == "docs"
|
|
41
|
-
Requires-Dist: myst-parser>=2.0.0; extra == "docs"
|
|
42
|
-
Requires-Dist: sphinx-autodoc-typehints>=1.25.0; extra == "docs"
|
|
43
|
-
Requires-Dist: sphinx-copybutton>=0.5.2; extra == "docs"
|
|
44
|
-
Requires-Dist: sphinx-tabs>=3.4.0; extra == "docs"
|
|
45
|
-
Requires-Dist: sphinx-click>=5.1.0; extra == "docs"
|
|
46
|
-
Dynamic: license-file
|
|
47
|
-
|
|
48
|
-
# Datacompose
|
|
49
|
-
|
|
50
|
-
[](https://pypi.org/project/datacompose/)
|
|
51
|
-
[](https://www.python.org/downloads/)
|
|
52
|
-
[](https://github.com/your-username/datacompose)
|
|
53
|
-
[](https://opensource.org/licenses/MIT)
|
|
54
|
-
|
|
55
|
-
A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
|
|
56
|
-
|
|
57
|
-
## Overview
|
|
58
|
-
|
|
59
|
-
Datacompose provides a declarative way to build data transformation pipelines using composable primitives. It generates optimized, standalone PySpark code that can be deployed without runtime dependencies.
|
|
60
|
-
|
|
61
|
-
## Key Features
|
|
62
|
-
|
|
63
|
-
- **Composable Primitives**: Build complex transformations from simple, reusable functions
|
|
64
|
-
- **Smart Partial Application**: Configure transformations with parameters for reuse
|
|
65
|
-
- **Pipeline Compilation**: Convert declarative pipeline definitions into optimized Spark operations
|
|
66
|
-
- **Code Generation**: Generate standalone PySpark code with embedded dependencies
|
|
67
|
-
- **Comprehensive Libraries**: Pre-built primitives for emails, addresses, and phone numbers
|
|
68
|
-
- **Conditional Logic**: Support for if/else branching in pipelines
|
|
69
|
-
- **Type-Safe Operations**: All transformations maintain Spark column type safety
|
|
70
|
-
|
|
71
|
-
## Installation
|
|
72
|
-
|
|
73
|
-
```bash
|
|
74
|
-
pip install datacompose
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
## Quick Start
|
|
78
|
-
|
|
79
|
-
### 1. Initialize a Project
|
|
80
|
-
|
|
81
|
-
```bash
|
|
82
|
-
datacompose init
|
|
83
|
-
```
|
|
84
|
-
|
|
85
|
-
This creates a `datacompose.json` configuration file with default settings.
|
|
86
|
-
|
|
87
|
-
### 2. Generate Transformation Code
|
|
88
|
-
|
|
89
|
-
```bash
|
|
90
|
-
# Generate email cleaning primitives
|
|
91
|
-
datacompose add clean_emails --target pyspark
|
|
92
|
-
|
|
93
|
-
# Generate address standardization primitives
|
|
94
|
-
datacompose add clean_addresses --target pyspark
|
|
95
|
-
|
|
96
|
-
# Generate phone number validation primitives
|
|
97
|
-
datacompose add clean_phone_numbers --target pyspark
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
### 3. Use the Generated Code
|
|
101
|
-
|
|
102
|
-
```python
|
|
103
|
-
from pyspark.sql import SparkSession
|
|
104
|
-
from pyspark.sql import functions as F
|
|
105
|
-
|
|
106
|
-
# Import the generated primitives
|
|
107
|
-
from build.pyspark.clean_emails.email_primitives import emails
|
|
108
|
-
|
|
109
|
-
# Create Spark session
|
|
110
|
-
spark = SparkSession.builder.appName("DataCleaning").getOrCreate()
|
|
111
|
-
|
|
112
|
-
# Load your data
|
|
113
|
-
df = spark.read.csv("data.csv", header=True)
|
|
114
|
-
|
|
115
|
-
# Apply email transformations
|
|
116
|
-
cleaned_df = df.withColumn(
|
|
117
|
-
"email_clean",
|
|
118
|
-
emails.standardize_email(F.col("email"))
|
|
119
|
-
).withColumn(
|
|
120
|
-
"email_domain",
|
|
121
|
-
emails.extract_domain(F.col("email_clean"))
|
|
122
|
-
).withColumn(
|
|
123
|
-
"is_valid",
|
|
124
|
-
emails.is_valid_email(F.col("email_clean"))
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
# Filter to valid emails only
|
|
128
|
-
valid_emails = cleaned_df.filter(F.col("is_valid"))
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
## Core Concepts
|
|
132
|
-
|
|
133
|
-
### PrimitiveRegistry
|
|
134
|
-
|
|
135
|
-
A container for organizing related transformation functions:
|
|
136
|
-
|
|
137
|
-
```python
|
|
138
|
-
from datacompose.operators.primitives import PrimitiveRegistry
|
|
139
|
-
|
|
140
|
-
# Create a registry for text operations
|
|
141
|
-
text = PrimitiveRegistry("text")
|
|
142
|
-
|
|
143
|
-
# Register transformation functions
|
|
144
|
-
@text.register()
|
|
145
|
-
def lowercase(col):
|
|
146
|
-
return F.lower(col)
|
|
147
|
-
|
|
148
|
-
@text.register()
|
|
149
|
-
def remove_spaces(col):
|
|
150
|
-
return F.regexp_replace(col, r'\s+', '')
|
|
151
|
-
|
|
152
|
-
# Use the transformations
|
|
153
|
-
df = df.withColumn("clean_text", text.lowercase(F.col("input")))
|
|
154
|
-
```
|
|
155
|
-
|
|
156
|
-
### SmartPrimitive
|
|
157
|
-
|
|
158
|
-
Enables partial application of transformations:
|
|
159
|
-
|
|
160
|
-
```python
|
|
161
|
-
@text.register()
|
|
162
|
-
def trim(col, chars=' '):
|
|
163
|
-
return F.trim(col, chars)
|
|
164
|
-
|
|
165
|
-
# Direct usage
|
|
166
|
-
df = df.withColumn("trimmed", text.trim(F.col("input")))
|
|
167
|
-
|
|
168
|
-
# Pre-configured usage
|
|
169
|
-
trim_tabs = text.trim(chars='\t')
|
|
170
|
-
df = df.withColumn("no_tabs", trim_tabs(F.col("input")))
|
|
171
|
-
```
|
|
172
|
-
|
|
173
|
-
### Pipeline Composition
|
|
174
|
-
|
|
175
|
-
Build complex pipelines from simple primitives:
|
|
176
|
-
|
|
177
|
-
```python
|
|
178
|
-
@text.compose(text=text)
|
|
179
|
-
def clean_pipeline():
|
|
180
|
-
text.trim()
|
|
181
|
-
text.lowercase()
|
|
182
|
-
text.remove_spaces()
|
|
183
|
-
|
|
184
|
-
# Apply the entire pipeline
|
|
185
|
-
df = df.withColumn("cleaned", clean_pipeline(F.col("input")))
|
|
186
|
-
```
|
|
187
|
-
|
|
188
|
-
### Conditional Pipelines
|
|
189
|
-
|
|
190
|
-
Add conditional logic to your transformations:
|
|
191
|
-
|
|
192
|
-
```python
|
|
193
|
-
@text.register(is_conditional=True)
|
|
194
|
-
def is_valid_length(col):
|
|
195
|
-
return F.length(col) > 5
|
|
196
|
-
|
|
197
|
-
@text.register()
|
|
198
|
-
def truncate(col):
|
|
199
|
-
return F.substring(col, 1, 5)
|
|
200
|
-
|
|
201
|
-
@text.compose(text=text)
|
|
202
|
-
def smart_truncate():
|
|
203
|
-
if text.is_valid_length():
|
|
204
|
-
text.truncate()
|
|
205
|
-
```
|
|
206
|
-
|
|
207
|
-
## Available Primitives
|
|
208
|
-
|
|
209
|
-
### Email Primitives
|
|
210
|
-
|
|
211
|
-
```python
|
|
212
|
-
from build.pyspark.clean_emails.email_primitives import emails
|
|
213
|
-
|
|
214
|
-
# Validation
|
|
215
|
-
emails.is_valid_email(col)
|
|
216
|
-
emails.is_business_email(col)
|
|
217
|
-
emails.is_disposable_email(col)
|
|
218
|
-
|
|
219
|
-
# Extraction
|
|
220
|
-
emails.extract_domain(col)
|
|
221
|
-
emails.extract_username(col)
|
|
222
|
-
emails.extract_tld(col)
|
|
223
|
-
|
|
224
|
-
# Standardization
|
|
225
|
-
emails.standardize_email(col)
|
|
226
|
-
emails.normalize_gmail(col)
|
|
227
|
-
emails.fix_common_typos(col)
|
|
228
|
-
|
|
229
|
-
# Filtering
|
|
230
|
-
emails.filter_valid_emails(col)
|
|
231
|
-
emails.filter_business_emails(col)
|
|
232
|
-
```
|
|
233
|
-
|
|
234
|
-
### Address Primitives
|
|
235
|
-
|
|
236
|
-
```python
|
|
237
|
-
from build.pyspark.clean_addresses.address_primitives import addresses
|
|
238
|
-
|
|
239
|
-
# Extraction
|
|
240
|
-
addresses.extract_street_number(col)
|
|
241
|
-
addresses.extract_street_name(col)
|
|
242
|
-
addresses.extract_city(col)
|
|
243
|
-
addresses.extract_state(col)
|
|
244
|
-
addresses.extract_zip_code(col)
|
|
245
|
-
|
|
246
|
-
# Standardization
|
|
247
|
-
addresses.standardize_state(col)
|
|
248
|
-
addresses.standardize_street_suffix(col)
|
|
249
|
-
addresses.standardize_direction(col)
|
|
250
|
-
|
|
251
|
-
# Validation
|
|
252
|
-
addresses.is_valid_zip_code(col)
|
|
253
|
-
addresses.is_valid_state(col)
|
|
254
|
-
addresses.is_po_box(col)
|
|
255
|
-
```
|
|
256
|
-
|
|
257
|
-
### Phone Number Primitives
|
|
258
|
-
|
|
259
|
-
```python
|
|
260
|
-
from build.pyspark.clean_phone_numbers.phone_primitives import phones
|
|
261
|
-
|
|
262
|
-
# Validation
|
|
263
|
-
phones.is_valid_nanp(col)
|
|
264
|
-
phones.is_valid_international(col)
|
|
265
|
-
phones.is_toll_free(col)
|
|
266
|
-
|
|
267
|
-
# Extraction
|
|
268
|
-
phones.extract_country_code(col)
|
|
269
|
-
phones.extract_area_code(col)
|
|
270
|
-
phones.extract_exchange(col)
|
|
271
|
-
phones.extract_subscriber(col)
|
|
272
|
-
|
|
273
|
-
# Formatting
|
|
274
|
-
phones.format_nanp(col)
|
|
275
|
-
phones.format_e164(col)
|
|
276
|
-
phones.format_international(col)
|
|
277
|
-
|
|
278
|
-
# Standardization
|
|
279
|
-
phones.standardize_phone(col)
|
|
280
|
-
phones.clean_phone(col)
|
|
281
|
-
```
|
|
282
|
-
|
|
283
|
-
## Advanced Usage
|
|
284
|
-
|
|
285
|
-
### Creating Custom Primitives
|
|
286
|
-
|
|
287
|
-
```python
|
|
288
|
-
from datacompose.operators.primitives import PrimitiveRegistry
|
|
289
|
-
|
|
290
|
-
# Create your own registry
|
|
291
|
-
custom = PrimitiveRegistry("custom")
|
|
292
|
-
|
|
293
|
-
@custom.register()
|
|
294
|
-
def remove_special_chars(col):
|
|
295
|
-
return F.regexp_replace(col, r'[^a-zA-Z0-9\s]', '')
|
|
296
|
-
|
|
297
|
-
@custom.register()
|
|
298
|
-
def capitalize_words(col):
|
|
299
|
-
return F.initcap(col)
|
|
300
|
-
|
|
301
|
-
@custom.register(is_conditional=True)
|
|
302
|
-
def contains_numbers(col):
|
|
303
|
-
return col.rlike(r'\d+')
|
|
304
|
-
|
|
305
|
-
# Create a pipeline with your custom primitives
|
|
306
|
-
@custom.compose(custom=custom)
|
|
307
|
-
def clean_text():
|
|
308
|
-
custom.remove_special_chars()
|
|
309
|
-
if custom.contains_numbers():
|
|
310
|
-
custom.capitalize_words()
|
|
311
|
-
```
|
|
312
|
-
|
|
313
|
-
### Working with Parameters
|
|
314
|
-
|
|
315
|
-
```python
|
|
316
|
-
@custom.register()
|
|
317
|
-
def pad_string(col, length=10, fill_char='0'):
|
|
318
|
-
return F.lpad(col, length, fill_char)
|
|
319
|
-
|
|
320
|
-
# Use with different parameters
|
|
321
|
-
df = df.withColumn("padded_10", custom.pad_string(F.col("id")))
|
|
322
|
-
df = df.withColumn("padded_5", custom.pad_string(length=5)(F.col("id")))
|
|
323
|
-
df = df.withColumn("padded_x", custom.pad_string(length=8, fill_char='X')(F.col("id")))
|
|
324
|
-
```
|
|
325
|
-
|
|
326
|
-
### Combining Multiple Registries
|
|
327
|
-
|
|
328
|
-
```python
|
|
329
|
-
from build.pyspark.clean_emails.email_primitives import emails
|
|
330
|
-
from build.pyspark.clean_phones.phone_primitives import phones
|
|
331
|
-
|
|
332
|
-
# Create a combined validation pipeline
|
|
333
|
-
validation = PrimitiveRegistry("validation")
|
|
334
|
-
|
|
335
|
-
@validation.compose(emails=emails, phones=phones)
|
|
336
|
-
def validate_contact_info():
|
|
337
|
-
# Check email
|
|
338
|
-
if emails.is_valid_email():
|
|
339
|
-
emails.standardize_email()
|
|
340
|
-
|
|
341
|
-
# Check phone
|
|
342
|
-
if phones.is_valid_phone():
|
|
343
|
-
phones.standardize_phone()
|
|
344
|
-
```
|
|
345
|
-
|
|
346
|
-
## CLI Commands
|
|
347
|
-
|
|
348
|
-
### Initialize a Project
|
|
349
|
-
```bash
|
|
350
|
-
datacompose init [--yes]
|
|
351
|
-
```
|
|
352
|
-
|
|
353
|
-
### Add Transformers
|
|
354
|
-
```bash
|
|
355
|
-
datacompose add <transformer> [--target TARGET] [--output OUTPUT] [--verbose]
|
|
356
|
-
|
|
357
|
-
# Examples
|
|
358
|
-
datacompose add clean_emails --target pyspark
|
|
359
|
-
datacompose add clean_addresses --target pyspark --output ./custom/path
|
|
360
|
-
datacompose add clean_phone_numbers --target pyspark --verbose
|
|
361
|
-
```
|
|
362
|
-
|
|
363
|
-
### List Available Transformers
|
|
364
|
-
```bash
|
|
365
|
-
datacompose list transformers
|
|
366
|
-
datacompose list generators
|
|
367
|
-
```
|
|
368
|
-
|
|
369
|
-
## Project Structure
|
|
370
|
-
|
|
371
|
-
After running `datacompose add`, your project will have the following structure:
|
|
372
|
-
|
|
373
|
-
```
|
|
374
|
-
project/
|
|
375
|
-
├── datacompose.json # Configuration file
|
|
376
|
-
├── build/
|
|
377
|
-
│ └── pyspark/
|
|
378
|
-
│ ├── clean_emails/
|
|
379
|
-
│ │ ├── email_primitives.py # Generated email primitives
|
|
380
|
-
│ │ └── utils/
|
|
381
|
-
│ │ └── primitives.py # Core framework (embedded)
|
|
382
|
-
│ ├── clean_addresses/
|
|
383
|
-
│ │ ├── address_primitives.py
|
|
384
|
-
│ │ └── utils/
|
|
385
|
-
│ │ └── primitives.py
|
|
386
|
-
│ └── clean_phone_numbers/
|
|
387
|
-
│ ├── phone_primitives.py
|
|
388
|
-
│ └── utils/
|
|
389
|
-
│ └── primitives.py
|
|
390
|
-
```
|
|
391
|
-
|
|
392
|
-
## Configuration
|
|
393
|
-
|
|
394
|
-
The `datacompose.json` file configures default settings:
|
|
395
|
-
|
|
396
|
-
```json
|
|
397
|
-
{
|
|
398
|
-
"version": "1.0.0",
|
|
399
|
-
"targets": {
|
|
400
|
-
"pyspark": {
|
|
401
|
-
"output": "./build/pyspark",
|
|
402
|
-
"generator": "SparkPandasUDFGenerator"
|
|
403
|
-
}
|
|
404
|
-
},
|
|
405
|
-
"templates": {
|
|
406
|
-
"directory": "src/transformers/templates"
|
|
407
|
-
}
|
|
408
|
-
}
|
|
409
|
-
```
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
## Performance Considerations
|
|
414
|
-
|
|
415
|
-
- Primitives are designed to be efficient Spark operations
|
|
416
|
-
- Pipelines are compiled to minimize intermediate columns
|
|
417
|
-
- Conditional logic uses Spark's `when/otherwise` for vectorized operations
|
|
418
|
-
- Generated code has no runtime dependencies beyond PySpark
|
|
419
|
-
|
|
420
|
-
## Philosophy & Inspiration
|
|
421
|
-
|
|
422
|
-
Datacompose is inspired by [shadcn-svelte](https://www.shadcn-svelte.com/) and [huntabyte](https://github.com/huntabyte)'s approach to component libraries. Just as shadcn-svelte provides "copy and paste" components rather than npm packages, Datacompose generates data transformation code that becomes part of YOUR codebase.
|
|
423
|
-
|
|
424
|
-
**Why we believe in this approach:**
|
|
425
|
-
|
|
426
|
-
- **You Own Your Code**: No external dependencies to manage and no breaking changes to worry about
|
|
427
|
-
- **Full Transparency**: Every transformation is readable, debuggable PySpark code you can understand
|
|
428
|
-
- **Customization First**: Need to adjust a transformation? Just edit the code
|
|
429
|
-
- **Learn by Reading**: The generated code serves as documentation and learning material
|
|
430
|
-
|
|
431
|
-
This is NOT a traditional library - it's a code generator that gives you production-ready data transformation primitives that you can modify to fit your exact needs.
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
## Test Coverage
|
|
435
|
-
|
|
436
|
-
**Critical components are thoroughly tested:**
|
|
437
|
-
|
|
438
|
-
| Component | Coverage | Tests |
|
|
439
|
-
|-----------|----------|-------|
|
|
440
|
-
| **Phone Number Primitives** | 95% | ✅ All formats validated |
|
|
441
|
-
| **Address Primitives** | 94% | ✅ Full parsing tested |
|
|
442
|
-
| **Email Primitives** | 89% | ✅ RFC compliant |
|
|
443
|
-
| **Code Generation** | 87-91% | ✅ All targets verified |
|
|
444
|
-
|
|
445
|
-
**335 tests passing** • **76% overall coverage**
|
|
446
|
-
|
|
447
|
-
## License
|
|
448
|
-
|
|
449
|
-
MIT License - see LICENSE file for details
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
datacompose/__init__.py,sha256=kbQEzEvheAMsm3-MT4nWSuX42fjrfgDWoR8WZmC_WX4,34
|
|
2
|
-
datacompose/cli/__init__.py,sha256=a5UAbPnAm87gUq4YaoTPN5umknN5s2xgnztatI03So0,107
|
|
3
|
-
datacompose/cli/colors.py,sha256=Ax7jHhdAIuq5x3663gJ7_MzFCBOJv38DqNXts5t4SLs,1756
|
|
4
|
-
datacompose/cli/main.py,sha256=NjA6Uy1_A-xGaAEKKdXOrtMbAxOZ9Cn1aNDNYgHW9rg,1273
|
|
5
|
-
datacompose/cli/validation.py,sha256=8WMZ9wtPgFk9eBgMS_wtkncFz_-BmH4E8V57tjp3YoI,2526
|
|
6
|
-
datacompose/cli/commands/__init__.py,sha256=Bu58UsnkGRbVFS92U2Px_KxlUPrdlbSY6wlvP6tet2o,38
|
|
7
|
-
datacompose/cli/commands/add.py,sha256=Gk38dMHSeOHwtdG3ZZNQ5Zx2qe6rw6kFW2qE0aJLNN8,6710
|
|
8
|
-
datacompose/cli/commands/init.py,sha256=XEgxlXJn6JnkfqYFIJh_pqeUEAvosTTaJqisT67vhQI,16724
|
|
9
|
-
datacompose/cli/commands/list.py,sha256=MmRxMnghBLagg6IEh4lqCK0WR-0Ku-jxH8AT6WlajuU,3867
|
|
10
|
-
datacompose/generators/__init__.py,sha256=dFJWJScu8mkP0ZKIQtVlJ36PQW-LwCYBijuNwLSevZw,48
|
|
11
|
-
datacompose/generators/base.py,sha256=y0ATC8semn8KbZ_8P_aQvuvAmAQ-u-orN8aoWYdUpTc,6569
|
|
12
|
-
datacompose/generators/pyspark/__init__.py,sha256=ayoKDGtbt2KwFcNt2QxHKt8z83Kzy4ySw9Gg7j9ZMTY,33
|
|
13
|
-
datacompose/generators/pyspark/generator.py,sha256=_dVCEmxcJoaTp5xfgaXPSmxaeC0CuhZjpDB4AZOjaH0,1998
|
|
14
|
-
datacompose/operators/__init__.py,sha256=6g7Hp5261TkPghRgTfxKrizx0OH3Zga3OKHZ37I9_4E,586
|
|
15
|
-
datacompose/operators/primitives.py,sha256=rIERyKfPIULngHs9fRewXo6VjmbjyiOXvTCqiHGIur8,22022
|
|
16
|
-
datacompose/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
datacompose/transformers/discovery.py,sha256=JgMtmd8PkhmwLqS17NeKSv9MRneY9tdOsOK32A6luOQ,7048
|
|
18
|
-
datacompose/transformers/text/__init__.py,sha256=Mq0UmgYlBV8T18IkvHAS1TImEzWyGciCqxaCv324hFQ,36
|
|
19
|
-
datacompose/transformers/text/clean_addresses/__init__.py,sha256=l5TItGrGBn69Mlq0CaRGJa-SwpyuUEYWvG5N26s3Pco,39
|
|
20
|
-
datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py,sha256=wiENqpdcVAlfNhsyIM-JjX7lye4xcW0h1INHYMRrYlE,60249
|
|
21
|
-
datacompose/transformers/text/clean_emails/__init__.py,sha256=snZLOJsxrPDOi8gIISlRxc6YlskKxUyu0NnOZCE5cIU,34
|
|
22
|
-
datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py,sha256=vIgPAcc6t8UCYSzvi79UplkTFqY9jIR9brZyhAhtLwY,21802
|
|
23
|
-
datacompose/transformers/text/clean_phone_numbers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py,sha256=BsCfgncFxM77M3k0hEZyARPJk4kq1PQZB40YRc9RR8M,26279
|
|
25
|
-
datacompose-0.2.4.1.dist-info/licenses/LICENSE,sha256=SCPOqmPhMikiyYDlKZ877fGHaE2O45cDBoJIomrlpDU,1067
|
|
26
|
-
datacompose-0.2.4.1.dist-info/METADATA,sha256=EVPvk2ik_kKdMd9PmAxWhAPY-XHEoJPp56kLaEN9qX4,12711
|
|
27
|
-
datacompose-0.2.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
28
|
-
datacompose-0.2.4.1.dist-info/entry_points.txt,sha256=oeG9oGgDwajk4v0C1awdUTBx2GmhLpuNHCTAV-jurUc,58
|
|
29
|
-
datacompose-0.2.4.1.dist-info/top_level.txt,sha256=AX1qGkuJMD2YJLZKo40h-w4MeFxDZL6W1vbKKuTpW8I,12
|
|
30
|
-
datacompose-0.2.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|