datacompose 0.2.6.1__py3-none-any.whl → 0.2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/cli/__init__.py +1 -1
- datacompose/cli/commands/add.py +14 -0
- datacompose/cli/config.py +2 -0
- datacompose/cli/main.py +1 -1
- datacompose/operators/__init__.py +1 -1
- datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py +2 -0
- datacompose/transformers/text/emails/pyspark/pyspark_primitives.py +17 -0
- datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py +15 -0
- datacompose-0.2.7.0.dist-info/METADATA +176 -0
- {datacompose-0.2.6.1.dist-info → datacompose-0.2.7.0.dist-info}/RECORD +14 -14
- datacompose-0.2.6.1.dist-info/METADATA +0 -94
- {datacompose-0.2.6.1.dist-info → datacompose-0.2.7.0.dist-info}/WHEEL +0 -0
- {datacompose-0.2.6.1.dist-info → datacompose-0.2.7.0.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.6.1.dist-info → datacompose-0.2.7.0.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.6.1.dist-info → datacompose-0.2.7.0.dist-info}/top_level.txt +0 -0
datacompose/cli/__init__.py
CHANGED
datacompose/cli/commands/add.py
CHANGED
|
@@ -111,6 +111,20 @@ def add(ctx, transformer, target, type, output, verbose):
|
|
|
111
111
|
config = ConfigLoader.load_config()
|
|
112
112
|
|
|
113
113
|
if target is None:
|
|
114
|
+
# If no config file exists or is malformed, fail early
|
|
115
|
+
if config is None:
|
|
116
|
+
print(
|
|
117
|
+
error(
|
|
118
|
+
"Error: No target specified and no config file found"
|
|
119
|
+
)
|
|
120
|
+
)
|
|
121
|
+
print(
|
|
122
|
+
info(
|
|
123
|
+
"Please specify a target with --target or run 'datacompose init' to set up defaults"
|
|
124
|
+
)
|
|
125
|
+
)
|
|
126
|
+
ctx.exit(1)
|
|
127
|
+
|
|
114
128
|
# Try to get default target from config
|
|
115
129
|
target = ConfigLoader.get_default_target(config)
|
|
116
130
|
if target is None:
|
datacompose/cli/config.py
CHANGED
datacompose/cli/main.py
CHANGED
|
@@ -19,7 +19,7 @@ from datacompose.cli.commands.list import list_cmd
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
@click.group()
|
|
22
|
-
@click.version_option("0.2.6.1", prog_name="datacompose")
|
|
22
|
+
@click.version_option("0.2.7.0", prog_name="datacompose")
|
|
23
23
|
@click.pass_context
|
|
24
24
|
def cli(ctx):
|
|
25
25
|
"""Generate data cleaning UDFs for various platforms.
|
|
@@ -749,6 +749,23 @@ def get_email_provider(col: Column) -> Column:
|
|
|
749
749
|
return result
|
|
750
750
|
|
|
751
751
|
|
|
752
|
+
@emails.register()
|
|
753
|
+
def hash_email_sha256(
|
|
754
|
+
col: Column, salt: str = "", standardize_first: bool = True
|
|
755
|
+
) -> Column:
|
|
756
|
+
"""Hash email with SHA256, with email-specific preprocessing."""
|
|
757
|
+
if standardize_first:
|
|
758
|
+
# Critical: hash the CANONICAL form for deduplication
|
|
759
|
+
email = get_canonical_email(col)
|
|
760
|
+
else:
|
|
761
|
+
email = col
|
|
762
|
+
|
|
763
|
+
# Only hash valid emails
|
|
764
|
+
return F.when(
|
|
765
|
+
is_valid_email(email), F.sha2(F.concat(email, F.lit(salt)), 256)
|
|
766
|
+
).otherwise(F.lit(None))
|
|
767
|
+
|
|
768
|
+
|
|
752
769
|
@emails.register()
|
|
753
770
|
def mask_email(col: Column, mask_char: str = "*", keep_chars: int = 3) -> Column:
|
|
754
771
|
"""
|
|
@@ -922,6 +922,21 @@ def get_region_from_area_code(col: Column) -> Column:
|
|
|
922
922
|
)
|
|
923
923
|
|
|
924
924
|
|
|
925
|
+
@phone_numbers.register()
|
|
926
|
+
def hash_phone_numbers_sha256(col:Column, salt:str="", standardize_first:bool=True) -> Column:
|
|
927
|
+
"""Hash phone number with SHA256, with phone-number-specific preprocessing."""
|
|
928
|
+
if standardize_first:
|
|
929
|
+
phone_number = standardize_phone_numbers_e164(col)
|
|
930
|
+
|
|
931
|
+
else:
|
|
932
|
+
phone_number = col
|
|
933
|
+
|
|
934
|
+
return F.when(
|
|
935
|
+
is_valid_phone_numbers(phone_number),
|
|
936
|
+
F.sha2(F.concat(phone_number, F.lit(salt)), 256)
|
|
937
|
+
).otherwise(F.lit(None))
|
|
938
|
+
|
|
939
|
+
|
|
925
940
|
@phone_numbers.register()
|
|
926
941
|
def mask_phone_numbers(col: Column) -> Column:
|
|
927
942
|
"""
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datacompose
|
|
3
|
+
Version: 0.2.7.0
|
|
4
|
+
Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
|
|
5
|
+
Author: Datacompose Contributors
|
|
6
|
+
Maintainer: Datacompose Contributors
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/tc-cole/datacompose
|
|
9
|
+
Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
|
|
10
|
+
Project-URL: Repository, https://github.com/tc-cole/datacompose.git
|
|
11
|
+
Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
|
|
13
|
+
Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Topic :: Software Development :: Code Generators
|
|
17
|
+
Classifier: Topic :: Database
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
26
|
+
Classifier: Operating System :: OS Independent
|
|
27
|
+
Requires-Python: >=3.8
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Dist: jinja2>=3.0.0
|
|
31
|
+
Requires-Dist: pyyaml>=6.0
|
|
32
|
+
Requires-Dist: click>=8.0.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
38
|
+
Provides-Extra: docs
|
|
39
|
+
Requires-Dist: mkdocs>=1.5.3; extra == "docs"
|
|
40
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
|
|
41
|
+
Requires-Dist: mkdocs-material-extensions>=1.3; extra == "docs"
|
|
42
|
+
Requires-Dist: mkdocs-minify-plugin>=0.7.1; extra == "docs"
|
|
43
|
+
Requires-Dist: mkdocs-redirects>=1.2.1; extra == "docs"
|
|
44
|
+
Requires-Dist: mike>=2.0.0; extra == "docs"
|
|
45
|
+
Requires-Dist: pymdown-extensions>=10.5; extra == "docs"
|
|
46
|
+
Requires-Dist: pygments>=2.17.0; extra == "docs"
|
|
47
|
+
Requires-Dist: mkdocs-git-revision-date-localized-plugin>=1.2.2; extra == "docs"
|
|
48
|
+
Requires-Dist: mkdocs-glightbox>=0.3.5; extra == "docs"
|
|
49
|
+
Dynamic: license-file
|
|
50
|
+
|
|
51
|
+
# DataCompose
|
|
52
|
+
|
|
53
|
+
PySpark transformations you can actually own and modify. No black boxes.
|
|
54
|
+
|
|
55
|
+
## Before vs After
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
# Before: Regex nightmare for addresses
|
|
59
|
+
df = df.withColumn("state_clean",
|
|
60
|
+
F.when(F.col("address").rlike(".*\\b(NY|N\\.Y\\.|New York|NewYork|Newyork)\\b.*"), "NY")
|
|
61
|
+
.when(F.col("address").rlike(".*\\b(CA|Cal\\.|Calif\\.|California)\\b.*"), "CA")
|
|
62
|
+
.when(F.col("address").rlike(".*\\b(IL|Ill\\.|Illinois|Illinios)\\b.*"), "IL")
|
|
63
|
+
.when(F.upper(F.col("address")).contains("NEW YORK"), "NY")
|
|
64
|
+
.when(F.regexp_extract(F.col("address"), ",\\s*([A-Z]{2})\\s+\\d{5}", 1) == "NY", "NY")
|
|
65
|
+
.when(F.regexp_extract(F.col("address"), "\\s+([A-Z]{2})\\s*$", 1) == "NY", "NY")
|
|
66
|
+
# ... handle "N.Y 10001" vs "NY, 10001" vs "New York 10001"
|
|
67
|
+
# ... handle misspellings like "Californai" or "Illnois"
|
|
68
|
+
# ... 50 more states × 10 variations each
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
# After: One line
|
|
72
|
+
from builders.transformers.addresses import addresses
|
|
73
|
+
df = df.withColumn("state", addresses.standardize_state(F.col("address")))
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install datacompose
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## How It Works
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Copy transformers into YOUR repo
|
|
86
|
+
datacompose add phones
|
|
87
|
+
datacompose add addresses
|
|
88
|
+
datacompose add emails
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
# Use them like any Python module - this is your code now
|
|
93
|
+
from transformers.pyspark.addresses import addresses
|
|
94
|
+
|
|
95
|
+
df = (df
|
|
96
|
+
.withColumn("street_number", addresses.extract_street_number(F.col("address")))
|
|
97
|
+
.withColumn("street_name", addresses.extract_street_name(F.col("address")))
|
|
98
|
+
.withColumn("city", addresses.extract_city(F.col("address")))
|
|
99
|
+
.withColumn("state", addresses.standardize_state(F.col("address")))
|
|
100
|
+
.withColumn("zip", addresses.extract_zip_code(F.col("address")))
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Result:
|
|
104
|
+
+----------------------------------------+-------------+------------+-----------+-----+-------+
|
|
105
|
+
|address |street_number|street_name |city |state|zip |
|
|
106
|
+
+----------------------------------------+-------------+------------+-----------+-----+-------+
|
|
107
|
+
|123 Main St, New York, NY 10001 |123 |Main |New York |NY |10001 |
|
|
108
|
+
|456 Oak Ave Apt 5B, Los Angeles, CA 90001|456 |Oak |Los Angeles|CA |90001 |
|
|
109
|
+
|789 Pine Blvd, Chicago, IL 60601 |789 |Pine |Chicago |IL |60601 |
|
|
110
|
+
+----------------------------------------+-------------+------------+-----------+-----+-------+
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
The code lives in your repo. Modify it. Delete what you don't need. No external dependencies.
|
|
114
|
+
|
|
115
|
+
## Why Copy-to-Own?
|
|
116
|
+
|
|
117
|
+
- **Your data is weird** - Phone numbers with "ask for Bob"? We can't predict that. You can fix it.
|
|
118
|
+
- **No breaking changes** - Library updates can't break your pipeline at 2 AM
|
|
119
|
+
- **Actually debuggable** - Stack traces point to YOUR code, not site-packages
|
|
120
|
+
- **No dependency hell** - It's just PySpark. If Spark runs, this runs.
|
|
121
|
+
|
|
122
|
+
## Available Transformers
|
|
123
|
+
|
|
124
|
+
**Phones** - Standardize formats, extract from text, validate, handle extensions
|
|
125
|
+
**Addresses** - Parse components, standardize states, validate zips, detect PO boxes
|
|
126
|
+
**Emails** - Validate, extract domains, fix typos (gmial→gmail), standardize
|
|
127
|
+
|
|
128
|
+
More coming based on what you need.
|
|
129
|
+
|
|
130
|
+
## Real Example
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
# Messy customer data
|
|
134
|
+
df = spark.createDataFrame([
|
|
135
|
+
("(555) 123-4567 ext 89", "john.doe@gmial.com", "123 Main St Apt 4B"),
|
|
136
|
+
("555.987.6543", "JANE@COMPANY.COM", "456 Oak Ave, NY, NY 10001")
|
|
137
|
+
])
|
|
138
|
+
|
|
139
|
+
# Clean it
|
|
140
|
+
clean_df = (df
|
|
141
|
+
.withColumn("phone", phones.standardize_phone(F.col("phone")))
|
|
142
|
+
.withColumn("email", emails.fix_common_typos(F.col("email")))
|
|
143
|
+
.withColumn("street", addresses.extract_street_address(F.col("address")))
|
|
144
|
+
)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## The Philosophy
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
█████████████ 60% - Already clean
|
|
151
|
+
████████ 30% - Common patterns (formatting, typos)
|
|
152
|
+
██ 8% - Edge cases (weird but fixable)
|
|
153
|
+
▌ 2% - Complete chaos (that's what interns are for)
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
We handle the 38% with patterns. You handle the 2% chaos.
|
|
157
|
+
|
|
158
|
+
## Documentation
|
|
159
|
+
|
|
160
|
+
Full docs at [datacompose.io](https://datacompose.io)
|
|
161
|
+
|
|
162
|
+
## Key Features
|
|
163
|
+
|
|
164
|
+
- **Zero dependencies** - Just PySpark code that runs anywhere Spark runs
|
|
165
|
+
- **Fully modifiable** - It's in your repo. Change whatever you need
|
|
166
|
+
- **Battle-tested patterns** - Built from real production data cleaning challenges
|
|
167
|
+
- **Composable functions** - Chain simple operations into complex pipelines
|
|
168
|
+
- **No breaking changes** - You control when and how to update
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
MIT - It's your code now.
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
*Inspired by [shadcn/ui](https://ui.shadcn.com/) and [Svelte](https://svelte.dev/)'s approach to components - copy, don't install.*
|
|
@@ -1,31 +1,31 @@
|
|
|
1
1
|
datacompose/__init__.py,sha256=kbQEzEvheAMsm3-MT4nWSuX42fjrfgDWoR8WZmC_WX4,34
|
|
2
|
-
datacompose/cli/__init__.py,sha256=
|
|
2
|
+
datacompose/cli/__init__.py,sha256=DdkpcOpc9wicaH39CPFZKf2JX1BSEKxpxbgd6pum67g,109
|
|
3
3
|
datacompose/cli/colors.py,sha256=Ax7jHhdAIuq5x3663gJ7_MzFCBOJv38DqNXts5t4SLs,1756
|
|
4
|
-
datacompose/cli/config.py,sha256=
|
|
5
|
-
datacompose/cli/main.py,sha256=
|
|
4
|
+
datacompose/cli/config.py,sha256=xzZ2uP43_gAWSHxjBgJxWR-nUYfPviHJdJCzINtevbo,2413
|
|
5
|
+
datacompose/cli/main.py,sha256=dx_woCHc9_nYygKZ009-CN2tLoP1oyQPfPym-wKQTS4,1345
|
|
6
6
|
datacompose/cli/validation.py,sha256=8WMZ9wtPgFk9eBgMS_wtkncFz_-BmH4E8V57tjp3YoI,2526
|
|
7
7
|
datacompose/cli/commands/__init__.py,sha256=Bu58UsnkGRbVFS92U2Px_KxlUPrdlbSY6wlvP6tet2o,38
|
|
8
|
-
datacompose/cli/commands/add.py,sha256=
|
|
8
|
+
datacompose/cli/commands/add.py,sha256=0ICXRQqVrdczPWok48gHRPXam15GQdgGQvdgC_lJpuI,8160
|
|
9
9
|
datacompose/cli/commands/init.py,sha256=44QZEyqjAXOahfbFNBZAnEq2JCzT6UDB0GqC-fmyOko,18003
|
|
10
10
|
datacompose/cli/commands/list.py,sha256=mXihUMrnwLUoIG-FpNb8-XJ0VZfh0v3exHq1m_Mrprg,3855
|
|
11
11
|
datacompose/generators/__init__.py,sha256=dFJWJScu8mkP0ZKIQtVlJ36PQW-LwCYBijuNwLSevZw,48
|
|
12
12
|
datacompose/generators/base.py,sha256=EgpHwaaSxAP1Ygq5Wtyq4ez-wG0oPwDEbiKgLsEilD0,6761
|
|
13
13
|
datacompose/generators/pyspark/__init__.py,sha256=ayoKDGtbt2KwFcNt2QxHKt8z83Kzy4ySw9Gg7j9ZMTY,33
|
|
14
14
|
datacompose/generators/pyspark/generator.py,sha256=be4GATA8rmLAg4_wZ3Ox3vC3up_OXMOajjIUJQrDQ10,1735
|
|
15
|
-
datacompose/operators/__init__.py,sha256=
|
|
15
|
+
datacompose/operators/__init__.py,sha256=AREDlwEsqlK5ZQsQpwsNIF_He-Wlpct9-Wp6wBW4QQU,588
|
|
16
16
|
datacompose/operators/primitives.py,sha256=FxhtgP7aizKsnNBgh5oTqwc9m8QSjLTpRoG5zu6rFns,23615
|
|
17
17
|
datacompose/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
datacompose/transformers/discovery.py,sha256=JgMtmd8PkhmwLqS17NeKSv9MRneY9tdOsOK32A6luOQ,7048
|
|
19
19
|
datacompose/transformers/text/__init__.py,sha256=Mq0UmgYlBV8T18IkvHAS1TImEzWyGciCqxaCv324hFQ,36
|
|
20
20
|
datacompose/transformers/text/addresses/__init__.py,sha256=l5TItGrGBn69Mlq0CaRGJa-SwpyuUEYWvG5N26s3Pco,39
|
|
21
|
-
datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py,sha256=
|
|
21
|
+
datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py,sha256=AVneGoXX5TtHGfwOL2qD645YqiIBiV46_zEUnAhSwoE,62429
|
|
22
22
|
datacompose/transformers/text/emails/__init__.py,sha256=snZLOJsxrPDOi8gIISlRxc6YlskKxUyu0NnOZCE5cIU,34
|
|
23
|
-
datacompose/transformers/text/emails/pyspark/pyspark_primitives.py,sha256=
|
|
23
|
+
datacompose/transformers/text/emails/pyspark/pyspark_primitives.py,sha256=tWdKjG6daAXMLRp8BnbMfNqbh7zUci-XUjaep6wTbNk,24552
|
|
24
24
|
datacompose/transformers/text/phone_numbers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py,sha256=
|
|
26
|
-
datacompose-0.2.
|
|
27
|
-
datacompose-0.2.
|
|
28
|
-
datacompose-0.2.
|
|
29
|
-
datacompose-0.2.
|
|
30
|
-
datacompose-0.2.
|
|
31
|
-
datacompose-0.2.
|
|
25
|
+
datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py,sha256=mCIq1jHts2tFs4wx8SNvws53rZ38HI1T4ynbu60w7E8,29371
|
|
26
|
+
datacompose-0.2.7.0.dist-info/licenses/LICENSE,sha256=SCPOqmPhMikiyYDlKZ877fGHaE2O45cDBoJIomrlpDU,1067
|
|
27
|
+
datacompose-0.2.7.0.dist-info/METADATA,sha256=ecGxOlDfjHvin7olozaB_MjecgWXIeoNmiQ3xhHaxx8,6898
|
|
28
|
+
datacompose-0.2.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
29
|
+
datacompose-0.2.7.0.dist-info/entry_points.txt,sha256=oeG9oGgDwajk4v0C1awdUTBx2GmhLpuNHCTAV-jurUc,58
|
|
30
|
+
datacompose-0.2.7.0.dist-info/top_level.txt,sha256=AX1qGkuJMD2YJLZKo40h-w4MeFxDZL6W1vbKKuTpW8I,12
|
|
31
|
+
datacompose-0.2.7.0.dist-info/RECORD,,
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: datacompose
|
|
3
|
-
Version: 0.2.6.1
|
|
4
|
-
Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
|
|
5
|
-
Author: Datacompose Contributors
|
|
6
|
-
Maintainer: Datacompose Contributors
|
|
7
|
-
License: MIT
|
|
8
|
-
Project-URL: Homepage, https://github.com/tc-cole/datacompose
|
|
9
|
-
Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
|
|
10
|
-
Project-URL: Repository, https://github.com/tc-cole/datacompose.git
|
|
11
|
-
Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
|
|
12
|
-
Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
|
|
13
|
-
Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
|
|
14
|
-
Classifier: Development Status :: 4 - Beta
|
|
15
|
-
Classifier: Intended Audience :: Developers
|
|
16
|
-
Classifier: Topic :: Software Development :: Code Generators
|
|
17
|
-
Classifier: Topic :: Database
|
|
18
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
20
|
-
Classifier: Programming Language :: Python :: 3
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
22
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
23
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
24
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
25
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
26
|
-
Classifier: Operating System :: OS Independent
|
|
27
|
-
Requires-Python: >=3.8
|
|
28
|
-
Description-Content-Type: text/markdown
|
|
29
|
-
License-File: LICENSE
|
|
30
|
-
Requires-Dist: jinja2>=3.0.0
|
|
31
|
-
Requires-Dist: pyyaml>=6.0
|
|
32
|
-
Requires-Dist: click>=8.0.0
|
|
33
|
-
Provides-Extra: dev
|
|
34
|
-
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
35
|
-
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
36
|
-
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
37
|
-
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
38
|
-
Provides-Extra: docs
|
|
39
|
-
Requires-Dist: mkdocs>=1.5.3; extra == "docs"
|
|
40
|
-
Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
|
|
41
|
-
Requires-Dist: mkdocs-material-extensions>=1.3; extra == "docs"
|
|
42
|
-
Requires-Dist: mkdocs-minify-plugin>=0.7.1; extra == "docs"
|
|
43
|
-
Requires-Dist: mkdocs-redirects>=1.2.1; extra == "docs"
|
|
44
|
-
Requires-Dist: mike>=2.0.0; extra == "docs"
|
|
45
|
-
Requires-Dist: pymdown-extensions>=10.5; extra == "docs"
|
|
46
|
-
Requires-Dist: pygments>=2.17.0; extra == "docs"
|
|
47
|
-
Requires-Dist: mkdocs-git-revision-date-localized-plugin>=1.2.2; extra == "docs"
|
|
48
|
-
Requires-Dist: mkdocs-glightbox>=0.3.5; extra == "docs"
|
|
49
|
-
Dynamic: license-file
|
|
50
|
-
|
|
51
|
-
# Datacompose
|
|
52
|
-
|
|
53
|
-
[](https://pypi.org/project/datacompose/)
|
|
54
|
-
[](https://www.python.org/downloads/)
|
|
55
|
-
[](https://github.com/your-username/datacompose)
|
|
56
|
-
[](https://opensource.org/licenses/MIT)
|
|
57
|
-
|
|
58
|
-
A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
|
|
59
|
-
|
|
60
|
-
## Installation
|
|
61
|
-
|
|
62
|
-
```bash
|
|
63
|
-
pip install datacompose
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
## What is Datacompose?
|
|
67
|
-
|
|
68
|
-
Datacompose provides production-ready PySpark data transformation primitives that become part of YOUR codebase. Inspired by [shadcn](https://ui.shadcn.com/)'s approach to components, we believe in giving you full ownership and control over your code.
|
|
69
|
-
|
|
70
|
-
### Key Features
|
|
71
|
-
|
|
72
|
-
- **No Runtime Dependencies**: Standalone PySpark code that runs without Datacompose
|
|
73
|
-
- **Composable Primitives**: Build complex transformations from simple, reusable functions
|
|
74
|
-
- **Smart Partial Application**: Pre-configure transformations with parameters for reuse
|
|
75
|
-
- **Optimized Operations**: Efficient Spark transformations with minimal overhead
|
|
76
|
-
- **Comprehensive Libraries**: Pre-built primitives for emails, addresses, and phone numbers
|
|
77
|
-
|
|
78
|
-
### Available Transformers
|
|
79
|
-
|
|
80
|
-
- **Emails**: Validation, extraction, standardization, typo correction
|
|
81
|
-
- **Addresses**: Street parsing, state/zip validation, PO Box detection
|
|
82
|
-
- **Phone Numbers**: NANP/international validation, formatting, toll-free detection
|
|
83
|
-
|
|
84
|
-
## Documentation
|
|
85
|
-
|
|
86
|
-
For detailed documentation, examples, and API reference, visit [datacompose.io](https://datacompose.io).
|
|
87
|
-
|
|
88
|
-
## Philosophy
|
|
89
|
-
|
|
90
|
-
This is NOT a traditional library - it gives you production-ready data transformation primitives that you can modify to fit your exact needs. You own the code, with no external dependencies to manage or worry about breaking changes.
|
|
91
|
-
|
|
92
|
-
## License
|
|
93
|
-
|
|
94
|
-
MIT License - see LICENSE file for details
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|