datacompose 0.2.4__py3-none-any.whl → 0.2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datacompose has been flagged as possibly problematic; consult the package registry's advisory page for details.
- datacompose/cli/commands/add.py +12 -27
- datacompose/cli/commands/init.py +2 -2
- datacompose/generators/base.py +23 -36
- datacompose/generators/pyspark/generator.py +7 -7
- {datacompose-0.2.4.dist-info → datacompose-0.2.4.1.dist-info}/METADATA +24 -6
- {datacompose-0.2.4.dist-info → datacompose-0.2.4.1.dist-info}/RECORD +10 -11
- datacompose/cli/commands/upgrade.py +0 -7
- {datacompose-0.2.4.dist-info → datacompose-0.2.4.1.dist-info}/WHEEL +0 -0
- {datacompose-0.2.4.dist-info → datacompose-0.2.4.1.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.4.dist-info → datacompose-0.2.4.1.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.4.dist-info → datacompose-0.2.4.1.dist-info}/top_level.txt +0 -0
datacompose/cli/commands/add.py
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
Add command for generating UDFs.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import json
|
|
6
5
|
from pathlib import Path
|
|
7
6
|
|
|
8
7
|
import click
|
|
@@ -155,21 +154,18 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
155
154
|
print(info(f"Available generators: {', '.join(discovery.list_generators())}"))
|
|
156
155
|
return 1
|
|
157
156
|
|
|
158
|
-
# Determine output directory
|
|
159
|
-
# Extract platform from target (e.g., "pyspark.pandas_udf" -> "pyspark")
|
|
160
|
-
platform = target.split(".")[0]
|
|
161
|
-
|
|
157
|
+
# Determine output directory - no platform subdirectory needed
|
|
162
158
|
if not output:
|
|
163
|
-
output_dir = f"build/{
|
|
159
|
+
output_dir = f"build/{transformer_name}"
|
|
164
160
|
else:
|
|
165
|
-
output_dir = f"{output}/{
|
|
166
|
-
|
|
167
|
-
# Create generator instance
|
|
168
|
-
generator = generator_class(
|
|
169
|
-
template_dir=Path(template_dir), output_dir=Path(output_dir), verbose=verbose
|
|
170
|
-
)
|
|
161
|
+
output_dir = f"{output}/{transformer_name}"
|
|
171
162
|
|
|
172
163
|
try:
|
|
164
|
+
# Create generator instance
|
|
165
|
+
generator = generator_class(
|
|
166
|
+
template_dir=Path(template_dir), output_dir=Path(output_dir), verbose=verbose
|
|
167
|
+
)
|
|
168
|
+
|
|
173
169
|
# Generate the UDF
|
|
174
170
|
result = generator.generate(
|
|
175
171
|
transformer_name, force=False, transformer_dir=transformer_dir
|
|
@@ -182,13 +178,15 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
182
178
|
print(dim(f" Hash: {result.get('hash', 'N/A')}"))
|
|
183
179
|
else:
|
|
184
180
|
print(success(f"✓ UDF generated: {result['output_path']}"))
|
|
185
|
-
|
|
181
|
+
if result.get('test_path'):
|
|
182
|
+
print(success(f"✓ Test created: {result['test_path']}"))
|
|
186
183
|
print(highlight(f"Function name: {result['function_name']}"))
|
|
187
184
|
if verbose:
|
|
188
185
|
print(dim(f" Target: {target}"))
|
|
189
186
|
print(highlight("\nGenerated package contents:"))
|
|
190
187
|
print(f" - UDF code: {result['output_path']}")
|
|
191
|
-
|
|
188
|
+
if result.get('test_path'):
|
|
189
|
+
print(f" - Test file: {result['test_path']}")
|
|
192
190
|
|
|
193
191
|
return 0
|
|
194
192
|
|
|
@@ -200,16 +198,3 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
200
198
|
traceback.print_exc()
|
|
201
199
|
return 1
|
|
202
200
|
|
|
203
|
-
|
|
204
|
-
def _load_config() -> dict:
|
|
205
|
-
"""Load datacompose.json configuration if it exists."""
|
|
206
|
-
config_path = Path("datacompose.json")
|
|
207
|
-
if config_path.exists():
|
|
208
|
-
try:
|
|
209
|
-
with open(config_path, "r") as f:
|
|
210
|
-
return json.load(f)
|
|
211
|
-
except Exception:
|
|
212
|
-
pass
|
|
213
|
-
return {}
|
|
214
|
-
|
|
215
|
-
|
datacompose/cli/commands/init.py
CHANGED
|
@@ -21,7 +21,7 @@ DEFAULT_CONFIG = {
|
|
|
21
21
|
"aliases": {"utils": "./src/utils"},
|
|
22
22
|
"targets": {
|
|
23
23
|
"pyspark": {
|
|
24
|
-
"output": "./build
|
|
24
|
+
"output": "./build",
|
|
25
25
|
}
|
|
26
26
|
},
|
|
27
27
|
}
|
|
@@ -57,7 +57,7 @@ class InitCommand:
|
|
|
57
57
|
def get_config_template(template_name: str) -> Dict[str, Any]:
|
|
58
58
|
"""Get configuration template by name."""
|
|
59
59
|
if template_name == "minimal":
|
|
60
|
-
return {"version": "1.0", "targets": {"pyspark": {"output": "./build
|
|
60
|
+
return {"version": "1.0", "targets": {"pyspark": {"output": "./build"}}}
|
|
61
61
|
elif template_name == "advanced":
|
|
62
62
|
config = DEFAULT_CONFIG.copy()
|
|
63
63
|
config.update(
|
datacompose/generators/base.py
CHANGED
|
@@ -8,7 +8,6 @@ def __get_output_filename as well as any other build steps that you want.
|
|
|
8
8
|
|
|
9
9
|
import hashlib
|
|
10
10
|
from abc import ABC, abstractmethod
|
|
11
|
-
from datetime import datetime
|
|
12
11
|
from pathlib import Path
|
|
13
12
|
from typing import Any, Dict, Optional
|
|
14
13
|
|
|
@@ -45,16 +44,11 @@ class BaseGenerator(ABC):
|
|
|
45
44
|
Dictionary with generation results
|
|
46
45
|
"""
|
|
47
46
|
# Create a minimal spec-like dict from transformer name for compatibility
|
|
48
|
-
|
|
47
|
+
transformer = {"name": transformer_name}
|
|
49
48
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
# Calculate hash for caching
|
|
54
|
-
spec_hash = self._calculate_hash(spec, template_content)
|
|
55
|
-
|
|
56
|
-
# Determine output path
|
|
57
|
-
output_file = self._get_output_filename(spec["name"])
|
|
49
|
+
file_content: str = self._get_primitives_file(transformer_dir)
|
|
50
|
+
spec_hash = self._calculate_hash(transformer, file_content)
|
|
51
|
+
output_file = self._get_output_filename(transformer["name"])
|
|
58
52
|
output_path = self.output_dir / output_file
|
|
59
53
|
|
|
60
54
|
# Check if regeneration is needed
|
|
@@ -63,18 +57,18 @@ class BaseGenerator(ABC):
|
|
|
63
57
|
"skipped": True,
|
|
64
58
|
"output_path": str(output_path),
|
|
65
59
|
"hash": spec_hash,
|
|
66
|
-
"function_name": f"{
|
|
60
|
+
"function_name": f"{transformer['name']}_udf",
|
|
67
61
|
}
|
|
68
62
|
|
|
69
63
|
# Copy utils/primitives.py to the output directory
|
|
70
64
|
self._copy_utils_files(output_path)
|
|
71
|
-
|
|
65
|
+
self._write_output(output_path, file_content)
|
|
72
66
|
|
|
73
67
|
return {
|
|
74
68
|
"skipped": False,
|
|
75
69
|
"output_path": str(output_path),
|
|
76
70
|
"hash": spec_hash,
|
|
77
|
-
"function_name": f"{
|
|
71
|
+
"function_name": f"{transformer['name']}_udf",
|
|
78
72
|
}
|
|
79
73
|
|
|
80
74
|
@staticmethod
|
|
@@ -82,6 +76,7 @@ class BaseGenerator(ABC):
|
|
|
82
76
|
"""Calculate hash for cache invalidation."""
|
|
83
77
|
content = str(spec) + template_content
|
|
84
78
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
|
|
79
|
+
|
|
85
80
|
|
|
86
81
|
@staticmethod
|
|
87
82
|
def _should_skip_generation(output_path: Path, spec_hash: str) -> bool:
|
|
@@ -100,8 +95,6 @@ class BaseGenerator(ABC):
|
|
|
100
95
|
"""Write generated content to output file."""
|
|
101
96
|
# Create output directory if it doesn't exist
|
|
102
97
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
103
|
-
|
|
104
|
-
# Create __init__.py files to make directories importable as Python packages
|
|
105
98
|
self._ensure_init_files(output_path)
|
|
106
99
|
|
|
107
100
|
with open(output_path, "w") as f:
|
|
@@ -138,26 +131,20 @@ class BaseGenerator(ABC):
|
|
|
138
131
|
if self.verbose:
|
|
139
132
|
print(f"Created {init_file}")
|
|
140
133
|
|
|
141
|
-
@staticmethod
|
|
142
|
-
def _prepare_template_vars(spec: Dict[str, Any], spec_hash: str) -> Dict[str, Any]:
|
|
143
|
-
"""Prepare variables for template rendering."""
|
|
144
|
-
return {
|
|
145
|
-
"transformer_name": spec["name"],
|
|
146
|
-
"udf_name": f"{spec['name']}_udf",
|
|
147
|
-
"hash": spec_hash,
|
|
148
|
-
"generation_timestamp": datetime.now().isoformat(),
|
|
149
|
-
"typo_map": spec.get("typo_map", {}),
|
|
150
|
-
"regex_patterns": spec.get("regex", {}),
|
|
151
|
-
"flags": spec.get("flags", {}),
|
|
152
|
-
"options": spec.get("options", {}),
|
|
153
|
-
"custom_rules": spec.get("custom_rules", {}),
|
|
154
|
-
}
|
|
155
|
-
|
|
156
134
|
|
|
157
135
|
def _copy_utils_files(self, output_path: Path):
|
|
158
|
-
"""Copy utility files like primitives.py to the
|
|
159
|
-
#
|
|
160
|
-
|
|
136
|
+
"""Copy utility files like primitives.py to the build root directory."""
|
|
137
|
+
# Find the build directory root
|
|
138
|
+
path_parts = output_path.parts
|
|
139
|
+
try:
|
|
140
|
+
build_index = path_parts.index("build")
|
|
141
|
+
build_root = Path(*path_parts[:build_index + 1])
|
|
142
|
+
except ValueError:
|
|
143
|
+
# Fallback to parent directory if no 'build' in path
|
|
144
|
+
build_root = output_path.parent.parent
|
|
145
|
+
|
|
146
|
+
# Create utils directory at build root
|
|
147
|
+
utils_dir = build_root / "utils"
|
|
161
148
|
utils_dir.mkdir(parents=True, exist_ok=True)
|
|
162
149
|
|
|
163
150
|
# Create __init__.py in utils directory
|
|
@@ -179,12 +166,12 @@ class BaseGenerator(ABC):
|
|
|
179
166
|
|
|
180
167
|
@classmethod
|
|
181
168
|
@abstractmethod
|
|
182
|
-
def
|
|
169
|
+
def _get_primitives_location(cls, transformer_dir: Path | None) -> Path | None:
|
|
183
170
|
pass
|
|
184
171
|
|
|
185
172
|
@abstractmethod
|
|
186
|
-
def
|
|
187
|
-
"""Get the
|
|
173
|
+
def _get_primitives_file(self, transformer_dir: Path | None) -> str:
|
|
174
|
+
"""Get the file content for this generator."""
|
|
188
175
|
pass
|
|
189
176
|
|
|
190
177
|
@abstractmethod
|
|
@@ -11,30 +11,30 @@ class SparkPandasUDFGenerator(BaseGenerator):
|
|
|
11
11
|
"""Generator for Apache Spark pandas UDFs."""
|
|
12
12
|
|
|
13
13
|
ENGINE_SUBDIRECTORY = "pyspark"
|
|
14
|
-
|
|
14
|
+
PRIMITIVES_FILENAME = "pyspark_primitives.py"
|
|
15
15
|
|
|
16
16
|
@classmethod
|
|
17
|
-
def
|
|
17
|
+
def _get_primitives_location(cls, transformer_dir: Path | None) -> Path | None:
|
|
18
18
|
if transformer_dir is None:
|
|
19
19
|
return None
|
|
20
|
-
return transformer_dir / cls.ENGINE_SUBDIRECTORY / cls.
|
|
20
|
+
return transformer_dir / cls.ENGINE_SUBDIRECTORY / cls.PRIMITIVES_FILENAME
|
|
21
21
|
|
|
22
|
-
def
|
|
22
|
+
def _get_primitives_file(self, transformer_dir: Path | None = None) -> str:
|
|
23
23
|
"""Get the template content for Spark pandas UDFs."""
|
|
24
24
|
if transformer_dir:
|
|
25
25
|
# Look for transformer-specific template first
|
|
26
|
-
transformer_template = self.
|
|
26
|
+
transformer_template = self._get_primitives_location(transformer_dir)
|
|
27
27
|
if transformer_template and transformer_template.exists():
|
|
28
28
|
return transformer_template.read_text()
|
|
29
29
|
|
|
30
30
|
# Fallback to generator-specific template (if it exists)
|
|
31
|
-
generator_template = Path(__file__).parent / self.
|
|
31
|
+
generator_template = Path(__file__).parent / self.PRIMITIVES_FILENAME
|
|
32
32
|
if generator_template.exists():
|
|
33
33
|
return generator_template.read_text()
|
|
34
34
|
|
|
35
35
|
# If no templates found, raise error
|
|
36
36
|
raise FileNotFoundError(
|
|
37
|
-
f"No {self.
|
|
37
|
+
f"No {self.PRIMITIVES_FILENAME} template found in {transformer_dir} or {Path(__file__).parent}"
|
|
38
38
|
)
|
|
39
39
|
|
|
40
40
|
def _get_output_filename(self, transformer_name: str) -> str:
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datacompose
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.4.1
|
|
4
4
|
Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
|
|
5
5
|
Author: Datacompose Contributors
|
|
6
6
|
Maintainer: Datacompose Contributors
|
|
7
7
|
License: MIT
|
|
8
|
-
Project-URL: Homepage, https://github.com/
|
|
9
|
-
Project-URL: Documentation, https://github.com/
|
|
10
|
-
Project-URL: Repository, https://github.com/
|
|
11
|
-
Project-URL: Issues, https://github.com/
|
|
12
|
-
Project-URL: Changelog, https://github.com/
|
|
8
|
+
Project-URL: Homepage, https://github.com/tc-cole/datacompose
|
|
9
|
+
Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
|
|
10
|
+
Project-URL: Repository, https://github.com/tc-cole/datacompose.git
|
|
11
|
+
Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
|
|
13
13
|
Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
|
|
14
14
|
Classifier: Development Status :: 4 - Beta
|
|
15
15
|
Classifier: Intended Audience :: Developers
|
|
@@ -47,6 +47,11 @@ Dynamic: license-file
|
|
|
47
47
|
|
|
48
48
|
# Datacompose
|
|
49
49
|
|
|
50
|
+
[](https://pypi.org/project/datacompose/)
|
|
51
|
+
[](https://www.python.org/downloads/)
|
|
52
|
+
[](https://github.com/your-username/datacompose)
|
|
53
|
+
[](https://opensource.org/licenses/MIT)
|
|
54
|
+
|
|
50
55
|
A powerful data transformation framework for building reusable, composable data cleaning pipelines in PySpark.
|
|
51
56
|
|
|
52
57
|
## Overview
|
|
@@ -426,6 +431,19 @@ Datacompose is inspired by [shadcn-svelte](https://www.shadcn-svelte.com/) and [
|
|
|
426
431
|
This is NOT a traditional library - it's a code generator that gives you production-ready data transformation primitives that you can modify to fit your exact needs.
|
|
427
432
|
|
|
428
433
|
|
|
434
|
+
## Test Coverage
|
|
435
|
+
|
|
436
|
+
**Critical components are thoroughly tested:**
|
|
437
|
+
|
|
438
|
+
| Component | Coverage | Tests |
|
|
439
|
+
|-----------|----------|-------|
|
|
440
|
+
| **Phone Number Primitives** | 95% | ✅ All formats validated |
|
|
441
|
+
| **Address Primitives** | 94% | ✅ Full parsing tested |
|
|
442
|
+
| **Email Primitives** | 89% | ✅ RFC compliant |
|
|
443
|
+
| **Code Generation** | 87-91% | ✅ All targets verified |
|
|
444
|
+
|
|
445
|
+
**335 tests passing** • **76% overall coverage**
|
|
446
|
+
|
|
429
447
|
## License
|
|
430
448
|
|
|
431
449
|
MIT License - see LICENSE file for details
|
|
@@ -4,14 +4,13 @@ datacompose/cli/colors.py,sha256=Ax7jHhdAIuq5x3663gJ7_MzFCBOJv38DqNXts5t4SLs,175
|
|
|
4
4
|
datacompose/cli/main.py,sha256=NjA6Uy1_A-xGaAEKKdXOrtMbAxOZ9Cn1aNDNYgHW9rg,1273
|
|
5
5
|
datacompose/cli/validation.py,sha256=8WMZ9wtPgFk9eBgMS_wtkncFz_-BmH4E8V57tjp3YoI,2526
|
|
6
6
|
datacompose/cli/commands/__init__.py,sha256=Bu58UsnkGRbVFS92U2Px_KxlUPrdlbSY6wlvP6tet2o,38
|
|
7
|
-
datacompose/cli/commands/add.py,sha256=
|
|
8
|
-
datacompose/cli/commands/init.py,sha256=
|
|
7
|
+
datacompose/cli/commands/add.py,sha256=Gk38dMHSeOHwtdG3ZZNQ5Zx2qe6rw6kFW2qE0aJLNN8,6710
|
|
8
|
+
datacompose/cli/commands/init.py,sha256=XEgxlXJn6JnkfqYFIJh_pqeUEAvosTTaJqisT67vhQI,16724
|
|
9
9
|
datacompose/cli/commands/list.py,sha256=MmRxMnghBLagg6IEh4lqCK0WR-0Ku-jxH8AT6WlajuU,3867
|
|
10
|
-
datacompose/cli/commands/upgrade.py,sha256=F0ra-HLVCP5MEdYOkKbvZ_cnhXFmKKw6IRBhmuWBGVI,163
|
|
11
10
|
datacompose/generators/__init__.py,sha256=dFJWJScu8mkP0ZKIQtVlJ36PQW-LwCYBijuNwLSevZw,48
|
|
12
|
-
datacompose/generators/base.py,sha256=
|
|
11
|
+
datacompose/generators/base.py,sha256=y0ATC8semn8KbZ_8P_aQvuvAmAQ-u-orN8aoWYdUpTc,6569
|
|
13
12
|
datacompose/generators/pyspark/__init__.py,sha256=ayoKDGtbt2KwFcNt2QxHKt8z83Kzy4ySw9Gg7j9ZMTY,33
|
|
14
|
-
datacompose/generators/pyspark/generator.py,sha256=
|
|
13
|
+
datacompose/generators/pyspark/generator.py,sha256=_dVCEmxcJoaTp5xfgaXPSmxaeC0CuhZjpDB4AZOjaH0,1998
|
|
15
14
|
datacompose/operators/__init__.py,sha256=6g7Hp5261TkPghRgTfxKrizx0OH3Zga3OKHZ37I9_4E,586
|
|
16
15
|
datacompose/operators/primitives.py,sha256=rIERyKfPIULngHs9fRewXo6VjmbjyiOXvTCqiHGIur8,22022
|
|
17
16
|
datacompose/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -23,9 +22,9 @@ datacompose/transformers/text/clean_emails/__init__.py,sha256=snZLOJsxrPDOi8gIIS
|
|
|
23
22
|
datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py,sha256=vIgPAcc6t8UCYSzvi79UplkTFqY9jIR9brZyhAhtLwY,21802
|
|
24
23
|
datacompose/transformers/text/clean_phone_numbers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
24
|
datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py,sha256=BsCfgncFxM77M3k0hEZyARPJk4kq1PQZB40YRc9RR8M,26279
|
|
26
|
-
datacompose-0.2.4.dist-info/licenses/LICENSE,sha256=SCPOqmPhMikiyYDlKZ877fGHaE2O45cDBoJIomrlpDU,1067
|
|
27
|
-
datacompose-0.2.4.dist-info/METADATA,sha256=
|
|
28
|
-
datacompose-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
29
|
-
datacompose-0.2.4.dist-info/entry_points.txt,sha256=oeG9oGgDwajk4v0C1awdUTBx2GmhLpuNHCTAV-jurUc,58
|
|
30
|
-
datacompose-0.2.4.dist-info/top_level.txt,sha256=AX1qGkuJMD2YJLZKo40h-w4MeFxDZL6W1vbKKuTpW8I,12
|
|
31
|
-
datacompose-0.2.4.dist-info/RECORD,,
|
|
25
|
+
datacompose-0.2.4.1.dist-info/licenses/LICENSE,sha256=SCPOqmPhMikiyYDlKZ877fGHaE2O45cDBoJIomrlpDU,1067
|
|
26
|
+
datacompose-0.2.4.1.dist-info/METADATA,sha256=EVPvk2ik_kKdMd9PmAxWhAPY-XHEoJPp56kLaEN9qX4,12711
|
|
27
|
+
datacompose-0.2.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
28
|
+
datacompose-0.2.4.1.dist-info/entry_points.txt,sha256=oeG9oGgDwajk4v0C1awdUTBx2GmhLpuNHCTAV-jurUc,58
|
|
29
|
+
datacompose-0.2.4.1.dist-info/top_level.txt,sha256=AX1qGkuJMD2YJLZKo40h-w4MeFxDZL6W1vbKKuTpW8I,12
|
|
30
|
+
datacompose-0.2.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|