datacompose 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic; see the release notes for more details.
- datacompose/__init__.py +1 -0
- datacompose/cli/__init__.py +5 -0
- datacompose/cli/colors.py +80 -0
- datacompose/cli/commands/__init__.py +3 -0
- datacompose/cli/commands/add.py +215 -0
- datacompose/cli/commands/init.py +451 -0
- datacompose/cli/commands/list.py +118 -0
- datacompose/cli/commands/upgrade.py +7 -0
- datacompose/cli/main.py +59 -0
- datacompose/cli/validation.py +72 -0
- datacompose/generators/__init__.py +3 -0
- datacompose/generators/base.py +193 -0
- datacompose/generators/pyspark/__init__.py +1 -0
- datacompose/generators/pyspark/generator.py +51 -0
- datacompose/operators/__init__.py +21 -0
- datacompose/operators/primitives.py +595 -0
- datacompose/transformers/__init__.py +0 -0
- datacompose/transformers/discovery.py +186 -0
- datacompose/transformers/text/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
- datacompose/transformers/text/clean_emails/__init__.py +1 -0
- datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
- datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
- datacompose-0.2.4.dist-info/METADATA +431 -0
- datacompose-0.2.4.dist-info/RECORD +31 -0
- datacompose-0.2.4.dist-info/WHEEL +5 -0
- datacompose-0.2.4.dist-info/entry_points.txt +2 -0
- datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
- datacompose-0.2.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Validation utilities for CLI commands.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from datacompose.transformers.discovery import TransformerDiscovery
|
|
6
|
+
from datacompose.cli.colors import error, info
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def validate_platform(platform: str, discovery: TransformerDiscovery) -> bool:
    """Check whether *platform* is among the platforms known to *discovery*.

    Args:
        platform: Platform name (e.g., 'pyspark', 'postgres', 'snowflake')
        discovery: TransformerDiscovery instance

    Returns:
        True if platform is valid, False otherwise
    """
    # Platform is the leading dot-separated component of each generator name.
    known_platforms = {name.split(".")[0] for name in discovery.list_generators()}

    if platform in known_platforms:
        return True

    print(error(f"Platform '{platform}' not found."))
    print(info(f"Available platforms: {', '.join(sorted(known_platforms))}"))
    return False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def validate_type_for_platform(
    platform: str, type_name: str, discovery: TransformerDiscovery
) -> bool:
    """Validate that type exists for the given platform.

    Args:
        platform: Platform name (e.g., 'pyspark', 'postgres')
        type_name: Type name (e.g., 'pandas_udf', 'sql_udf')
        discovery: TransformerDiscovery instance

    Returns:
        True if type is valid for platform, False otherwise
    """
    available_generators = discovery.list_generators()
    # Generator names are "<platform>.<type>"; keep only this platform's types.
    available_types = [
        g.split(".")[1] for g in available_generators if g.startswith(f"{platform}.")
    ]

    if type_name in available_types:
        return True

    print(error(f"Type '{type_name}' not available for platform '{platform}'."))
    if available_types:
        # Sorted for deterministic output, consistent with validate_platform.
        print(
            info(
                f"Available types for {platform}: {', '.join(sorted(available_types))}"
            )
        )
    else:
        print(info(f"No generators available for platform '{platform}'."))
    return False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_available_platforms(discovery: TransformerDiscovery) -> list[str]:
    """Get list of available platforms."""
    # The platform is the first dot-separated component of a generator name.
    platforms = {name.split(".")[0] for name in discovery.list_generators()}
    return sorted(platforms)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_available_types_for_platform(
    platform: str, discovery: TransformerDiscovery
) -> list[str]:
    """Get list of available types for a specific platform."""
    prefix = f"{platform}."
    return [
        name.split(".")[1]
        for name in discovery.list_generators()
        if name.startswith(prefix)
    ]
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base generator class for UDF generation.
|
|
3
|
+
|
|
4
|
+
The classes that inherit from this generator must implement the following methods:
|
|
5
|
+
def _get_template_content(self, transformer_dir: Path | None = None) -> str:
|
|
6
|
+
def __get_output_filename as well as any other build steps that you want.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Dict, Optional
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaseGenerator(ABC):
|
|
17
|
+
"""Base class for UDF generators."""
|
|
18
|
+
|
|
19
|
+
def __init__(self, template_dir: Path, output_dir: Path, verbose: bool = False):
|
|
20
|
+
"""Initialize the generator.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
template_dir: Directory containing templates
|
|
24
|
+
output_dir: Directory to write generated UDFs
|
|
25
|
+
verbose: Enable verbose output
|
|
26
|
+
"""
|
|
27
|
+
self.template_dir = template_dir
|
|
28
|
+
self.output_dir = output_dir
|
|
29
|
+
self.verbose = verbose
|
|
30
|
+
|
|
31
|
+
def generate(
|
|
32
|
+
self,
|
|
33
|
+
transformer_name: str,
|
|
34
|
+
force: bool = False,
|
|
35
|
+
transformer_dir: Optional[Path] = None,
|
|
36
|
+
) -> Dict[str, Any]:
|
|
37
|
+
"""Generate UDF for transformer.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
transformer_name: Name of the transformer
|
|
41
|
+
force: Force regeneration even if hash matches
|
|
42
|
+
transformer_dir: Directory containing the transformer (for template lookup)
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
Dictionary with generation results
|
|
46
|
+
"""
|
|
47
|
+
# Create a minimal spec-like dict from transformer name for compatibility
|
|
48
|
+
spec = {"name": transformer_name}
|
|
49
|
+
|
|
50
|
+
# Get template content
|
|
51
|
+
template_content = self._get_template_content(transformer_dir)
|
|
52
|
+
|
|
53
|
+
# Calculate hash for caching
|
|
54
|
+
spec_hash = self._calculate_hash(spec, template_content)
|
|
55
|
+
|
|
56
|
+
# Determine output path
|
|
57
|
+
output_file = self._get_output_filename(spec["name"])
|
|
58
|
+
output_path = self.output_dir / output_file
|
|
59
|
+
|
|
60
|
+
# Check if regeneration is needed
|
|
61
|
+
if not force and self._should_skip_generation(output_path, spec_hash):
|
|
62
|
+
return {
|
|
63
|
+
"skipped": True,
|
|
64
|
+
"output_path": str(output_path),
|
|
65
|
+
"hash": spec_hash,
|
|
66
|
+
"function_name": f"{spec['name']}_udf",
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
# Copy utils/primitives.py to the output directory
|
|
70
|
+
self._copy_utils_files(output_path)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
return {
|
|
74
|
+
"skipped": False,
|
|
75
|
+
"output_path": str(output_path),
|
|
76
|
+
"hash": spec_hash,
|
|
77
|
+
"function_name": f"{spec['name']}_udf",
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
@staticmethod
|
|
81
|
+
def _calculate_hash(spec: Dict[str, Any], template_content: str) -> str:
|
|
82
|
+
"""Calculate hash for cache invalidation."""
|
|
83
|
+
content = str(spec) + template_content
|
|
84
|
+
return hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
|
|
85
|
+
|
|
86
|
+
@staticmethod
|
|
87
|
+
def _should_skip_generation(output_path: Path, spec_hash: str) -> bool:
|
|
88
|
+
"""Check if generation should be skipped based on hash."""
|
|
89
|
+
if not output_path.exists():
|
|
90
|
+
return False
|
|
91
|
+
|
|
92
|
+
try:
|
|
93
|
+
with open(output_path, "r") as f:
|
|
94
|
+
first_lines = "".join(f.readlines()[:5])
|
|
95
|
+
return f"Hash: {spec_hash}" in first_lines
|
|
96
|
+
except Exception:
|
|
97
|
+
return False
|
|
98
|
+
|
|
99
|
+
def _write_output(self, output_path: Path, content: str):
|
|
100
|
+
"""Write generated content to output file."""
|
|
101
|
+
# Create output directory if it doesn't exist
|
|
102
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
103
|
+
|
|
104
|
+
# Create __init__.py files to make directories importable as Python packages
|
|
105
|
+
self._ensure_init_files(output_path)
|
|
106
|
+
|
|
107
|
+
with open(output_path, "w") as f:
|
|
108
|
+
f.write(content)
|
|
109
|
+
|
|
110
|
+
if self.verbose:
|
|
111
|
+
print(f"Wrote output to: {output_path}")
|
|
112
|
+
|
|
113
|
+
def _ensure_init_files(self, output_path: Path):
|
|
114
|
+
"""Ensure __init__.py files exist to make directories importable."""
|
|
115
|
+
# Get all directories from build down to the target directory
|
|
116
|
+
path_parts = output_path.parts
|
|
117
|
+
|
|
118
|
+
# Find the build directory index
|
|
119
|
+
try:
|
|
120
|
+
build_index = path_parts.index("build")
|
|
121
|
+
except ValueError:
|
|
122
|
+
# No build directory found, just create init for immediate parent
|
|
123
|
+
init_file = output_path.parent / "__init__.py"
|
|
124
|
+
if not init_file.exists():
|
|
125
|
+
init_file.touch()
|
|
126
|
+
if self.verbose:
|
|
127
|
+
print(f"Created {init_file}")
|
|
128
|
+
return
|
|
129
|
+
|
|
130
|
+
# Create __init__.py files for build and all subdirectories leading to output
|
|
131
|
+
for i in range(
|
|
132
|
+
build_index, len(path_parts) - 1
|
|
133
|
+
): # -1 to exclude the file itself
|
|
134
|
+
dir_path = Path(*path_parts[: i + 1])
|
|
135
|
+
init_file = dir_path / "__init__.py"
|
|
136
|
+
if not init_file.exists():
|
|
137
|
+
init_file.touch()
|
|
138
|
+
if self.verbose:
|
|
139
|
+
print(f"Created {init_file}")
|
|
140
|
+
|
|
141
|
+
@staticmethod
|
|
142
|
+
def _prepare_template_vars(spec: Dict[str, Any], spec_hash: str) -> Dict[str, Any]:
|
|
143
|
+
"""Prepare variables for template rendering."""
|
|
144
|
+
return {
|
|
145
|
+
"transformer_name": spec["name"],
|
|
146
|
+
"udf_name": f"{spec['name']}_udf",
|
|
147
|
+
"hash": spec_hash,
|
|
148
|
+
"generation_timestamp": datetime.now().isoformat(),
|
|
149
|
+
"typo_map": spec.get("typo_map", {}),
|
|
150
|
+
"regex_patterns": spec.get("regex", {}),
|
|
151
|
+
"flags": spec.get("flags", {}),
|
|
152
|
+
"options": spec.get("options", {}),
|
|
153
|
+
"custom_rules": spec.get("custom_rules", {}),
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _copy_utils_files(self, output_path: Path):
|
|
158
|
+
"""Copy utility files like primitives.py to the output directory."""
|
|
159
|
+
# Create utils directory at the same level as the output file
|
|
160
|
+
utils_dir = output_path.parent / "utils"
|
|
161
|
+
utils_dir.mkdir(parents=True, exist_ok=True)
|
|
162
|
+
|
|
163
|
+
# Create __init__.py in utils directory
|
|
164
|
+
init_file = utils_dir / "__init__.py"
|
|
165
|
+
if not init_file.exists():
|
|
166
|
+
init_file.touch()
|
|
167
|
+
if self.verbose:
|
|
168
|
+
print(f"Created {init_file}")
|
|
169
|
+
|
|
170
|
+
# Copy primitives.py from datacompose.operators
|
|
171
|
+
primitives_source = Path(__file__).parent.parent / "operators" / "primitives.py"
|
|
172
|
+
primitives_dest = utils_dir / "primitives.py"
|
|
173
|
+
|
|
174
|
+
if primitives_source.exists() and not primitives_dest.exists():
|
|
175
|
+
import shutil
|
|
176
|
+
shutil.copy2(primitives_source, primitives_dest)
|
|
177
|
+
if self.verbose:
|
|
178
|
+
print(f"Copied primitives.py to {primitives_dest}")
|
|
179
|
+
|
|
180
|
+
@classmethod
|
|
181
|
+
@abstractmethod
|
|
182
|
+
def _get_template_location(cls, transformer_dir: Path | None) -> Path | None:
|
|
183
|
+
pass
|
|
184
|
+
|
|
185
|
+
@abstractmethod
|
|
186
|
+
def _get_template_content(self, transformer_dir: Path | None) -> str:
|
|
187
|
+
"""Get the template content for this generator."""
|
|
188
|
+
pass
|
|
189
|
+
|
|
190
|
+
@abstractmethod
|
|
191
|
+
def _get_output_filename(self, transformer_name: str) -> str:
|
|
192
|
+
"""Get the output filename for generated UDF."""
|
|
193
|
+
pass
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Spark platform generators."""
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Spark pandas UDF generator.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from ..base import BaseGenerator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SparkPandasUDFGenerator(BaseGenerator):
    """Generator for Apache Spark pandas UDFs."""

    ENGINE_SUBDIRECTORY = "pyspark"
    TEMPLATE_FILENAME = "pyspark_primitives.py"

    @classmethod
    def _get_template_location(cls, transformer_dir: Path | None) -> Path | None:
        """Return the transformer-local template path, or None without a dir."""
        if transformer_dir is None:
            return None
        return transformer_dir / cls.ENGINE_SUBDIRECTORY / cls.TEMPLATE_FILENAME

    def _get_template_content(self, transformer_dir: Path | None = None) -> str:
        """Get the template content for Spark pandas UDFs."""
        candidates: list[Path | None] = []

        # Transformer-specific template takes precedence when a dir is given.
        if transformer_dir:
            candidates.append(self._get_template_location(transformer_dir))

        # Fallback: a template bundled next to this generator module.
        candidates.append(Path(__file__).parent / self.TEMPLATE_FILENAME)

        for candidate in candidates:
            if candidate and candidate.exists():
                return candidate.read_text()

        # If no templates found, raise error
        raise FileNotFoundError(
            f"No {self.TEMPLATE_FILENAME} template found in {transformer_dir} or {Path(__file__).parent}"
        )

    def _get_output_filename(self, transformer_name: str) -> str:
        """Get the output filename for PySpark primitives."""
        # Known transformers map to short primitive namespace names; anything
        # else falls back to "<transformer_name>_primitives".
        special_names = {
            "clean_emails": "email_primitives",
            "clean_addresses": "address_primitives",
            "clean_phone_numbers": "phone_primitives",
        }
        stem = special_names.get(transformer_name, f"{transformer_name}_primitives")
        return f"{stem}.py"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Datacompose Operators Module
|
|
3
|
+
=======================
|
|
4
|
+
|
|
5
|
+
This module provides the core framework for building composable data transformation pipelines.
|
|
6
|
+
|
|
7
|
+
Main Components:
|
|
8
|
+
- SmartPrimitive: Enables partial application of transformations
|
|
9
|
+
- PrimitiveRegistry: Container for organizing related transformations
|
|
10
|
+
- PipelineCompiler: Compiles declarative syntax into executable pipelines
|
|
11
|
+
- StablePipeline: Runtime executor for compiled pipelines
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .primitives import SmartPrimitive, PrimitiveRegistry
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"SmartPrimitive",
|
|
18
|
+
"PrimitiveRegistry",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
__version__ = "0.2.4"
|