datacompose 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/__init__.py +1 -0
- datacompose/cli/__init__.py +5 -0
- datacompose/cli/colors.py +80 -0
- datacompose/cli/commands/__init__.py +3 -0
- datacompose/cli/commands/add.py +215 -0
- datacompose/cli/commands/init.py +451 -0
- datacompose/cli/commands/list.py +118 -0
- datacompose/cli/commands/upgrade.py +7 -0
- datacompose/cli/main.py +59 -0
- datacompose/cli/validation.py +72 -0
- datacompose/generators/__init__.py +3 -0
- datacompose/generators/base.py +193 -0
- datacompose/generators/pyspark/__init__.py +1 -0
- datacompose/generators/pyspark/generator.py +51 -0
- datacompose/operators/__init__.py +21 -0
- datacompose/operators/primitives.py +595 -0
- datacompose/transformers/__init__.py +0 -0
- datacompose/transformers/discovery.py +186 -0
- datacompose/transformers/text/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
- datacompose/transformers/text/clean_emails/__init__.py +1 -0
- datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
- datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
- datacompose-0.2.4.dist-info/METADATA +431 -0
- datacompose-0.2.4.dist-info/RECORD +31 -0
- datacompose-0.2.4.dist-info/WHEEL +5 -0
- datacompose-0.2.4.dist-info/entry_points.txt +2 -0
- datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
- datacompose-0.2.4.dist-info/top_level.txt +1 -0
datacompose/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Datacompose source package."""
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Simple color utilities for CLI output.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Colors:
    """ANSI escape sequences used to style terminal output.

    Every attribute is a plain string meant to be placed in front of a
    message; ``RESET`` is appended afterwards to clear the styling.
    """

    # Text colors
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"
    WHITE = "\033[97m"
    GRAY = "\033[90m"

    # Styles
    BOLD = "\033[1m"
    DIM = "\033[2m"
    UNDERLINE = "\033[4m"

    # Reset
    RESET = "\033[0m"

    @classmethod
    def is_enabled(cls) -> bool:
        """Return True when colored output should be emitted.

        Colors are disabled when the ``NO_COLOR`` environment variable is set
        to a non-empty value, or when standard output is not a terminal.

        Returns:
            True if escape sequences should be written, False otherwise.
        """
        # Disable colors if NO_COLOR env var is set
        if os.getenv("NO_COLOR"):
            return False

        # sys.stdout may be None (no console attached) or may have been
        # replaced by an object without isatty(); neither is a terminal, so
        # report "disabled" instead of raising AttributeError.
        isatty = getattr(sys.stdout, "isatty", None)
        if isatty is None:
            return False

        try:
            return bool(isatty())
        except ValueError:
            # isatty() on a closed stream raises ValueError.
            return False
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def colorize(text: str, color: str = "", style: str = "") -> str:
    """Wrap *text* in the given style and color escape sequences.

    When ``Colors.is_enabled()`` reports False the text is returned
    unchanged; otherwise the style and color are prepended and
    ``Colors.RESET`` is appended.
    """
    if Colors.is_enabled():
        return f"{style + color}{text}{Colors.RESET}"
    return text
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def success(text: str) -> str:
    """Format *text* as a success message (bold green)."""
    return colorize(text, color=Colors.GREEN, style=Colors.BOLD)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def error(text: str) -> str:
    """Format *text* as an error message (bold red)."""
    return colorize(text, color=Colors.RED, style=Colors.BOLD)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def warning(text: str) -> str:
    """Format *text* as a warning message (bold yellow)."""
    return colorize(text, color=Colors.YELLOW, style=Colors.BOLD)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def info(text: str) -> str:
    """Format *text* as an informational message (blue, no extra style)."""
    return colorize(text, color=Colors.BLUE)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def highlight(text: str) -> str:
    """Format *text* as highlighted output (bold cyan)."""
    return colorize(text, color=Colors.CYAN, style=Colors.BOLD)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def dim(text: str) -> str:
    """Format *text* as less important output (gray, no extra style)."""
    return colorize(text, color=Colors.GRAY)
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Add command for generating UDFs.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from datacompose.cli.colors import dim, error, highlight, info, success
|
|
11
|
+
from datacompose.cli.validation import validate_platform, validate_type_for_platform
|
|
12
|
+
from datacompose.transformers.discovery import TransformerDiscovery
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Completion functions for Click shell completion
|
|
16
|
+
def complete_transformer(ctx, param, incomplete):
    """Offer shell-completion candidates for the TRANSFORMER argument.

    Candidates are the names reported by ``TransformerDiscovery`` that start
    with *incomplete*. Any failure yields an empty list so that completion
    never surfaces an error to the shell.
    """
    try:
        names = TransformerDiscovery().list_transformers()
        matches = []
        for name in names:
            if name.startswith(incomplete):
                matches.append(
                    click.shell_completion.CompletionItem(name)  # type: ignore
                )
        return matches
    except Exception:
        return []
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def complete_target(ctx, param, incomplete):
    """Offer shell-completion candidates for the ``--target`` option.

    Generator names are split on the first dot and the leading part is used
    as the platform name. Matching platforms are returned in sorted order so
    the suggestions are stable from one invocation to the next. Any failure
    yields an empty list so that completion never surfaces an error to the
    shell.
    """
    try:
        generators = TransformerDiscovery().list_generators()
        # Extract platform names (part before the dot); a set removes
        # duplicates and sorting makes the order deterministic.
        platforms = sorted({gen.split(".")[0] for gen in generators})
        return [
            click.shell_completion.CompletionItem(p)  # type: ignore
            for p in platforms
            if p.startswith(incomplete)
        ]
    except Exception:
        return []
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def complete_type(ctx, param, incomplete):
    """Offer shell-completion candidates for the ``--type`` option.

    When a target is already present in ``ctx.params`` only generators named
    ``"<target>.<type>"`` are considered; otherwise every generator is. The
    part after the first dot is the candidate type. Any failure yields an
    empty list so that completion never surfaces an error to the shell.
    """
    try:
        candidates = TransformerDiscovery().list_generators()

        # Narrow to the selected platform when one is known.
        selected_target = ctx.params.get("target")
        if selected_target:
            prefix = f"{selected_target}."
            candidates = [name for name in candidates if name.startswith(prefix)]

        suggestions = []
        for name in candidates:
            if "." not in name:
                continue
            udf_type = name.split(".", 1)[1]
            if udf_type.startswith(incomplete):
                suggestions.append(
                    click.shell_completion.CompletionItem(udf_type)  # type: ignore
                )
        return suggestions
    except Exception:
        return []
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Get the directory where this module is located
|
|
81
|
+
_MODULE_DIR: Path = Path(__file__).parent  # parent of __file__ as given; not resolved to an absolute path
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@click.command()
@click.argument("transformer", shell_complete=complete_transformer)
@click.option(
    "--target",
    "-t",
    default="pyspark",
    shell_complete=complete_target,
    help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Default: pyspark",
)
@click.option(
    "--type",
    shell_complete=complete_type,
    help="UDF type for the platform (e.g., 'pandas_udf', 'sql_udf'). Uses platform default if not specified",
)
@click.option("--output", "-o", help="Output directory (default: build/{target})")
@click.option(
    "--template-dir",
    default="src/transformers/templates",
    help="Directory containing templates (default: src/transformers/templates)",
)
@click.option("--verbose", "-v", is_flag=True, help="Verbose output")
@click.pass_context
def add(ctx, transformer, target, type, output, template_dir, verbose):
    """Add UDFs for transformers.

    TRANSFORMER: Transformer to add UDF for (e.g., 'clean_emails')
    """
    # NOTE: the parameter is named ``type`` because click derives it from the
    # ``--type`` option; it shadows the builtin inside this function only.

    # Initialize discovery for validation
    discovery = TransformerDiscovery()

    # Validate platform first
    if not validate_platform(target, discovery):
        ctx.exit(1)

    # Validate type if specified
    if type and not validate_type_for_platform(target, type, discovery):
        ctx.exit(1)

    # Combine target and type into generator reference. Previously the
    # validated ``type`` was dropped and only ``target`` was forwarded, so
    # ``--type`` had no effect. _run_add already accepts dotted references
    # (it takes the part before the first dot as the platform).
    # NOTE(review): assumes discovery.resolve_generator accepts the
    # "<platform>.<type>" form returned by list_generators - confirm.
    generator_ref = f"{target}.{type}" if type else target

    exit_code = _run_add(transformer, generator_ref, output, template_dir, verbose)
    if exit_code != 0:
        ctx.exit(exit_code)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _run_add(transformer, target, output, template_dir, verbose) -> int:
    """Resolve the transformer and generator, then generate the UDF.

    Progress and errors are printed to standard output.

    Returns:
        0 when the UDF was generated or skipped, 1 when the transformer or
        generator could not be resolved or generation raised.
    """
    registry = TransformerDiscovery()

    # Resolve transformer; bail out early when it is unknown.
    transformer_name, transformer_path = registry.resolve_transformer(transformer)
    if not transformer_path:
        print(error(f"Error: Transformer not found: {transformer}"))
        known_transformers = ", ".join(registry.list_transformers())
        print(info(f"Available transformers: {known_transformers}"))
        return 1

    print(info(f"Using transformer: {transformer_name}"))
    if verbose:
        print(dim(f"Transformer path: {transformer_path}"))

    # Resolve generator; bail out early when it is unknown.
    generator_class = registry.resolve_generator(target)
    if not generator_class:
        print(error(f"Error: Generator not found: {target}"))
        known_generators = ", ".join(registry.list_generators())
        print(info(f"Available generators: {known_generators}"))
        return 1

    # The platform is the part of the target before the first dot
    # (e.g., "pyspark.pandas_udf" -> "pyspark").
    platform = target.split(".")[0]
    base_dir = output if output else "build"
    output_dir = f"{base_dir}/{platform}/{transformer_name}"

    generator = generator_class(
        template_dir=Path(template_dir),
        output_dir=Path(output_dir),
        verbose=verbose,
    )

    try:
        result = generator.generate(
            transformer_name, force=False, transformer_dir=transformer_path
        )

        if result.get("skipped"):
            print(info(f"UDF already exists: {result['output_path']}"))
            print(dim("No changes needed (hash matches)"))
            if verbose:
                print(dim(f" Hash: {result.get('hash', 'N/A')}"))
            return 0

        print(success(f"✓ UDF generated: {result['output_path']}"))
        print(success(f"✓ Test created: {result['test_path']}"))
        print(highlight(f"Function name: {result['function_name']}"))
        if verbose:
            print(dim(f" Target: {target}"))
            print(highlight("\nGenerated package contents:"))
            print(f" - UDF code: {result['output_path']}")
            print(f" - Test file: {result['test_path']}")
        return 0

    except Exception as e:
        print(error(f"Add failed: {e}"))
        if verbose:
            import traceback

            traceback.print_exc()
        return 1
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _load_config() -> dict:
|
|
205
|
+
"""Load datacompose.json configuration if it exists."""
|
|
206
|
+
config_path = Path("datacompose.json")
|
|
207
|
+
if config_path.exists():
|
|
208
|
+
try:
|
|
209
|
+
with open(config_path, "r") as f:
|
|
210
|
+
return json.load(f)
|
|
211
|
+
except Exception:
|
|
212
|
+
pass
|
|
213
|
+
return {}
|
|
214
|
+
|
|
215
|
+
|