datacompose 0.2.4__py3-none-any.whl → 0.2.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/cli/commands/add.py +53 -40
- datacompose/cli/commands/init.py +35 -9
- datacompose/cli/commands/list.py +2 -2
- datacompose/cli/config.py +80 -0
- datacompose/cli/main.py +3 -3
- datacompose/generators/base.py +29 -41
- datacompose/generators/pyspark/generator.py +12 -17
- datacompose/transformers/text/{clean_addresses → addresses}/pyspark/pyspark_primitives.py +68 -13
- datacompose/transformers/text/{clean_emails → emails}/pyspark/pyspark_primitives.py +53 -1
- datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/pyspark/pyspark_primitives.py +377 -327
- datacompose-0.2.5.2.dist-info/METADATA +94 -0
- datacompose-0.2.5.2.dist-info/RECORD +31 -0
- datacompose/cli/commands/upgrade.py +0 -7
- datacompose-0.2.4.dist-info/METADATA +0 -431
- datacompose-0.2.4.dist-info/RECORD +0 -31
- /datacompose/transformers/text/{clean_addresses → addresses}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_emails → emails}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/__init__.py +0 -0
- {datacompose-0.2.4.dist-info → datacompose-0.2.5.2.dist-info}/WHEEL +0 -0
- {datacompose-0.2.4.dist-info → datacompose-0.2.5.2.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.4.dist-info → datacompose-0.2.5.2.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.4.dist-info → datacompose-0.2.5.2.dist-info}/top_level.txt +0 -0
datacompose/cli/commands/add.py
CHANGED
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
Add command for generating UDFs.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import json
|
|
6
5
|
from pathlib import Path
|
|
7
6
|
|
|
8
7
|
import click
|
|
9
8
|
|
|
10
9
|
from datacompose.cli.colors import dim, error, highlight, info, success
|
|
10
|
+
from datacompose.cli.config import ConfigLoader
|
|
11
11
|
from datacompose.cli.validation import validate_platform, validate_type_for_platform
|
|
12
12
|
from datacompose.transformers.discovery import TransformerDiscovery
|
|
13
13
|
|
|
@@ -86,28 +86,48 @@ _MODULE_DIR = Path(__file__).parent
|
|
|
86
86
|
@click.option(
|
|
87
87
|
"--target",
|
|
88
88
|
"-t",
|
|
89
|
-
default=
|
|
89
|
+
default=None,
|
|
90
90
|
shell_complete=complete_target,
|
|
91
|
-
help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake').
|
|
91
|
+
help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Uses default from datacompose.json if not specified",
|
|
92
92
|
)
|
|
93
93
|
@click.option(
|
|
94
94
|
"--type",
|
|
95
95
|
shell_complete=complete_type,
|
|
96
96
|
help="UDF type for the platform (e.g., 'pandas_udf', 'sql_udf'). Uses platform default if not specified",
|
|
97
97
|
)
|
|
98
|
-
@click.option("--output", "-o", help="Output directory (default: build/{target})")
|
|
99
98
|
@click.option(
|
|
100
|
-
"--
|
|
101
|
-
|
|
102
|
-
help="
|
|
99
|
+
"--output",
|
|
100
|
+
"-o",
|
|
101
|
+
help="Output directory (default: from config or transformers/{target})",
|
|
103
102
|
)
|
|
104
103
|
@click.option("--verbose", "-v", is_flag=True, help="Verbose output")
|
|
105
104
|
@click.pass_context
|
|
106
|
-
def add(ctx, transformer, target, type, output,
|
|
105
|
+
def add(ctx, transformer, target, type, output, verbose):
|
|
107
106
|
"""Add UDFs for transformers.
|
|
108
107
|
|
|
109
|
-
TRANSFORMER: Transformer to add UDF for (e.g., '
|
|
108
|
+
TRANSFORMER: Transformer to add UDF for (e.g., 'emails')
|
|
110
109
|
"""
|
|
110
|
+
# Load config to get default target if not specified
|
|
111
|
+
config = ConfigLoader.load_config()
|
|
112
|
+
|
|
113
|
+
if target is None:
|
|
114
|
+
# Try to get default target from config
|
|
115
|
+
target = ConfigLoader.get_default_target(config)
|
|
116
|
+
if target is None:
|
|
117
|
+
print(
|
|
118
|
+
error(
|
|
119
|
+
"Error: No target specified and no default target found in datacompose.json"
|
|
120
|
+
)
|
|
121
|
+
)
|
|
122
|
+
print(
|
|
123
|
+
info(
|
|
124
|
+
"Please specify a target with --target or run 'datacompose init' to set up defaults"
|
|
125
|
+
)
|
|
126
|
+
)
|
|
127
|
+
ctx.exit(1)
|
|
128
|
+
elif verbose:
|
|
129
|
+
print(dim(f"Using default target from config: {target}"))
|
|
130
|
+
|
|
111
131
|
# Initialize discovery for validation
|
|
112
132
|
discovery = TransformerDiscovery()
|
|
113
133
|
|
|
@@ -120,12 +140,12 @@ def add(ctx, transformer, target, type, output, template_dir, verbose):
|
|
|
120
140
|
ctx.exit(1)
|
|
121
141
|
|
|
122
142
|
# Combine target and type into generator reference
|
|
123
|
-
exit_code = _run_add(transformer, target, output,
|
|
143
|
+
exit_code = _run_add(transformer, target, output, verbose)
|
|
124
144
|
if exit_code != 0:
|
|
125
145
|
ctx.exit(exit_code)
|
|
126
146
|
|
|
127
147
|
|
|
128
|
-
def _run_add(transformer, target, output,
|
|
148
|
+
def _run_add(transformer, target, output, verbose) -> int:
|
|
129
149
|
"""Execute the add command."""
|
|
130
150
|
# Initialize discovery
|
|
131
151
|
discovery = TransformerDiscovery()
|
|
@@ -136,9 +156,7 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
136
156
|
if not transformer_path:
|
|
137
157
|
print(error(f"Error: Transformer not found: {transformer}"))
|
|
138
158
|
print(
|
|
139
|
-
info(
|
|
140
|
-
f"Available transformers: {', '.join(discovery.list_transformers())}"
|
|
141
|
-
)
|
|
159
|
+
info(f"Available transformers: {', '.join(discovery.list_transformers())}")
|
|
142
160
|
)
|
|
143
161
|
return 1
|
|
144
162
|
else:
|
|
@@ -156,20 +174,27 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
156
174
|
return 1
|
|
157
175
|
|
|
158
176
|
# Determine output directory
|
|
159
|
-
# Extract platform from target (e.g., "pyspark.pandas_udf" -> "pyspark")
|
|
160
|
-
platform = target.split(".")[0]
|
|
161
|
-
|
|
162
177
|
if not output:
|
|
163
|
-
|
|
178
|
+
# Try to get output from config first
|
|
179
|
+
config = ConfigLoader.load_config()
|
|
180
|
+
config_output = ConfigLoader.get_target_output(config, target)
|
|
181
|
+
if config_output:
|
|
182
|
+
# Config output already includes 'transformers/pyspark', so use it directly
|
|
183
|
+
output_dir = config_output
|
|
184
|
+
else:
|
|
185
|
+
output_dir = f"transformers/{target}"
|
|
164
186
|
else:
|
|
165
|
-
output_dir =
|
|
166
|
-
|
|
167
|
-
# Create generator instance
|
|
168
|
-
generator = generator_class(
|
|
169
|
-
template_dir=Path(template_dir), output_dir=Path(output_dir), verbose=verbose
|
|
170
|
-
)
|
|
187
|
+
output_dir = output
|
|
171
188
|
|
|
172
189
|
try:
|
|
190
|
+
# Create generator instance
|
|
191
|
+
# Note: template_dir is required by base class but not used by current generators
|
|
192
|
+
generator = generator_class(
|
|
193
|
+
template_dir=Path("."), # Placeholder - not actually used
|
|
194
|
+
output_dir=Path(output_dir),
|
|
195
|
+
verbose=verbose,
|
|
196
|
+
)
|
|
197
|
+
|
|
173
198
|
# Generate the UDF
|
|
174
199
|
result = generator.generate(
|
|
175
200
|
transformer_name, force=False, transformer_dir=transformer_dir
|
|
@@ -182,13 +207,15 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
182
207
|
print(dim(f" Hash: {result.get('hash', 'N/A')}"))
|
|
183
208
|
else:
|
|
184
209
|
print(success(f"✓ UDF generated: {result['output_path']}"))
|
|
185
|
-
|
|
210
|
+
if result.get("test_path"):
|
|
211
|
+
print(success(f"✓ Test created: {result['test_path']}"))
|
|
186
212
|
print(highlight(f"Function name: {result['function_name']}"))
|
|
187
213
|
if verbose:
|
|
188
214
|
print(dim(f" Target: {target}"))
|
|
189
215
|
print(highlight("\nGenerated package contents:"))
|
|
190
216
|
print(f" - UDF code: {result['output_path']}")
|
|
191
|
-
|
|
217
|
+
if result.get("test_path"):
|
|
218
|
+
print(f" - Test file: {result['test_path']}")
|
|
192
219
|
|
|
193
220
|
return 0
|
|
194
221
|
|
|
@@ -199,17 +226,3 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
|
|
|
199
226
|
|
|
200
227
|
traceback.print_exc()
|
|
201
228
|
return 1
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def _load_config() -> dict:
|
|
205
|
-
"""Load datacompose.json configuration if it exists."""
|
|
206
|
-
config_path = Path("datacompose.json")
|
|
207
|
-
if config_path.exists():
|
|
208
|
-
try:
|
|
209
|
-
with open(config_path, "r") as f:
|
|
210
|
-
return json.load(f)
|
|
211
|
-
except Exception:
|
|
212
|
-
pass
|
|
213
|
-
return {}
|
|
214
|
-
|
|
215
|
-
|
datacompose/cli/commands/init.py
CHANGED
|
@@ -18,10 +18,11 @@ from datacompose.cli.colors import dim, error, highlight, info, success
|
|
|
18
18
|
|
|
19
19
|
DEFAULT_CONFIG = {
|
|
20
20
|
"version": "1.0",
|
|
21
|
+
"default_target": "pyspark",
|
|
21
22
|
"aliases": {"utils": "./src/utils"},
|
|
22
23
|
"targets": {
|
|
23
24
|
"pyspark": {
|
|
24
|
-
"output": "./
|
|
25
|
+
"output": "./transformers/pyspark",
|
|
25
26
|
}
|
|
26
27
|
},
|
|
27
28
|
}
|
|
@@ -57,7 +58,7 @@ class InitCommand:
|
|
|
57
58
|
def get_config_template(template_name: str) -> Dict[str, Any]:
|
|
58
59
|
"""Get configuration template by name."""
|
|
59
60
|
if template_name == "minimal":
|
|
60
|
-
return {"version": "1.0", "targets": {"pyspark": {"output": "./
|
|
61
|
+
return {"version": "1.0", "default_target": "pyspark", "targets": {"pyspark": {"output": "./transformers/pyspark"}}}
|
|
61
62
|
elif template_name == "advanced":
|
|
62
63
|
config = DEFAULT_CONFIG.copy()
|
|
63
64
|
config.update(
|
|
@@ -65,10 +66,10 @@ class InitCommand:
|
|
|
65
66
|
"style": "custom",
|
|
66
67
|
"aliases": {
|
|
67
68
|
"utils": "./src/utils",
|
|
68
|
-
"
|
|
69
|
+
"transformers": "./transformers",
|
|
69
70
|
},
|
|
70
71
|
"include": ["src/**/*"],
|
|
71
|
-
"exclude": ["__pycache__", "
|
|
72
|
+
"exclude": ["__pycache__", "transformers", "*.pyc", ".pytest_cache"],
|
|
72
73
|
"testing": {"framework": "pytest", "test_dir": "./tests"},
|
|
73
74
|
}
|
|
74
75
|
)
|
|
@@ -184,7 +185,7 @@ class InitCommand:
|
|
|
184
185
|
|
|
185
186
|
# Select targets with multi-select
|
|
186
187
|
available_targets = {
|
|
187
|
-
"pyspark": {"output": "./
|
|
188
|
+
"pyspark": {"output": "./transformers/pyspark", "name": "PySpark (Apache Spark)"},
|
|
188
189
|
}
|
|
189
190
|
|
|
190
191
|
selected_targets = InitCommand.prompt_for_targets(available_targets)
|
|
@@ -199,6 +200,31 @@ class InitCommand:
|
|
|
199
200
|
|
|
200
201
|
# Update targets with user selections
|
|
201
202
|
config["targets"] = selected_targets
|
|
203
|
+
|
|
204
|
+
# Set default target to the first selected target (or only target if single)
|
|
205
|
+
target_keys = list(selected_targets.keys())
|
|
206
|
+
if len(target_keys) == 1:
|
|
207
|
+
config["default_target"] = target_keys[0]
|
|
208
|
+
elif len(target_keys) > 1:
|
|
209
|
+
# Ask user to select default target
|
|
210
|
+
print(highlight("\nSelect Default Target"))
|
|
211
|
+
print(dim("Which platform should be used by default when running 'datacompose add'?\n"))
|
|
212
|
+
for i, key in enumerate(target_keys, 1):
|
|
213
|
+
print(f" {i}. {key}")
|
|
214
|
+
print()
|
|
215
|
+
|
|
216
|
+
while True:
|
|
217
|
+
choice = input(f"Select default target (1-{len(target_keys)}): ").strip()
|
|
218
|
+
try:
|
|
219
|
+
choice_idx = int(choice) - 1
|
|
220
|
+
if 0 <= choice_idx < len(target_keys):
|
|
221
|
+
config["default_target"] = target_keys[choice_idx]
|
|
222
|
+
print(dim(f"Default target set to: {target_keys[choice_idx]}\n"))
|
|
223
|
+
break
|
|
224
|
+
else:
|
|
225
|
+
print(error("Invalid selection. Please try again."))
|
|
226
|
+
except ValueError:
|
|
227
|
+
print(error("Please enter a number."))
|
|
202
228
|
|
|
203
229
|
print() # Add spacing
|
|
204
230
|
return config
|
|
@@ -403,11 +429,11 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
|
|
|
403
429
|
"2. Source your shell config or restart terminal for tab completion"
|
|
404
430
|
)
|
|
405
431
|
print(
|
|
406
|
-
"3. Add your first transformer: datacompose add
|
|
432
|
+
"3. Add your first transformer: datacompose add emails"
|
|
407
433
|
)
|
|
408
434
|
else:
|
|
409
435
|
print(
|
|
410
|
-
"2. Add your first transformer: datacompose add
|
|
436
|
+
"2. Add your first transformer: datacompose add emails"
|
|
411
437
|
)
|
|
412
438
|
if not skip_completion:
|
|
413
439
|
print(
|
|
@@ -419,7 +445,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
|
|
|
419
445
|
print(success("✓ Tab completion configured"))
|
|
420
446
|
print(
|
|
421
447
|
highlight(
|
|
422
|
-
"\nRun 'datacompose add
|
|
448
|
+
"\nRun 'datacompose add emails' to get started"
|
|
423
449
|
)
|
|
424
450
|
)
|
|
425
451
|
print(
|
|
@@ -430,7 +456,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
|
|
|
430
456
|
else:
|
|
431
457
|
print(
|
|
432
458
|
highlight(
|
|
433
|
-
"\nRun 'datacompose add
|
|
459
|
+
"\nRun 'datacompose add emails' to get started"
|
|
434
460
|
)
|
|
435
461
|
)
|
|
436
462
|
if not skip_completion and not yes:
|
datacompose/cli/commands/list.py
CHANGED
|
@@ -95,7 +95,7 @@ class ListCommand:
|
|
|
95
95
|
print(f" • {transformer_name}")
|
|
96
96
|
|
|
97
97
|
print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
|
|
98
|
-
print("Example: datacompose add
|
|
98
|
+
print("Example: datacompose add emails --target pyspark")
|
|
99
99
|
return 0
|
|
100
100
|
|
|
101
101
|
@staticmethod
|
|
@@ -114,5 +114,5 @@ class ListCommand:
|
|
|
114
114
|
print(f" • {gen_type} ({gen_class.__name__})")
|
|
115
115
|
|
|
116
116
|
print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
|
|
117
|
-
print("Example: datacompose add
|
|
117
|
+
print("Example: datacompose add emails --target pyspark")
|
|
118
118
|
return 0
|
datacompose/cli/config.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration management for Datacompose CLI.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ConfigLoader:
|
|
11
|
+
"""Load and manage Datacompose configuration."""
|
|
12
|
+
|
|
13
|
+
DEFAULT_CONFIG_FILE = "datacompose.json"
|
|
14
|
+
|
|
15
|
+
@staticmethod
|
|
16
|
+
def load_config(config_path: Optional[Path] = None) -> Optional[Dict[str, Any]]:
|
|
17
|
+
"""Load configuration from datacompose.json.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
config_path: Optional path to config file. Defaults to ./datacompose.json
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
Config dictionary or None if not found
|
|
24
|
+
"""
|
|
25
|
+
if config_path is None:
|
|
26
|
+
config_path = Path(ConfigLoader.DEFAULT_CONFIG_FILE)
|
|
27
|
+
|
|
28
|
+
if not config_path.exists():
|
|
29
|
+
return None
|
|
30
|
+
|
|
31
|
+
try:
|
|
32
|
+
with open(config_path, 'r') as f:
|
|
33
|
+
return json.load(f)
|
|
34
|
+
except (json.JSONDecodeError, IOError):
|
|
35
|
+
return None
|
|
36
|
+
|
|
37
|
+
@staticmethod
|
|
38
|
+
def get_default_target(config: Optional[Dict[str, Any]] = None) -> Optional[str]:
|
|
39
|
+
"""Get the default target from config.
|
|
40
|
+
|
|
41
|
+
Args:
|
|
42
|
+
config: Optional config dict. If None, will load from file.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
Default target name or None
|
|
46
|
+
"""
|
|
47
|
+
if config is None:
|
|
48
|
+
config = ConfigLoader.load_config()
|
|
49
|
+
|
|
50
|
+
if not config:
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
# Check for explicit default_target setting
|
|
54
|
+
if "default_target" in config:
|
|
55
|
+
return config["default_target"]
|
|
56
|
+
|
|
57
|
+
# Otherwise use the first target if only one exists
|
|
58
|
+
targets = config.get("targets", {})
|
|
59
|
+
if len(targets) == 1:
|
|
60
|
+
return list(targets.keys())[0]
|
|
61
|
+
|
|
62
|
+
return None
|
|
63
|
+
|
|
64
|
+
@staticmethod
|
|
65
|
+
def get_target_output(config: Optional[Dict[str, Any]], target: str) -> Optional[str]:
|
|
66
|
+
"""Get the output directory for a specific target.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
config: Config dictionary
|
|
70
|
+
target: Target name
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
Output directory path or None
|
|
74
|
+
"""
|
|
75
|
+
if not config:
|
|
76
|
+
return None
|
|
77
|
+
|
|
78
|
+
targets = config.get("targets", {})
|
|
79
|
+
target_config = targets.get(target, {})
|
|
80
|
+
return target_config.get("output")
|
datacompose/cli/main.py
CHANGED
|
@@ -25,9 +25,9 @@ def cli(ctx):
|
|
|
25
25
|
"""Generate data cleaning UDFs for various platforms.
|
|
26
26
|
|
|
27
27
|
Examples:
|
|
28
|
-
datacompose init
|
|
29
|
-
datacompose add
|
|
30
|
-
datacompose add
|
|
28
|
+
datacompose init # Set up project with default target
|
|
29
|
+
datacompose add emails # Uses default target from config
|
|
30
|
+
datacompose add emails --target snowflake --output sql/udfs/
|
|
31
31
|
datacompose list targets
|
|
32
32
|
"""
|
|
33
33
|
pass
|
datacompose/generators/base.py
CHANGED
|
@@ -8,7 +8,6 @@ def __get_output_filename as well as any other build steps that you want.
|
|
|
8
8
|
|
|
9
9
|
import hashlib
|
|
10
10
|
from abc import ABC, abstractmethod
|
|
11
|
-
from datetime import datetime
|
|
12
11
|
from pathlib import Path
|
|
13
12
|
from typing import Any, Dict, Optional
|
|
14
13
|
|
|
@@ -45,16 +44,11 @@ class BaseGenerator(ABC):
|
|
|
45
44
|
Dictionary with generation results
|
|
46
45
|
"""
|
|
47
46
|
# Create a minimal spec-like dict from transformer name for compatibility
|
|
48
|
-
|
|
47
|
+
transformer = {"name": transformer_name}
|
|
49
48
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
# Calculate hash for caching
|
|
54
|
-
spec_hash = self._calculate_hash(spec, template_content)
|
|
55
|
-
|
|
56
|
-
# Determine output path
|
|
57
|
-
output_file = self._get_output_filename(spec["name"])
|
|
49
|
+
file_content: str = self._get_primitives_file(transformer_dir)
|
|
50
|
+
spec_hash = self._calculate_hash(transformer, file_content)
|
|
51
|
+
output_file = self._get_output_filename(transformer["name"])
|
|
58
52
|
output_path = self.output_dir / output_file
|
|
59
53
|
|
|
60
54
|
# Check if regeneration is needed
|
|
@@ -63,18 +57,18 @@ class BaseGenerator(ABC):
|
|
|
63
57
|
"skipped": True,
|
|
64
58
|
"output_path": str(output_path),
|
|
65
59
|
"hash": spec_hash,
|
|
66
|
-
"function_name": f"{
|
|
60
|
+
"function_name": f"{transformer['name']}_udf",
|
|
67
61
|
}
|
|
68
62
|
|
|
69
63
|
# Copy utils/primitives.py to the output directory
|
|
70
64
|
self._copy_utils_files(output_path)
|
|
71
|
-
|
|
65
|
+
self._write_output(output_path, file_content)
|
|
72
66
|
|
|
73
67
|
return {
|
|
74
68
|
"skipped": False,
|
|
75
69
|
"output_path": str(output_path),
|
|
76
70
|
"hash": spec_hash,
|
|
77
|
-
"function_name": f"{
|
|
71
|
+
"function_name": f"{transformer['name']}_udf",
|
|
78
72
|
}
|
|
79
73
|
|
|
80
74
|
@staticmethod
|
|
@@ -82,6 +76,7 @@ class BaseGenerator(ABC):
|
|
|
82
76
|
"""Calculate hash for cache invalidation."""
|
|
83
77
|
content = str(spec) + template_content
|
|
84
78
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
|
|
79
|
+
|
|
85
80
|
|
|
86
81
|
@staticmethod
|
|
87
82
|
def _should_skip_generation(output_path: Path, spec_hash: str) -> bool:
|
|
@@ -100,8 +95,6 @@ class BaseGenerator(ABC):
|
|
|
100
95
|
"""Write generated content to output file."""
|
|
101
96
|
# Create output directory if it doesn't exist
|
|
102
97
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
103
|
-
|
|
104
|
-
# Create __init__.py files to make directories importable as Python packages
|
|
105
98
|
self._ensure_init_files(output_path)
|
|
106
99
|
|
|
107
100
|
with open(output_path, "w") as f:
|
|
@@ -112,14 +105,14 @@ class BaseGenerator(ABC):
|
|
|
112
105
|
|
|
113
106
|
def _ensure_init_files(self, output_path: Path):
|
|
114
107
|
"""Ensure __init__.py files exist to make directories importable."""
|
|
115
|
-
# Get all directories from
|
|
108
|
+
# Get all directories from transformers down to the target directory
|
|
116
109
|
path_parts = output_path.parts
|
|
117
110
|
|
|
118
|
-
# Find the
|
|
111
|
+
# Find the transformers directory index
|
|
119
112
|
try:
|
|
120
|
-
|
|
113
|
+
transformers_index = path_parts.index("transformers")
|
|
121
114
|
except ValueError:
|
|
122
|
-
# No
|
|
115
|
+
# No transformers directory found, just create init for immediate parent
|
|
123
116
|
init_file = output_path.parent / "__init__.py"
|
|
124
117
|
if not init_file.exists():
|
|
125
118
|
init_file.touch()
|
|
@@ -127,9 +120,9 @@ class BaseGenerator(ABC):
|
|
|
127
120
|
print(f"Created {init_file}")
|
|
128
121
|
return
|
|
129
122
|
|
|
130
|
-
# Create __init__.py files for
|
|
123
|
+
# Create __init__.py files for transformers and all subdirectories leading to output
|
|
131
124
|
for i in range(
|
|
132
|
-
|
|
125
|
+
transformers_index, len(path_parts) - 1
|
|
133
126
|
): # -1 to exclude the file itself
|
|
134
127
|
dir_path = Path(*path_parts[: i + 1])
|
|
135
128
|
init_file = dir_path / "__init__.py"
|
|
@@ -138,25 +131,20 @@ class BaseGenerator(ABC):
|
|
|
138
131
|
if self.verbose:
|
|
139
132
|
print(f"Created {init_file}")
|
|
140
133
|
|
|
141
|
-
@staticmethod
|
|
142
|
-
def _prepare_template_vars(spec: Dict[str, Any], spec_hash: str) -> Dict[str, Any]:
|
|
143
|
-
"""Prepare variables for template rendering."""
|
|
144
|
-
return {
|
|
145
|
-
"transformer_name": spec["name"],
|
|
146
|
-
"udf_name": f"{spec['name']}_udf",
|
|
147
|
-
"hash": spec_hash,
|
|
148
|
-
"generation_timestamp": datetime.now().isoformat(),
|
|
149
|
-
"typo_map": spec.get("typo_map", {}),
|
|
150
|
-
"regex_patterns": spec.get("regex", {}),
|
|
151
|
-
"flags": spec.get("flags", {}),
|
|
152
|
-
"options": spec.get("options", {}),
|
|
153
|
-
"custom_rules": spec.get("custom_rules", {}),
|
|
154
|
-
}
|
|
155
|
-
|
|
156
134
|
|
|
157
135
|
def _copy_utils_files(self, output_path: Path):
|
|
158
|
-
"""Copy utility files like primitives.py to the
|
|
159
|
-
#
|
|
136
|
+
"""Copy utility files like primitives.py to the transformers directory."""
|
|
137
|
+
# Find the transformers directory root
|
|
138
|
+
path_parts = output_path.parts
|
|
139
|
+
try:
|
|
140
|
+
transformers_index = path_parts.index("transformers")
|
|
141
|
+
transformers_root = Path(*path_parts[:transformers_index + 1])
|
|
142
|
+
except ValueError:
|
|
143
|
+
# Fallback to parent directory if no 'transformers' in path
|
|
144
|
+
transformers_root = output_path.parent.parent
|
|
145
|
+
|
|
146
|
+
# Create utils directory in the same directory as the generated files
|
|
147
|
+
# This puts it at transformers/pyspark/utils
|
|
160
148
|
utils_dir = output_path.parent / "utils"
|
|
161
149
|
utils_dir.mkdir(parents=True, exist_ok=True)
|
|
162
150
|
|
|
@@ -179,12 +167,12 @@ class BaseGenerator(ABC):
|
|
|
179
167
|
|
|
180
168
|
@classmethod
|
|
181
169
|
@abstractmethod
|
|
182
|
-
def
|
|
170
|
+
def _get_primitives_location(cls, transformer_dir: Path | None) -> Path | None:
|
|
183
171
|
pass
|
|
184
172
|
|
|
185
173
|
@abstractmethod
|
|
186
|
-
def
|
|
187
|
-
"""Get the
|
|
174
|
+
def _get_primitives_file(self, transformer_dir: Path | None) -> str:
|
|
175
|
+
"""Get the file content for this generator."""
|
|
188
176
|
pass
|
|
189
177
|
|
|
190
178
|
@abstractmethod
|
datacompose/generators/pyspark/generator.py
CHANGED
|
@@ -11,41 +11,36 @@ class SparkPandasUDFGenerator(BaseGenerator):
|
|
|
11
11
|
"""Generator for Apache Spark pandas UDFs."""
|
|
12
12
|
|
|
13
13
|
ENGINE_SUBDIRECTORY = "pyspark"
|
|
14
|
-
|
|
14
|
+
PRIMITIVES_FILENAME = "pyspark_primitives.py"
|
|
15
15
|
|
|
16
16
|
@classmethod
|
|
17
|
-
def
|
|
17
|
+
def _get_primitives_location(cls, transformer_dir: Path | None) -> Path | None:
|
|
18
18
|
if transformer_dir is None:
|
|
19
19
|
return None
|
|
20
|
-
return transformer_dir / cls.ENGINE_SUBDIRECTORY / cls.
|
|
20
|
+
return transformer_dir / cls.ENGINE_SUBDIRECTORY / cls.PRIMITIVES_FILENAME
|
|
21
21
|
|
|
22
|
-
def
|
|
22
|
+
def _get_primitives_file(self, transformer_dir: Path | None = None) -> str:
|
|
23
23
|
"""Get the template content for Spark pandas UDFs."""
|
|
24
24
|
if transformer_dir:
|
|
25
25
|
# Look for transformer-specific template first
|
|
26
|
-
transformer_template = self.
|
|
26
|
+
transformer_template = self._get_primitives_location(transformer_dir)
|
|
27
27
|
if transformer_template and transformer_template.exists():
|
|
28
28
|
return transformer_template.read_text()
|
|
29
29
|
|
|
30
30
|
# Fallback to generator-specific template (if it exists)
|
|
31
|
-
generator_template = Path(__file__).parent / self.
|
|
31
|
+
generator_template = Path(__file__).parent / self.PRIMITIVES_FILENAME
|
|
32
32
|
if generator_template.exists():
|
|
33
33
|
return generator_template.read_text()
|
|
34
34
|
|
|
35
35
|
# If no templates found, raise error
|
|
36
36
|
raise FileNotFoundError(
|
|
37
|
-
f"No {self.
|
|
37
|
+
f"No {self.PRIMITIVES_FILENAME} template found in {transformer_dir} or {Path(__file__).parent}"
|
|
38
38
|
)
|
|
39
39
|
|
|
40
40
|
def _get_output_filename(self, transformer_name: str) -> str:
|
|
41
41
|
"""Get the output filename for PySpark primitives."""
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
# Use mapped name if available, otherwise fall back to transformer_name
|
|
50
|
-
output_name = name_mapping.get(transformer_name, f"{transformer_name}_primitives")
|
|
51
|
-
return f"{output_name}.py"
|
|
42
|
+
# Use the transformer name directly as the filename
|
|
43
|
+
# emails -> emails.py
|
|
44
|
+
# addresses -> addresses.py
|
|
45
|
+
# phone_numbers -> phone_numbers.py
|
|
46
|
+
return f"{transformer_name}.py"
|