datacompose-0.2.4.1-py3-none-any.whl → datacompose-0.2.5.2-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of datacompose has been flagged for review.
- datacompose/cli/commands/add.py +49 -21
- datacompose/cli/commands/init.py +35 -9
- datacompose/cli/commands/list.py +2 -2
- datacompose/cli/config.py +80 -0
- datacompose/cli/main.py +3 -3
- datacompose/generators/base.py +15 -14
- datacompose/generators/pyspark/generator.py +5 -10
- datacompose/transformers/text/{clean_addresses → addresses}/pyspark/pyspark_primitives.py +68 -13
- datacompose/transformers/text/{clean_emails → emails}/pyspark/pyspark_primitives.py +53 -1
- datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/pyspark/pyspark_primitives.py +377 -327
- datacompose-0.2.5.2.dist-info/METADATA +94 -0
- datacompose-0.2.5.2.dist-info/RECORD +31 -0
- datacompose-0.2.4.1.dist-info/METADATA +0 -449
- datacompose-0.2.4.1.dist-info/RECORD +0 -30
- /datacompose/transformers/text/{clean_addresses → addresses}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_emails → emails}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/__init__.py +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.5.2.dist-info}/WHEEL +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.5.2.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.5.2.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.5.2.dist-info}/top_level.txt +0 -0
datacompose/cli/commands/add.py
CHANGED

@@ -7,6 +7,7 @@ from pathlib import Path
 import click
 
 from datacompose.cli.colors import dim, error, highlight, info, success
+from datacompose.cli.config import ConfigLoader
 from datacompose.cli.validation import validate_platform, validate_type_for_platform
 from datacompose.transformers.discovery import TransformerDiscovery
 
@@ -85,28 +86,48 @@ _MODULE_DIR = Path(__file__).parent
 @click.option(
     "--target",
     "-t",
-    default=
+    default=None,
     shell_complete=complete_target,
-    help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake').
+    help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Uses default from datacompose.json if not specified",
 )
 @click.option(
     "--type",
     shell_complete=complete_type,
     help="UDF type for the platform (e.g., 'pandas_udf', 'sql_udf'). Uses platform default if not specified",
 )
-@click.option("--output", "-o", help="Output directory (default: build/{target})")
 @click.option(
-    "--
-
-    help="
+    "--output",
+    "-o",
+    help="Output directory (default: from config or transformers/{target})",
 )
 @click.option("--verbose", "-v", is_flag=True, help="Verbose output")
 @click.pass_context
-def add(ctx, transformer, target, type, output,
+def add(ctx, transformer, target, type, output, verbose):
     """Add UDFs for transformers.
 
-    TRANSFORMER: Transformer to add UDF for (e.g., '
+    TRANSFORMER: Transformer to add UDF for (e.g., 'emails')
     """
+    # Load config to get default target if not specified
+    config = ConfigLoader.load_config()
+
+    if target is None:
+        # Try to get default target from config
+        target = ConfigLoader.get_default_target(config)
+        if target is None:
+            print(
+                error(
+                    "Error: No target specified and no default target found in datacompose.json"
+                )
+            )
+            print(
+                info(
+                    "Please specify a target with --target or run 'datacompose init' to set up defaults"
+                )
+            )
+            ctx.exit(1)
+        elif verbose:
+            print(dim(f"Using default target from config: {target}"))
+
     # Initialize discovery for validation
     discovery = TransformerDiscovery()
 
@@ -119,12 +140,12 @@ def add(ctx, transformer, target, type, output, template_dir, verbose):
         ctx.exit(1)
 
     # Combine target and type into generator reference
-    exit_code = _run_add(transformer, target, output,
+    exit_code = _run_add(transformer, target, output, verbose)
     if exit_code != 0:
         ctx.exit(exit_code)
 
 
-def _run_add(transformer, target, output,
+def _run_add(transformer, target, output, verbose) -> int:
     """Execute the add command."""
     # Initialize discovery
     discovery = TransformerDiscovery()
@@ -135,9 +156,7 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
     if not transformer_path:
         print(error(f"Error: Transformer not found: {transformer}"))
         print(
-            info(
-                f"Available transformers: {', '.join(discovery.list_transformers())}"
-            )
+            info(f"Available transformers: {', '.join(discovery.list_transformers())}")
         )
         return 1
     else:
@@ -154,18 +173,28 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
         print(info(f"Available generators: {', '.join(discovery.list_generators())}"))
         return 1
 
-    # Determine output directory
+    # Determine output directory
    if not output:
-
+        # Try to get output from config first
+        config = ConfigLoader.load_config()
+        config_output = ConfigLoader.get_target_output(config, target)
+        if config_output:
+            # Config output already includes 'transformers/pyspark', so use it directly
+            output_dir = config_output
+        else:
+            output_dir = f"transformers/{target}"
    else:
-        output_dir =
+        output_dir = output
 
    try:
        # Create generator instance
+        # Note: template_dir is required by base class but not used by current generators
        generator = generator_class(
-            template_dir=Path(
+            template_dir=Path("."),  # Placeholder - not actually used
+            output_dir=Path(output_dir),
+            verbose=verbose,
        )
-
+
        # Generate the UDF
        result = generator.generate(
            transformer_name, force=False, transformer_dir=transformer_dir
@@ -178,14 +207,14 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
            print(dim(f"  Hash: {result.get('hash', 'N/A')}"))
        else:
            print(success(f"✓ UDF generated: {result['output_path']}"))
-            if result.get(
+            if result.get("test_path"):
                print(success(f"✓ Test created: {result['test_path']}"))
            print(highlight(f"Function name: {result['function_name']}"))
            if verbose:
                print(dim(f"  Target: {target}"))
            print(highlight("\nGenerated package contents:"))
            print(f"  - UDF code: {result['output_path']}")
-            if result.get(
+            if result.get("test_path"):
                print(f"  - Test file: {result['test_path']}")
 
        return 0
@@ -197,4 +226,3 @@ def _run_add(transformer, target, output, template_dir, verbose) -> int:
 
        traceback.print_exc()
        return 1
-
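
Taken together, the add.py changes give both the target and the output directory a CLI-flag-first, config-second resolution order. A minimal sketch of that order, condensed from the hunks above (the helper name resolve_target_and_output is hypothetical, not part of the package):

    from datacompose.cli.config import ConfigLoader

    def resolve_target_and_output(cli_target, cli_output):
        """Hypothetical condensation of what add() and _run_add() now do."""
        config = ConfigLoader.load_config()
        # --target wins; otherwise fall back to default_target in datacompose.json
        target = cli_target or ConfigLoader.get_default_target(config)
        if target is None:
            raise SystemExit("No target specified and no default target found")
        # --output wins; then the target's configured output; then transformers/{target}
        output = (
            cli_output
            or ConfigLoader.get_target_output(config, target)
            or f"transformers/{target}"
        )
        return target, output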
datacompose/cli/commands/init.py
CHANGED

@@ -18,10 +18,11 @@ from datacompose.cli.colors import dim, error, highlight, info, success
 
 DEFAULT_CONFIG = {
     "version": "1.0",
+    "default_target": "pyspark",
     "aliases": {"utils": "./src/utils"},
     "targets": {
         "pyspark": {
-            "output": "./
+            "output": "./transformers/pyspark",
         }
     },
 }
@@ -57,7 +58,7 @@ class InitCommand:
     def get_config_template(template_name: str) -> Dict[str, Any]:
         """Get configuration template by name."""
         if template_name == "minimal":
-            return {"version": "1.0", "targets": {"pyspark": {"output": "./
+            return {"version": "1.0", "default_target": "pyspark", "targets": {"pyspark": {"output": "./transformers/pyspark"}}}
         elif template_name == "advanced":
             config = DEFAULT_CONFIG.copy()
             config.update(
@@ -65,10 +66,10 @@ class InitCommand:
                     "style": "custom",
                     "aliases": {
                         "utils": "./src/utils",
-                        "
+                        "transformers": "./transformers",
                     },
                     "include": ["src/**/*"],
-                    "exclude": ["__pycache__", "
+                    "exclude": ["__pycache__", "transformers", "*.pyc", ".pytest_cache"],
                     "testing": {"framework": "pytest", "test_dir": "./tests"},
                 }
             )
@@ -184,7 +185,7 @@ class InitCommand:
 
         # Select targets with multi-select
         available_targets = {
-            "pyspark": {"output": "./
+            "pyspark": {"output": "./transformers/pyspark", "name": "PySpark (Apache Spark)"},
         }
 
         selected_targets = InitCommand.prompt_for_targets(available_targets)
@@ -199,6 +200,31 @@ class InitCommand:
 
         # Update targets with user selections
         config["targets"] = selected_targets
+
+        # Set default target to the first selected target (or only target if single)
+        target_keys = list(selected_targets.keys())
+        if len(target_keys) == 1:
+            config["default_target"] = target_keys[0]
+        elif len(target_keys) > 1:
+            # Ask user to select default target
+            print(highlight("\nSelect Default Target"))
+            print(dim("Which platform should be used by default when running 'datacompose add'?\n"))
+            for i, key in enumerate(target_keys, 1):
+                print(f"  {i}. {key}")
+            print()
+
+            while True:
+                choice = input(f"Select default target (1-{len(target_keys)}): ").strip()
+                try:
+                    choice_idx = int(choice) - 1
+                    if 0 <= choice_idx < len(target_keys):
+                        config["default_target"] = target_keys[choice_idx]
+                        print(dim(f"Default target set to: {target_keys[choice_idx]}\n"))
+                        break
+                    else:
+                        print(error("Invalid selection. Please try again."))
+                except ValueError:
+                    print(error("Please enter a number."))
 
         print()  # Add spacing
         return config
@@ -403,11 +429,11 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
             "2. Source your shell config or restart terminal for tab completion"
         )
         print(
-            "3. Add your first transformer: datacompose add
+            "3. Add your first transformer: datacompose add emails"
         )
     else:
         print(
-            "2. Add your first transformer: datacompose add
+            "2. Add your first transformer: datacompose add emails"
         )
     if not skip_completion:
         print(
@@ -419,7 +445,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
         print(success("✓ Tab completion configured"))
         print(
             highlight(
-                "\nRun 'datacompose add
+                "\nRun 'datacompose add emails' to get started"
             )
         )
         print(
@@ -430,7 +456,7 @@ def _run_init(force, output, verbose, yes, skip_completion) -> int:
     else:
         print(
             highlight(
-                "\nRun 'datacompose add
+                "\nRun 'datacompose add emails' to get started"
             )
         )
     if not skip_completion and not yes:
datacompose/cli/commands/list.py
CHANGED

@@ -95,7 +95,7 @@ class ListCommand:
             print(f"  • {transformer_name}")
 
         print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
-        print("Example: datacompose add
+        print("Example: datacompose add emails --target pyspark")
         return 0
 
     @staticmethod
@@ -114,5 +114,5 @@ class ListCommand:
             print(f"  • {gen_type} ({gen_class.__name__})")
 
         print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
-        print("Example: datacompose add
+        print("Example: datacompose add emails --target pyspark")
         return 0

datacompose/cli/config.py
ADDED

@@ -0,0 +1,80 @@
+"""
+Configuration management for Datacompose CLI.
+"""
+
+import json
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+
+class ConfigLoader:
+    """Load and manage Datacompose configuration."""
+
+    DEFAULT_CONFIG_FILE = "datacompose.json"
+
+    @staticmethod
+    def load_config(config_path: Optional[Path] = None) -> Optional[Dict[str, Any]]:
+        """Load configuration from datacompose.json.
+
+        Args:
+            config_path: Optional path to config file. Defaults to ./datacompose.json
+
+        Returns:
+            Config dictionary or None if not found
+        """
+        if config_path is None:
+            config_path = Path(ConfigLoader.DEFAULT_CONFIG_FILE)
+
+        if not config_path.exists():
+            return None
+
+        try:
+            with open(config_path, 'r') as f:
+                return json.load(f)
+        except (json.JSONDecodeError, IOError):
+            return None
+
+    @staticmethod
+    def get_default_target(config: Optional[Dict[str, Any]] = None) -> Optional[str]:
+        """Get the default target from config.
+
+        Args:
+            config: Optional config dict. If None, will load from file.
+
+        Returns:
+            Default target name or None
+        """
+        if config is None:
+            config = ConfigLoader.load_config()
+
+        if not config:
+            return None
+
+        # Check for explicit default_target setting
+        if "default_target" in config:
+            return config["default_target"]
+
+        # Otherwise use the first target if only one exists
+        targets = config.get("targets", {})
+        if len(targets) == 1:
+            return list(targets.keys())[0]
+
+        return None
+
+    @staticmethod
+    def get_target_output(config: Optional[Dict[str, Any]], target: str) -> Optional[str]:
+        """Get the output directory for a specific target.
+
+        Args:
+            config: Config dictionary
+            target: Target name
+
+        Returns:
+            Output directory path or None
+        """
+        if not config:
+            return None
+
+        targets = config.get("targets", {})
+        target_config = targets.get(target, {})
+        return target_config.get("output")
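
All three ConfigLoader methods are static and read-only, so the new module can be exercised directly; a short usage sketch (the commented values assume the default config written by 'datacompose init'):

    from pathlib import Path
    from datacompose.cli.config import ConfigLoader

    config = ConfigLoader.load_config(Path("datacompose.json"))
    target = ConfigLoader.get_default_target(config)          # "pyspark"
    output = ConfigLoader.get_target_output(config, target)   # "./transformers/pyspark"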
datacompose/cli/main.py
CHANGED

@@ -25,9 +25,9 @@ def cli(ctx):
     """Generate data cleaning UDFs for various platforms.
 
     Examples:
-        datacompose init
-        datacompose add
-        datacompose add
+        datacompose init                 # Set up project with default target
+        datacompose add emails           # Uses default target from config
+        datacompose add emails --target snowflake --output sql/udfs/
         datacompose list targets
     """
     pass
datacompose/generators/base.py
CHANGED

@@ -105,14 +105,14 @@ class BaseGenerator(ABC):
 
     def _ensure_init_files(self, output_path: Path):
         """Ensure __init__.py files exist to make directories importable."""
-        # Get all directories from
+        # Get all directories from transformers down to the target directory
         path_parts = output_path.parts
 
-        # Find the
+        # Find the transformers directory index
         try:
-
+            transformers_index = path_parts.index("transformers")
         except ValueError:
-            # No
+            # No transformers directory found, just create init for immediate parent
             init_file = output_path.parent / "__init__.py"
             if not init_file.exists():
                 init_file.touch()
@@ -120,9 +120,9 @@ class BaseGenerator(ABC):
                 print(f"Created {init_file}")
             return
 
-        # Create __init__.py files for
+        # Create __init__.py files for transformers and all subdirectories leading to output
         for i in range(
-
+            transformers_index, len(path_parts) - 1
         ):  # -1 to exclude the file itself
             dir_path = Path(*path_parts[: i + 1])
             init_file = dir_path / "__init__.py"
@@ -133,18 +133,19 @@ class BaseGenerator(ABC):
 
 
     def _copy_utils_files(self, output_path: Path):
-        """Copy utility files like primitives.py to the
-        # Find the
+        """Copy utility files like primitives.py to the transformers directory."""
+        # Find the transformers directory root
         path_parts = output_path.parts
         try:
-
-
+            transformers_index = path_parts.index("transformers")
+            transformers_root = Path(*path_parts[:transformers_index + 1])
         except ValueError:
-            # Fallback to parent directory if no '
-
+            # Fallback to parent directory if no 'transformers' in path
+            transformers_root = output_path.parent.parent
 
-        # Create utils directory
-
+        # Create utils directory in the same directory as the generated files
+        # This puts it at transformers/pyspark/utils
+        utils_dir = output_path.parent / "utils"
         utils_dir.mkdir(parents=True, exist_ok=True)
 
         # Create __init__.py in utils directory
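
To make the rewritten _ensure_init_files walk concrete, here is a standalone mirror of its index arithmetic (illustration only, not the shipped code):

    from pathlib import Path

    def init_files_created(output_path: Path) -> list:
        """Mirror of the _ensure_init_files loop in base.py, for illustration."""
        parts = output_path.parts
        i = parts.index("transformers")
        # len(parts) - 1 excludes the file itself, matching the loop above
        return [Path(*parts[: j + 1]) / "__init__.py" for j in range(i, len(parts) - 1)]

    print(init_files_created(Path("transformers/pyspark/emails.py")))
    # [PosixPath('transformers/__init__.py'), PosixPath('transformers/pyspark/__init__.py')]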
datacompose/generators/pyspark/generator.py
CHANGED

@@ -39,13 +39,8 @@ class SparkPandasUDFGenerator(BaseGenerator):
 
     def _get_output_filename(self, transformer_name: str) -> str:
         """Get the output filename for PySpark primitives."""
-        #
-
-
-
-
-        }
-
-        # Use mapped name if available, otherwise fall back to transformer_name
-        output_name = name_mapping.get(transformer_name, f"{transformer_name}_primitives")
-        return f"{output_name}.py"
+        # Use the transformer name directly as the filename
+        # emails -> emails.py
+        # addresses -> addresses.py
+        # phone_numbers -> phone_numbers.py
+        return f"{transformer_name}.py"
datacompose/transformers/text/{clean_addresses → addresses}/pyspark/pyspark_primitives.py
CHANGED

@@ -1,3 +1,54 @@
+"""
+Address transformation primitives for PySpark.
+
+Preview Output:
+    +----------------------------------------------+-------------+-----------+-----------+-----+-----+
+    |address                                       |street_number|street_name|city       |state|zip  |
+    +----------------------------------------------+-------------+-----------+-----------+-----+-----+
+    |123 Main St, New York, NY 10001               |123          |Main       |New York   |NY   |10001|
+    |456 oak ave apt 5b, los angeles, ca 90001     |456          |Oak        |Los Angeles|CA   |90001|
+    |789 ELM STREET CHICAGO IL 60601               |789          |Elm        |Chicago    |IL   |60601|
+    |321 pine rd. suite 100,, boston massachusetts |321          |Pine       |Boston     |MA   |null |
+    |PO Box 789, Atlanta, GA 30301                 |null         |null       |Atlanta    |GA   |30301|
+    +----------------------------------------------+-------------+-----------+-----------+-----+-----+
+
+Usage Example:
+    from pyspark.sql import SparkSession
+    from pyspark.sql import functions as F
+    from transformers.pyspark.addresses import addresses
+
+    # Initialize Spark
+    spark = SparkSession.builder.appName("DataCleaning").getOrCreate()
+
+    # Create sample data
+    data = [
+        ("123 Main St, New York, NY 10001",),
+        ("456 Oak Ave Apt 5B, Los Angeles, CA 90001",),
+        ("789 Elm Street, Chicago, IL 60601",),
+        ("321 Pine Road Suite 100, Boston, MA 02101",),
+    ]
+    df = spark.createDataFrame(data, ["address"])
+
+    # Extract and standardize address components
+    result_df = df.select(
+        F.col("address"),
+        addresses.extract_street_number(F.col("address")).alias("street_number"),
+        addresses.extract_street_name(F.col("address")).alias("street_name"),
+        addresses.extract_city(F.col("address")).alias("city"),
+        addresses.extract_state(F.col("address")).alias("state"),
+        addresses.extract_zip_code(F.col("address")).alias("zip")
+    )
+
+    # Show results
+    result_df.show(truncate=False)
+
+    # Filter to valid addresses
+    valid_addresses = result_df.filter(addresses.validate_zip_code(F.col("zip")))
+
+Installation:
+    datacompose add addresses
+"""
+
 import re
 from typing import TYPE_CHECKING, Dict, List, Optional
 
@@ -16,7 +67,7 @@ else:
 
 try:
     # Try local utils import first (for generated code)
-    from utils.primitives import PrimitiveRegistry
+    from utils.primitives import PrimitiveRegistry  # type: ignore
 except ImportError:
     # Fall back to installed datacompose package
     from datacompose.operators.primitives import PrimitiveRegistry
@@ -345,8 +396,10 @@ def extract_street_name(col: Column) -> Column:
     trimmed_col = F.trim(col)
     without_number = F.when(
         # If it's just a numbered street (e.g., "5th Avenue", "1st Street")
-        trimmed_col.rlike(
-
+        trimmed_col.rlike(
+            r"^(?i)\d+(?:st|nd|rd|th)\s+(?:" + "|".join(suffixes) + r")$"
+        ),
+        trimmed_col,  # Keep as is - it's a numbered street name
     ).otherwise(
         # Otherwise remove the house number
         F.regexp_replace(trimmed_col, r"^\d+[\w\-/]*\s+", "")
@@ -354,9 +407,7 @@ def extract_street_name(col: Column) -> Column:
 
     # Remove directional prefix - case insensitive
     # Include full directional words and abbreviations
-    prefix_pattern = (
-        r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
-    )
+    prefix_pattern = r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
     without_prefix = F.regexp_replace(without_number, prefix_pattern, "")
 
     # Extract everything before the street suffix - case insensitive
@@ -434,8 +485,10 @@ def extract_street_suffix(col: Column) -> Column:
 
     # Build pattern to match the LAST suffix in the string
     # This handles cases like "St. James Place" where we want "Place" not "St"
-    suffix_pattern =
-
+    suffix_pattern = (
+        r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"
+    )
+
     # Extract the last matching suffix - case insensitive
     suffix_pattern_ci = r"(?i)" + suffix_pattern
     result = F.regexp_extract(col, suffix_pattern_ci, 1)
@@ -653,25 +706,27 @@ def standardize_street_suffix(
     if col is None:
         return F.lit("")
     col = F.when(col.isNull(), F.lit("")).otherwise(col)
-
+
     # Convert to uppercase for matching
     upper_col = F.upper(F.trim(col))
 
     # Start with the original column
     result = col
-
+
     # Apply custom mappings first if provided (they take precedence)
     if custom_mappings:
         for original, standard in custom_mappings.items():
             result = F.when(
                 upper_col == F.upper(F.lit(original)), F.lit(standard)
             ).otherwise(result)
-
+
     # Then apply standard mappings for anything not already mapped
     # Need to check if result has changed to avoid overwriting custom mappings
     for original, standard in suffix_map.items():
         # Only apply if not already mapped by custom mappings
-        if custom_mappings and original.upper() in [
+        if custom_mappings and original.upper() in [
+            k.upper() for k in custom_mappings.keys()
+        ]:
             continue
         result = F.when(upper_col == original, F.lit(standard)).otherwise(result)
 
@@ -1005,7 +1060,7 @@ def format_secondary_address(unit_type: Column, unit_number: Column) -> Column:
         Column with formatted secondary address
 
     Example:
-        from datacompose.transformers.text.
+        from datacompose.transformers.text.addresses.pyspark.pyspark_udf import format_secondary_address
         df.select(format_secondary_address(F.lit("Apartment"), F.lit("5B")))
         # -> "Apt 5B"
     """
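
The negative lookahead in the rebuilt suffix_pattern is what selects the last suffix in the string rather than the first; a quick check with Python's re module, using a reduced suffix list for illustration:

    import re

    suffixes = ["St", "Ave", "Place"]  # reduced list for illustration
    pattern = r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"
    m = re.search(pattern, "St. James Place", flags=re.IGNORECASE)
    print(m.group(1))  # "Place" - the leading "St" fails the lookahead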
datacompose/transformers/text/{clean_emails → emails}/pyspark/pyspark_primitives.py
CHANGED

@@ -1,3 +1,55 @@
+"""
+Email transformation primitives for PySpark.
+
+Preview Output:
+    +---------------------------+----------------------+----------+----------------+--------+
+    |email                      |standardized          |username  |domain          |is_valid|
+    +---------------------------+----------------------+----------+----------------+--------+
+    |John.Doe@Gmail.COM         |john.doe@gmail.com    |john.doe  |gmail.com       |true    |
+    |JANE.SMITH@OUTLOOK.COM     |jane.smith@outlook.com|jane.smith|outlook.com     |true    |
+    |info@company-name.org      |info@company-name.org |info      |company-name.org|true    |
+    |invalid.email@             |null                  |null      |null            |false   |
+    |user+tag@domain.co.uk      |user+tag@domain.co.uk |user+tag  |domain.co.uk    |true    |
+    |bad email@test.com         |null                  |null      |null            |false   |
+    +---------------------------+----------------------+----------+----------------+--------+
+
+Usage Example:
+    from pyspark.sql import SparkSession
+    from pyspark.sql import functions as F
+    from transformers.pyspark.emails import emails
+
+    # Initialize Spark
+    spark = SparkSession.builder.appName("EmailCleaning").getOrCreate()
+
+    # Create sample data
+    data = [
+        ("john.doe@gmail.com",),
+        ("JANE.SMITH@OUTLOOK.COM",),
+        ("info@company-name.org",),
+        ("invalid.email@",),
+        ("user+tag@domain.co.uk",),
+    ]
+    df = spark.createDataFrame(data, ["email"])
+
+    # Extract and validate email components
+    result_df = df.select(
+        F.col("email"),
+        emails.standardize_email(F.col("email")).alias("standardized"),
+        emails.extract_username(F.col("email")).alias("username"),
+        emails.extract_domain(F.col("email")).alias("domain"),
+        emails.is_valid_email(F.col("email")).alias("is_valid")
+    )
+
+    # Show results
+    result_df.show(truncate=False)
+
+    # Filter to valid emails only
+    valid_emails = result_df.filter(F.col("is_valid") == True)
+
+Installation:
+    datacompose add emails
+"""
+
 import re
 from typing import TYPE_CHECKING, Dict, List, Optional
 
@@ -16,7 +68,7 @@ else:
 
 try:
     # Try local utils import first (for generated code)
-    from utils.primitives import PrimitiveRegistry
+    from utils.primitives import PrimitiveRegistry  # type: ignore
 except ImportError:
     # Fall back to installed datacompose package
     from datacompose.operators.primitives import PrimitiveRegistry