datacompose 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/__init__.py +1 -0
- datacompose/cli/__init__.py +5 -0
- datacompose/cli/colors.py +80 -0
- datacompose/cli/commands/__init__.py +3 -0
- datacompose/cli/commands/add.py +215 -0
- datacompose/cli/commands/init.py +451 -0
- datacompose/cli/commands/list.py +118 -0
- datacompose/cli/commands/upgrade.py +7 -0
- datacompose/cli/main.py +59 -0
- datacompose/cli/validation.py +72 -0
- datacompose/generators/__init__.py +3 -0
- datacompose/generators/base.py +193 -0
- datacompose/generators/pyspark/__init__.py +1 -0
- datacompose/generators/pyspark/generator.py +51 -0
- datacompose/operators/__init__.py +21 -0
- datacompose/operators/primitives.py +595 -0
- datacompose/transformers/__init__.py +0 -0
- datacompose/transformers/discovery.py +186 -0
- datacompose/transformers/text/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
- datacompose/transformers/text/clean_emails/__init__.py +1 -0
- datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
- datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
- datacompose-0.2.4.dist-info/METADATA +431 -0
- datacompose-0.2.4.dist-info/RECORD +31 -0
- datacompose-0.2.4.dist-info/WHEEL +5 -0
- datacompose-0.2.4.dist-info/entry_points.txt +2 -0
- datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
- datacompose-0.2.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transformer and generator discovery system.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TransformerDiscovery:
    """Discovers available transformers and generators.

    Transformers are found purely by directory layout:
    ``<transformers_dir>/<domain>/<transformer>/``.

    Generators are found by importing
    ``datacompose.generators.<platform>.<module>`` for every ``*.py`` file
    under ``<generators_dir>/<platform>/`` and looking for a generator class
    or factory inside the imported module.
    """

    # Entries whose names start with one of these prefixes (hidden entries,
    # ``__pycache__``, ``__init__.py``, ...) are never discovery candidates.
    _SKIPPED_PREFIXES = (".", "__")

    def __init__(
        self,
        transformers_dir: Optional[Path] = None,
        generators_dir: Optional[Path] = None,
    ):
        """Initialize discovery with custom directories.

        Args:
            transformers_dir: Root directory holding the domain directories.
                Defaults to the directory containing this module.
            generators_dir: Root directory holding the platform directories.
                Defaults to the sibling ``generators`` directory of this
                module's parent directory.
        """
        self.transformers_dir = transformers_dir or Path(__file__).parent
        self.generators_dir = (
            generators_dir or Path(__file__).parent.parent / "generators"
        )

    @classmethod
    def _is_candidate_dir(cls, path: Path) -> bool:
        """Return True if ``path`` is a directory that is not hidden/dunder."""
        return path.is_dir() and not path.name.startswith(cls._SKIPPED_PREFIXES)

    @staticmethod
    def _find_generator(module) -> Optional[type]:
        """Return the generator class or factory exposed by ``module``.

        An attribute qualifies when it is either a class that has a
        ``_get_template_content`` attribute and whose ``__name__`` ends with
        ``"Generator"``, or a public callable whose attribute name ends with
        ``"Generator"``. When several attributes qualify, the last one in
        ``dir(module)`` order wins. Returns ``None`` when nothing qualifies.
        """
        found = None
        for attr_name in dir(module):
            attr = getattr(module, attr_name)
            if (
                isinstance(attr, type)
                and hasattr(attr, "_get_template_content")
                and attr.__name__.endswith("Generator")
            ) or (
                callable(attr)
                and attr_name.endswith("Generator")
                and not attr_name.startswith("_")
            ):
                found = attr
        return found

    def discover_transformers(self) -> Dict[str, Path]:
        """
        Discover available transformers.

        Walks two levels below ``self.transformers_dir``: domain directories,
        then transformer directories inside each domain. Hidden and dunder
        entries are skipped at both levels. Entries are visited in sorted
        order so that, if two domains contain a transformer with the same
        name, the outcome does not depend on filesystem ordering.

        Returns:
            Dict[transformer_name, transformer_path]
        """
        transformers: Dict[str, Path] = {}

        for domain_dir in sorted(self.transformers_dir.iterdir()):
            # Compare against the exact name. The previous
            # ``name not in ("discovery.py")`` was a substring test against a
            # plain string (no trailing comma, so not a tuple) and wrongly
            # skipped domains named e.g. "py" or "disc".
            if (
                not self._is_candidate_dir(domain_dir)
                or domain_dir.name == "discovery.py"
            ):
                continue

            for transformer_dir in sorted(domain_dir.iterdir()):
                if self._is_candidate_dir(transformer_dir):
                    # Only the transformer directory path is recorded.
                    transformers[transformer_dir.name] = transformer_dir

        return transformers

    def discover_generators(self) -> Dict[str, Dict[str, type]]:
        """
        Discover available generators by platform.

        Every platform directory gets an entry, even when none of its modules
        yields a generator. Modules that fail to import (or to be inspected)
        are skipped; the failure is logged at debug level instead of being
        silently discarded.

        Returns:
            Dict[platform, Dict[generator_type, generator_class]]
        """
        # Imported locally so this method does not depend on a module-level
        # import being present.
        import logging

        logger = logging.getLogger(__name__)
        generators: Dict[str, Dict[str, type]] = {}

        for platform_dir in sorted(self.generators_dir.iterdir()):
            if (
                not self._is_candidate_dir(platform_dir)
                or platform_dir.name == "base.py"
            ):
                continue

            platform_name = platform_dir.name
            platform_generators: Dict[str, type] = {}
            generators[platform_name] = platform_generators

            # Sorted so the "first available" fallback in resolve_generator
            # is deterministic across filesystems.
            for generator_file in sorted(platform_dir.glob("*.py")):
                if generator_file.name.startswith(self._SKIPPED_PREFIXES):
                    continue

                generator_name = generator_file.stem
                module_path = (
                    f"datacompose.generators.{platform_name}.{generator_name}"
                )

                try:
                    module = importlib.import_module(module_path)
                    generator = self._find_generator(module)
                except Exception:
                    # Best effort: one broken module must not hide the rest.
                    logger.debug(
                        "Skipping generator module %s", module_path, exc_info=True
                    )
                    continue

                if generator is not None:
                    platform_generators[generator_name] = generator

        return generators

    def get_transformer_info(self, transformer: str) -> Optional[Dict[str, str]]:
        """Get info for a specific transformer.

        Returns:
            ``{"name": <transformer>, "path": <str path>}`` or ``None`` when
            the transformer is not discovered.
        """
        path = self.discover_transformers().get(transformer)
        if path is None:
            return None

        return {
            "name": transformer,
            "path": str(path),
        }

    def resolve_transformer(
        self, transformer_ref: str
    ) -> Tuple[Optional[str], Optional[Path]]:
        """
        Resolve transformer reference to name and transformer path.

        Args:
            transformer_ref: transformer name

        Returns:
            Tuple of (transformer_name, transformer_path) or (None, None)
        """
        path = self.discover_transformers().get(transformer_ref)
        if path is None:
            return None, None

        return transformer_ref, path

    def resolve_generator(self, generator_ref: str) -> Optional[type]:
        """
        Resolve generator reference to generator class.

        Args:
            generator_ref: Either "platform.type" or just "platform". With a
                bare platform the lookup falls back, in order, to the
                "pandas_udf" generator, the "generator" generator, then the
                first generator discovered for that platform.

        Returns:
            Generator class or None (unknown platform, unknown type, or a
            platform without any generator)
        """
        # "platform.type" splits on the first dot only; a bare "platform"
        # leaves gen_type empty, which selects the fallback chain below.
        platform, _, gen_type = generator_ref.partition(".")

        available = self.discover_generators().get(platform)
        if not available:
            return None

        if gen_type:
            # An explicitly requested type never falls back to a default.
            return available.get(gen_type)

        for default_type in ("pandas_udf", "generator"):
            if default_type in available:
                return available[default_type]

        # Use the first available generator for this platform.
        return next(iter(available.values()))

    def list_transformers(self) -> List[str]:
        """List all available transformers, sorted by name."""
        return sorted(self.discover_transformers())

    def list_generators(self) -> List[str]:
        """List all available generators in platform.type format, sorted."""
        return sorted(
            f"{platform}.{gen_type}"
            for platform, platform_generators in self.discover_generators().items()
            for gen_type in platform_generators
        )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Text processing transformers."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Address processing transformers."""
|