datacompose 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. Click here for more details.

Files changed (31) hide show
  1. datacompose/__init__.py +1 -0
  2. datacompose/cli/__init__.py +5 -0
  3. datacompose/cli/colors.py +80 -0
  4. datacompose/cli/commands/__init__.py +3 -0
  5. datacompose/cli/commands/add.py +215 -0
  6. datacompose/cli/commands/init.py +451 -0
  7. datacompose/cli/commands/list.py +118 -0
  8. datacompose/cli/commands/upgrade.py +7 -0
  9. datacompose/cli/main.py +59 -0
  10. datacompose/cli/validation.py +72 -0
  11. datacompose/generators/__init__.py +3 -0
  12. datacompose/generators/base.py +193 -0
  13. datacompose/generators/pyspark/__init__.py +1 -0
  14. datacompose/generators/pyspark/generator.py +51 -0
  15. datacompose/operators/__init__.py +21 -0
  16. datacompose/operators/primitives.py +595 -0
  17. datacompose/transformers/__init__.py +0 -0
  18. datacompose/transformers/discovery.py +186 -0
  19. datacompose/transformers/text/__init__.py +1 -0
  20. datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  21. datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  22. datacompose/transformers/text/clean_emails/__init__.py +1 -0
  23. datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  24. datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  25. datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  26. datacompose-0.2.4.dist-info/METADATA +431 -0
  27. datacompose-0.2.4.dist-info/RECORD +31 -0
  28. datacompose-0.2.4.dist-info/WHEEL +5 -0
  29. datacompose-0.2.4.dist-info/entry_points.txt +2 -0
  30. datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
  31. datacompose-0.2.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,186 @@
1
+ """
2
+ Transformer and generator discovery system.
3
+ """
4
+
5
+ import importlib
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+
10
+
11
class TransformerDiscovery:
    """Discovers available transformers and generators on the filesystem.

    Transformers are found by walking ``<transformers_dir>/<domain>/<name>``
    directories. Generators are found by importing the ``*.py`` files under
    ``<generators_dir>/<platform>/`` and scanning them for generator classes
    or factory callables.
    """

    def __init__(
        self,
        transformers_dir: Optional[Path] = None,
        generators_dir: Optional[Path] = None,
    ):
        """Initialize discovery, optionally with custom directories.

        Args:
            transformers_dir: Directory holding the domain directories.
                Defaults to the directory containing this module.
            generators_dir: Directory holding the platform directories.
                Defaults to the sibling ``generators`` directory.
        """
        self.transformers_dir = transformers_dir or Path(__file__).parent
        self.generators_dir = (
            generators_dir or Path(__file__).parent.parent / "generators"
        )

    @staticmethod
    def _is_visible_dir(path: Path) -> bool:
        """Return True for a directory whose name is not hidden or dunder."""
        return path.is_dir() and not path.name.startswith((".", "__"))

    @staticmethod
    def _looks_like_generator(attr_name: str, attr: object) -> bool:
        """Return True if a module attribute is a generator class or factory."""
        # Generator classes: expose _get_template_content and are named *Generator.
        if (
            isinstance(attr, type)
            and hasattr(attr, "_get_template_content")
            and attr.__name__.endswith("Generator")
        ):
            return True
        # Factory functions: any public callable bound to a *Generator name.
        return (
            callable(attr)
            and attr_name.endswith("Generator")
            and not attr_name.startswith("_")
        )

    def discover_transformers(self) -> Dict[str, Path]:
        """Discover available transformers.

        Returns:
            Dict[transformer_name, transformer_path]. Empty when the
            transformers directory does not exist.
        """
        transformers: Dict[str, Path] = {}
        if not self.transformers_dir.is_dir():
            return transformers

        # Sorted so that name collisions across domains resolve the same way
        # on every filesystem (iterdir() yields entries in arbitrary order).
        for domain_dir in sorted(self.transformers_dir.iterdir()):
            # Exact comparison: `name not in ("discovery.py")` is a substring
            # test against a plain string and wrongly skips domains such as
            # "py" or "disc".
            if (
                not self._is_visible_dir(domain_dir)
                or domain_dir.name == "discovery.py"
            ):
                continue

            # Each visible sub-directory of a domain is one transformer.
            for transformer_dir in sorted(domain_dir.iterdir()):
                if self._is_visible_dir(transformer_dir):
                    transformers[transformer_dir.name] = transformer_dir

        return transformers

    def discover_generators(self) -> Dict[str, Dict[str, type]]:
        """Discover available generators by platform.

        Modules are always imported as
        ``datacompose.generators.<platform>.<module>``, regardless of where
        ``generators_dir`` points. Modules that fail to import are skipped.

        Returns:
            Dict[platform, Dict[generator_type, generator_class]]. Empty when
            the generators directory does not exist.
        """
        generators: Dict[str, Dict[str, type]] = {}
        if not self.generators_dir.is_dir():
            return generators

        for platform_dir in sorted(self.generators_dir.iterdir()):
            if (
                not self._is_visible_dir(platform_dir)
                or platform_dir.name == "base.py"
            ):
                continue

            platform_name = platform_dir.name
            found: Dict[str, type] = {}
            generators[platform_name] = found

            # Sorted so the "first available" fallback in resolve_generator
            # is deterministic.
            for generator_file in sorted(platform_dir.glob("*.py")):
                if generator_file.name.startswith((".", "__")):
                    continue
                generator_name = generator_file.stem

                try:
                    module = importlib.import_module(
                        f"datacompose.generators.{platform_name}.{generator_name}"
                    )
                    # dir() is alphabetical; the last matching attribute wins.
                    for attr_name in dir(module):
                        attr = getattr(module, attr_name)
                        if self._looks_like_generator(attr_name, attr):
                            found[generator_name] = attr
                except Exception:
                    # Best-effort discovery: skip modules that can't be
                    # imported or inspected.
                    continue

        return generators

    def get_transformer_info(self, transformer: str) -> Optional[Dict]:
        """Get info for a specific transformer.

        Returns:
            ``{"name": ..., "path": ...}`` with the path as a string, or None
            if the transformer is unknown.
        """
        path = self.discover_transformers().get(transformer)
        if path is None:
            return None
        return {"name": transformer, "path": str(path)}

    def resolve_transformer(
        self, transformer_ref: str
    ) -> Tuple[Optional[str], Optional[Path]]:
        """Resolve transformer reference to name and transformer path.

        Args:
            transformer_ref: transformer name

        Returns:
            Tuple of (transformer_name, transformer_path) or (None, None)
        """
        path = self.discover_transformers().get(transformer_ref)
        if path is None:
            return None, None
        return transformer_ref, path

    def resolve_generator(self, generator_ref: str) -> Optional[type]:
        """Resolve generator reference to generator class.

        Args:
            generator_ref: Either "platform.type" or just "platform". With no
                type, "pandas_udf" is preferred, then "generator", then the
                first generator discovered for the platform.

        Returns:
            Generator class or None
        """
        # "platform.type" splits on the first dot; a bare "platform" leaves
        # gen_type empty, which selects the default lookup below.
        platform_name, _, gen_type = generator_ref.partition(".")

        available = self.discover_generators().get(platform_name)
        if available is None:
            return None

        if gen_type:
            return available.get(gen_type)

        for default_type in ("pandas_udf", "generator"):
            if default_type in available:
                return available[default_type]

        # Fall back to the first available generator, if any.
        return next(iter(available.values()), None)

    def list_transformers(self) -> List[str]:
        """List all available transformers, sorted by name."""
        return sorted(self.discover_transformers())

    def list_generators(self) -> List[str]:
        """List all available generators in platform.type format, sorted."""
        return sorted(
            f"{platform_name}.{gen_type}"
            for platform_name, platform_generators in self.discover_generators().items()
            for gen_type in platform_generators
        )
@@ -0,0 +1 @@
1
+ """Text processing transformers."""
@@ -0,0 +1 @@
1
+ """Address processing transformers."""