datacompose 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (31)
  1. datacompose/__init__.py +1 -0
  2. datacompose/cli/__init__.py +5 -0
  3. datacompose/cli/colors.py +80 -0
  4. datacompose/cli/commands/__init__.py +3 -0
  5. datacompose/cli/commands/add.py +215 -0
  6. datacompose/cli/commands/init.py +451 -0
  7. datacompose/cli/commands/list.py +118 -0
  8. datacompose/cli/commands/upgrade.py +7 -0
  9. datacompose/cli/main.py +59 -0
  10. datacompose/cli/validation.py +72 -0
  11. datacompose/generators/__init__.py +3 -0
  12. datacompose/generators/base.py +193 -0
  13. datacompose/generators/pyspark/__init__.py +1 -0
  14. datacompose/generators/pyspark/generator.py +51 -0
  15. datacompose/operators/__init__.py +21 -0
  16. datacompose/operators/primitives.py +595 -0
  17. datacompose/transformers/__init__.py +0 -0
  18. datacompose/transformers/discovery.py +186 -0
  19. datacompose/transformers/text/__init__.py +1 -0
  20. datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  21. datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  22. datacompose/transformers/text/clean_emails/__init__.py +1 -0
  23. datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  24. datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  25. datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  26. datacompose-0.2.4.dist-info/METADATA +431 -0
  27. datacompose-0.2.4.dist-info/RECORD +31 -0
  28. datacompose-0.2.4.dist-info/WHEEL +5 -0
  29. datacompose-0.2.4.dist-info/entry_points.txt +2 -0
  30. datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
  31. datacompose-0.2.4.dist-info/top_level.txt +1 -0
datacompose/cli/validation.py
@@ -0,0 +1,72 @@
+"""
+Validation utilities for CLI commands.
+"""
+
+from datacompose.transformers.discovery import TransformerDiscovery
+from datacompose.cli.colors import error, info
+
+
+def validate_platform(platform: str, discovery: TransformerDiscovery) -> bool:
+    """Validate that platform exists.
+
+    Args:
+        platform: Platform name (e.g., 'pyspark', 'postgres', 'snowflake')
+        discovery: TransformerDiscovery instance
+
+    Returns:
+        True if platform is valid, False otherwise
+    """
+    available_generators = discovery.list_generators()
+    available_platforms = list(set(g.split(".")[0] for g in available_generators))
+
+    if platform not in available_platforms:
+        print(error(f"Platform '{platform}' not found."))
+        print(info(f"Available platforms: {', '.join(sorted(available_platforms))}"))
+        return False
+    return True
+
+
+def validate_type_for_platform(
+    platform: str, type_name: str, discovery: TransformerDiscovery
+) -> bool:
+    """Validate that type exists for the given platform.
+
+    Args:
+        platform: Platform name (e.g., 'pyspark', 'postgres')
+        type_name: Type name (e.g., 'pandas_udf', 'sql_udf')
+        discovery: TransformerDiscovery instance
+
+    Returns:
+        True if type is valid for platform, False otherwise
+    """
+    available_generators = discovery.list_generators()
+    platform_generators = [
+        g for g in available_generators if g.startswith(f"{platform}.")
+    ]
+    available_types = [g.split(".")[1] for g in platform_generators]
+
+    if type_name not in available_types:
+        print(error(f"Type '{type_name}' not available for platform '{platform}'."))
+        if available_types:
+            print(info(f"Available types for {platform}: {', '.join(available_types)}"))
+        else:
+            print(info(f"No generators available for platform '{platform}'."))
+        return False
+    return True
+
+
+def get_available_platforms(discovery: TransformerDiscovery) -> list[str]:
+    """Get list of available platforms."""
+    available_generators = discovery.list_generators()
+    return sorted(set(g.split(".")[0] for g in available_generators))
+
+
+def get_available_types_for_platform(
+    platform: str, discovery: TransformerDiscovery
+) -> list[str]:
+    """Get list of available types for a specific platform."""
+    available_generators = discovery.list_generators()
+    platform_generators = [
+        g for g in available_generators if g.startswith(f"{platform}.")
+    ]
+    return [g.split(".")[1] for g in platform_generators]
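
For orientation, a hedged sketch of how a CLI command might call these helpers. The zero-argument TransformerDiscovery() construction is an assumption, since discovery.py is not part of this excerpt; generators are listed as "<platform>.<type>" strings (e.g. "pyspark.pandas_udf", per the docstrings above), which is why the helpers split on ".".

from datacompose.transformers.discovery import TransformerDiscovery
from datacompose.cli.validation import (
    validate_platform,
    validate_type_for_platform,
    get_available_types_for_platform,
)

discovery = TransformerDiscovery()  # assumed constructor; not shown in this diff

# Validate the platform first, then pick a type that exists for it.
if validate_platform("pyspark", discovery):
    types = get_available_types_for_platform("pyspark", discovery)
    if types and validate_type_for_platform("pyspark", types[0], discovery):
        print(f"Using generator: pyspark.{types[0]}")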
datacompose/generators/__init__.py
@@ -0,0 +1,3 @@
+"""
+UDF generators for different platforms.
+"""
datacompose/generators/base.py
@@ -0,0 +1,193 @@
+"""
+Base generator class for UDF generation.
+
+Subclasses must implement `_get_template_location`, `_get_template_content`,
+and `_get_output_filename`, along with any other build steps they need.
+"""
+
+import hashlib
+import shutil
+from abc import ABC, abstractmethod
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+
+class BaseGenerator(ABC):
+    """Base class for UDF generators."""
+
+    def __init__(self, template_dir: Path, output_dir: Path, verbose: bool = False):
+        """Initialize the generator.
+
+        Args:
+            template_dir: Directory containing templates
+            output_dir: Directory to write generated UDFs
+            verbose: Enable verbose output
+        """
+        self.template_dir = template_dir
+        self.output_dir = output_dir
+        self.verbose = verbose
+
+    def generate(
+        self,
+        transformer_name: str,
+        force: bool = False,
+        transformer_dir: Optional[Path] = None,
+    ) -> Dict[str, Any]:
+        """Generate UDF for transformer.
+
+        Args:
+            transformer_name: Name of the transformer
+            force: Force regeneration even if hash matches
+            transformer_dir: Directory containing the transformer (for template lookup)
+
+        Returns:
+            Dictionary with generation results
+        """
+        # Create a minimal spec-like dict from the transformer name for compatibility
+        spec = {"name": transformer_name}
+
+        # Get template content
+        template_content = self._get_template_content(transformer_dir)
+
+        # Calculate hash for caching
+        spec_hash = self._calculate_hash(spec, template_content)
+
+        # Determine output path
+        output_file = self._get_output_filename(spec["name"])
+        output_path = self.output_dir / output_file
+
+        # Check if regeneration is needed
+        if not force and self._should_skip_generation(output_path, spec_hash):
+            return {
+                "skipped": True,
+                "output_path": str(output_path),
+                "hash": spec_hash,
+                "function_name": f"{spec['name']}_udf",
+            }
+
+        # Write the generated output, then copy utils/primitives.py alongside it
+        self._write_output(output_path, template_content)
+        self._copy_utils_files(output_path)
+
+        return {
+            "skipped": False,
+            "output_path": str(output_path),
+            "hash": spec_hash,
+            "function_name": f"{spec['name']}_udf",
+        }
+
+    @staticmethod
+    def _calculate_hash(spec: Dict[str, Any], template_content: str) -> str:
+        """Calculate hash for cache invalidation."""
+        content = str(spec) + template_content
+        return hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
+
+    @staticmethod
+    def _should_skip_generation(output_path: Path, spec_hash: str) -> bool:
+        """Check if generation should be skipped based on hash."""
+        if not output_path.exists():
+            return False
+
+        try:
+            with open(output_path, "r") as f:
+                first_lines = "".join(f.readlines()[:5])
+            return f"Hash: {spec_hash}" in first_lines
+        except Exception:
+            return False
+
+    def _write_output(self, output_path: Path, content: str):
+        """Write generated content to output file."""
+        # Create output directory if it doesn't exist
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Create __init__.py files to make directories importable as Python packages
+        self._ensure_init_files(output_path)
+
+        with open(output_path, "w") as f:
+            f.write(content)
+
+        if self.verbose:
+            print(f"Wrote output to: {output_path}")
+
+    def _ensure_init_files(self, output_path: Path):
+        """Ensure __init__.py files exist to make directories importable."""
+        # Get all directories from build down to the target directory
+        path_parts = output_path.parts
+
+        # Find the build directory index
+        try:
+            build_index = path_parts.index("build")
+        except ValueError:
+            # No build directory found; just create an __init__.py in the immediate parent
+            init_file = output_path.parent / "__init__.py"
+            if not init_file.exists():
+                init_file.touch()
+                if self.verbose:
+                    print(f"Created {init_file}")
+            return
+
+        # Create __init__.py files for build and all subdirectories leading to the output
+        for i in range(build_index, len(path_parts) - 1):  # stop before the file itself
+            dir_path = Path(*path_parts[: i + 1])
+            init_file = dir_path / "__init__.py"
+            if not init_file.exists():
+                init_file.touch()
+                if self.verbose:
+                    print(f"Created {init_file}")
+
+    @staticmethod
+    def _prepare_template_vars(spec: Dict[str, Any], spec_hash: str) -> Dict[str, Any]:
+        """Prepare variables for template rendering."""
+        return {
+            "transformer_name": spec["name"],
+            "udf_name": f"{spec['name']}_udf",
+            "hash": spec_hash,
+            "generation_timestamp": datetime.now().isoformat(),
+            "typo_map": spec.get("typo_map", {}),
+            "regex_patterns": spec.get("regex", {}),
+            "flags": spec.get("flags", {}),
+            "options": spec.get("options", {}),
+            "custom_rules": spec.get("custom_rules", {}),
+        }
+
+    def _copy_utils_files(self, output_path: Path):
+        """Copy utility files like primitives.py to the output directory."""
+        # Create utils directory at the same level as the output file
+        utils_dir = output_path.parent / "utils"
+        utils_dir.mkdir(parents=True, exist_ok=True)
+
+        # Create __init__.py in the utils directory
+        init_file = utils_dir / "__init__.py"
+        if not init_file.exists():
+            init_file.touch()
+            if self.verbose:
+                print(f"Created {init_file}")
+
+        # Copy primitives.py from datacompose.operators
+        primitives_source = Path(__file__).parent.parent / "operators" / "primitives.py"
+        primitives_dest = utils_dir / "primitives.py"
+
+        if primitives_source.exists() and not primitives_dest.exists():
+            shutil.copy2(primitives_source, primitives_dest)
+            if self.verbose:
+                print(f"Copied primitives.py to {primitives_dest}")
+
+    @classmethod
+    @abstractmethod
+    def _get_template_location(cls, transformer_dir: Path | None) -> Path | None:
+        """Get the path to a transformer-specific template, if any."""
+        pass
+
+    @abstractmethod
+    def _get_template_content(self, transformer_dir: Path | None) -> str:
+        """Get the template content for this generator."""
+        pass
+
+    @abstractmethod
+    def _get_output_filename(self, transformer_name: str) -> str:
+        """Get the output filename for generated UDF."""
+        pass
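
To make the contract concrete, here is a minimal sketch of a hypothetical subclass (EchoGenerator and its fixed template are illustrative, not part of the package). The three abstract methods are all a subclass must supply; hashing, skip detection, output writing, and utils copying are inherited. The expected result assumes the write step in generate() shown above.

from pathlib import Path

from datacompose.generators.base import BaseGenerator


class EchoGenerator(BaseGenerator):
    """Illustrative generator that emits a fixed template."""

    @classmethod
    def _get_template_location(cls, transformer_dir):
        return None  # no transformer-specific template

    def _get_template_content(self, transformer_dir=None):
        return "print('hello from generated code')\n"

    def _get_output_filename(self, transformer_name):
        return f"{transformer_name}.py"


gen = EchoGenerator(Path("templates"), Path("build"), verbose=True)
result = gen.generate("demo")
# -> {'skipped': False, 'output_path': 'build/demo.py', 'function_name': 'demo_udf', ...}

Note that _should_skip_generation only fires when the output file carries a "Hash: <spec_hash>" marker in its first five lines, so a template that wants to benefit from caching must embed that marker near the top of the generated file.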
datacompose/generators/pyspark/__init__.py
@@ -0,0 +1 @@
+"""Spark platform generators."""
datacompose/generators/pyspark/generator.py
@@ -0,0 +1,51 @@
+"""
+Spark pandas UDF generator.
+"""
+
+from pathlib import Path
+
+from ..base import BaseGenerator
+
+
+class SparkPandasUDFGenerator(BaseGenerator):
+    """Generator for Apache Spark pandas UDFs."""
+
+    ENGINE_SUBDIRECTORY = "pyspark"
+    TEMPLATE_FILENAME = "pyspark_primitives.py"
+
+    @classmethod
+    def _get_template_location(cls, transformer_dir: Path | None) -> Path | None:
+        if transformer_dir is None:
+            return None
+        return transformer_dir / cls.ENGINE_SUBDIRECTORY / cls.TEMPLATE_FILENAME
+
+    def _get_template_content(self, transformer_dir: Path | None = None) -> str:
+        """Get the template content for Spark pandas UDFs."""
+        if transformer_dir:
+            # Look for a transformer-specific template first
+            transformer_template = self._get_template_location(transformer_dir)
+            if transformer_template and transformer_template.exists():
+                return transformer_template.read_text()
+
+        # Fall back to the generator-specific template (if it exists)
+        generator_template = Path(__file__).parent / self.TEMPLATE_FILENAME
+        if generator_template.exists():
+            return generator_template.read_text()
+
+        # If no template is found, raise an error
+        raise FileNotFoundError(
+            f"No {self.TEMPLATE_FILENAME} template found in {transformer_dir} or {Path(__file__).parent}"
+        )
+
+    def _get_output_filename(self, transformer_name: str) -> str:
+        """Get the output filename for PySpark primitives."""
+        # Map transformer names to their primitive namespace names
+        name_mapping = {
+            "clean_emails": "email_primitives",
+            "clean_addresses": "address_primitives",
+            "clean_phone_numbers": "phone_primitives",
+        }
+
+        # Use the mapped name if available, otherwise fall back to transformer_name
+        output_name = name_mapping.get(transformer_name, f"{transformer_name}_primitives")
+        return f"{output_name}.py"
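
A short usage sketch (the paths are illustrative): pointing the generator at a transformer directory containing a pyspark/pyspark_primitives.py template yields the mapped output filename.

from pathlib import Path

from datacompose.generators.pyspark.generator import SparkPandasUDFGenerator

# Illustrative layout: <transformer_dir>/pyspark/pyspark_primitives.py
transformer_dir = Path("datacompose/transformers/text/clean_emails")
gen = SparkPandasUDFGenerator(
    template_dir=transformer_dir, output_dir=Path("build/pyspark")
)
result = gen.generate("clean_emails", transformer_dir=transformer_dir)
print(result["output_path"])  # build/pyspark/email_primitives.py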
datacompose/operators/__init__.py
@@ -0,0 +1,21 @@
+"""
+Datacompose Operators Module
+============================
+
+This module provides the core framework for building composable data transformation pipelines.
+
+Main Components:
+- SmartPrimitive: Enables partial application of transformations
+- PrimitiveRegistry: Container for organizing related transformations
+- PipelineCompiler: Compiles declarative syntax into executable pipelines
+- StablePipeline: Runtime executor for compiled pipelines
+"""
+
+from .primitives import SmartPrimitive, PrimitiveRegistry
+
+__all__ = [
+    "SmartPrimitive",
+    "PrimitiveRegistry",
+]
+
+__version__ = "0.2.4"
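
primitives.py itself (595 lines in this release) is not reproduced here, so the following is only a speculative sketch of the partial-application idea the module docstring describes; the register() decorator and the call pattern are assumptions, not confirmed API.

from pyspark.sql import functions as F

from datacompose.operators import PrimitiveRegistry

# Assumed usage: a registry namespaces column-level transformations, and
# SmartPrimitive lets a primitive be pre-configured with keyword arguments.
text = PrimitiveRegistry("text")

@text.register()
def truncate(col, length=10):
    return F.substring(col, 1, length)

# Partial application: supplying only configuration returns a one-argument
# callable that can be applied to a column later in a pipeline.
truncate_5 = text.truncate(length=5)
# df.select(truncate_5(F.col("name")))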