datacompose-0.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. Click here for more details.

Files changed (31) hide show
  1. datacompose/__init__.py +1 -0
  2. datacompose/cli/__init__.py +5 -0
  3. datacompose/cli/colors.py +80 -0
  4. datacompose/cli/commands/__init__.py +3 -0
  5. datacompose/cli/commands/add.py +215 -0
  6. datacompose/cli/commands/init.py +451 -0
  7. datacompose/cli/commands/list.py +118 -0
  8. datacompose/cli/commands/upgrade.py +7 -0
  9. datacompose/cli/main.py +59 -0
  10. datacompose/cli/validation.py +72 -0
  11. datacompose/generators/__init__.py +3 -0
  12. datacompose/generators/base.py +193 -0
  13. datacompose/generators/pyspark/__init__.py +1 -0
  14. datacompose/generators/pyspark/generator.py +51 -0
  15. datacompose/operators/__init__.py +21 -0
  16. datacompose/operators/primitives.py +595 -0
  17. datacompose/transformers/__init__.py +0 -0
  18. datacompose/transformers/discovery.py +186 -0
  19. datacompose/transformers/text/__init__.py +1 -0
  20. datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  21. datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  22. datacompose/transformers/text/clean_emails/__init__.py +1 -0
  23. datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  24. datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  25. datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  26. datacompose-0.2.4.dist-info/METADATA +431 -0
  27. datacompose-0.2.4.dist-info/RECORD +31 -0
  28. datacompose-0.2.4.dist-info/WHEEL +5 -0
  29. datacompose-0.2.4.dist-info/entry_points.txt +2 -0
  30. datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
  31. datacompose-0.2.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1 @@
1
+ """Datacompose source package."""
@@ -0,0 +1,5 @@
1
+ """
2
+ Datacompose CLI - Command-line interface for generating data cleaning UDFs.
3
+ """
4
+
5
+ __version__ = "0.2.4"
@@ -0,0 +1,80 @@
1
+ """
2
+ Simple color utilities for CLI output.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+
8
+
9
class Colors:
    """Namespace of ANSI escape sequences used to style terminal output."""

    # Foreground text colors
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"
    WHITE = "\033[97m"
    GRAY = "\033[90m"

    # Text styles
    BOLD = "\033[1m"
    DIM = "\033[2m"
    UNDERLINE = "\033[4m"

    # Sequence that clears every color/style applied so far
    RESET = "\033[0m"

    @classmethod
    def is_enabled(cls) -> bool:
        """Report whether colored output should be emitted.

        Returns:
            False when the ``NO_COLOR`` environment variable holds a non-empty
            value or when ``sys.stdout`` is not attached to a TTY; True
            otherwise.
        """
        opted_out = bool(os.getenv("NO_COLOR"))
        # Short-circuit keeps the TTY probe from running once the user opted out.
        return not opted_out and bool(sys.stdout.isatty())
42
+
43
+
44
def colorize(text: str, color: str = "", style: str = "") -> str:
    """Wrap *text* in ANSI codes when color output is enabled.

    Args:
        text: The message to decorate.
        color: ANSI color sequence to apply (empty for none).
        style: ANSI style sequence emitted before the color (empty for none).

    Returns:
        ``style + color + text`` followed by the reset sequence when
        ``Colors.is_enabled()`` is true; otherwise *text* unchanged.
    """
    if Colors.is_enabled():
        return f"{style + color}{text}{Colors.RESET}"
    return text
51
+
52
+
53
def success(text: str) -> str:
    """Format *text* as a success message (bold green when colors are on)."""
    return colorize(text, color=Colors.GREEN, style=Colors.BOLD)
56
+
57
+
58
def error(text: str) -> str:
    """Format *text* as an error message (bold red when colors are on)."""
    return colorize(text, color=Colors.RED, style=Colors.BOLD)
61
+
62
+
63
def warning(text: str) -> str:
    """Format *text* as a warning message (bold yellow when colors are on)."""
    return colorize(text, color=Colors.YELLOW, style=Colors.BOLD)
66
+
67
+
68
def info(text: str) -> str:
    """Format *text* as an informational message (blue when colors are on)."""
    return colorize(text, color=Colors.BLUE)
71
+
72
+
73
def highlight(text: str) -> str:
    """Format *text* so it stands out (bold cyan when colors are on)."""
    return colorize(text, color=Colors.CYAN, style=Colors.BOLD)
76
+
77
+
78
def dim(text: str) -> str:
    """Format *text* as secondary information (gray when colors are on)."""
    return colorize(text, color=Colors.GRAY)
@@ -0,0 +1,3 @@
1
+ """
2
+ CLI commands for Datacompose.
3
+ """
@@ -0,0 +1,215 @@
1
+ """
2
+ Add command for generating UDFs.
3
+ """
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import click
9
+
10
+ from datacompose.cli.colors import dim, error, highlight, info, success
11
+ from datacompose.cli.validation import validate_platform, validate_type_for_platform
12
+ from datacompose.transformers.discovery import TransformerDiscovery
13
+
14
+
15
+ # Completion functions for Click shell completion
16
def complete_transformer(ctx, param, incomplete):
    """Offer shell completions for the TRANSFORMER argument.

    Args:
        ctx: Click context (unused).
        param: Click parameter being completed (unused).
        incomplete: The text typed so far.

    Returns:
        Completion items for every discovered transformer name that starts
        with *incomplete*; an empty list if anything goes wrong, so that a
        discovery failure never breaks the user's shell.
    """
    try:
        known_names = TransformerDiscovery().list_transformers()
        matches = []
        for name in known_names:
            if name.startswith(incomplete):
                matches.append(
                    click.shell_completion.CompletionItem(name)  # type: ignore
                )
        return matches
    except Exception:
        return []
28
+
29
+
30
def complete_target(ctx, param, incomplete):
    """Offer shell completions for the ``--target`` option.

    Args:
        ctx: Click context (unused).
        param: Click parameter being completed (unused).
        incomplete: The text typed so far.

    Returns:
        Completion items for each distinct platform name (the part of a
        generator reference before the first dot) that starts with
        *incomplete*; an empty list if discovery raises.
    """
    try:
        generator_refs = TransformerDiscovery().list_generators()
        # A set removes platforms that appear once per generator type.
        unique_platforms = {ref.split(".")[0] for ref in generator_refs}
        matches = []
        for platform_name in unique_platforms:
            if platform_name.startswith(incomplete):
                matches.append(
                    click.shell_completion.CompletionItem(platform_name)  # type: ignore
                )
        return matches
    except Exception:
        return []
44
+
45
+
46
def complete_type(ctx, param, incomplete):
    """Offer shell completions for the ``--type`` option.

    Args:
        ctx: Click context; ``ctx.params["target"]`` narrows the suggestions
            when it is set.
        param: Click parameter being completed (unused).
        incomplete: The text typed so far.

    Returns:
        Completion items for generator type names (the part of a generator
        reference after the first dot) that start with *incomplete*. When a
        target is already selected only that target's types are offered;
        otherwise the types of every generator are. An empty list is
        returned if anything raises.
    """
    try:
        generator_refs = TransformerDiscovery().list_generators()

        selected_target = ctx.params.get("target") or None
        if selected_target:
            # Restrict to references of the form "<target>.<type>".
            wanted_prefix = f"{selected_target}."
            candidates = [
                ref for ref in generator_refs if ref.startswith(wanted_prefix)
            ]
        else:
            candidates = generator_refs

        type_names = (ref.split(".", 1)[1] for ref in candidates if "." in ref)
        return [
            click.shell_completion.CompletionItem(type_name)  # type: ignore
            for type_name in type_names
            if type_name.startswith(incomplete)
        ]
    except Exception:
        return []
78
+
79
+
80
+ # Get the directory where this module is located
81
+ _MODULE_DIR = Path(__file__).parent
82
+
83
+
84
@click.command()
@click.argument("transformer", shell_complete=complete_transformer)
@click.option(
    "--target",
    "-t",
    default="pyspark",
    shell_complete=complete_target,
    help="Target platform (e.g., 'pyspark', 'postgres', 'snowflake'). Default: pyspark",
)
@click.option(
    "--type",
    shell_complete=complete_type,
    help="UDF type for the platform (e.g., 'pandas_udf', 'sql_udf'). Uses platform default if not specified",
)
@click.option("--output", "-o", help="Output directory (default: build/{target})")
@click.option(
    "--template-dir",
    default="src/transformers/templates",
    help="Directory containing templates (default: src/transformers/templates)",
)
@click.option("--verbose", "-v", is_flag=True, help="Verbose output")
@click.pass_context
def add(ctx, transformer, target, type, output, template_dir, verbose):
    """Add UDFs for transformers.

    TRANSFORMER: Transformer to add UDF for (e.g., 'clean_emails')
    """
    # NOTE: the docstring above is rendered by Click as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Flow: validate the platform, validate the optional UDF type against it,
    # then delegate to _run_add and propagate any non-zero exit code.
    discovery = TransformerDiscovery()

    # Validate platform first; the validator returns a falsy value on failure.
    if not validate_platform(target, discovery):
        ctx.exit(1)

    # Validate type only when the user supplied one.
    if type and not validate_type_for_platform(target, type, discovery):
        ctx.exit(1)

    # Combine target and type into a "platform.type" generator reference so an
    # explicit --type is honored instead of being dropped after validation.
    # _run_add derives the platform from the part before the first dot.
    # NOTE(review): assumes discovery.resolve_generator accepts the dotted
    # form that list_generators reports — confirm against discovery.py.
    generator_ref = f"{target}.{type}" if type else target

    exit_code = _run_add(transformer, generator_ref, output, template_dir, verbose)
    if exit_code != 0:
        ctx.exit(exit_code)
126
+
127
+
128
def _run_add(transformer, target, output, template_dir, verbose) -> int:
    """Resolve the transformer and generator, generate the UDF, and report.

    Args:
        transformer: Transformer reference given on the command line.
        target: Generator reference; the text before the first dot is used
            as the platform name in the output path.
        output: Base output directory, or a falsy value to use ``build``.
        template_dir: Directory handed to the generator as its template root.
        verbose: When true, print extra details and a traceback on failure.

    Returns:
        0 when generation succeeded or was skipped, 1 when the transformer
        or generator could not be resolved or generation raised.
    """
    registry = TransformerDiscovery()

    # Guard: the transformer must resolve to a path.
    resolved_name, resolved_path = registry.resolve_transformer(transformer)
    if not resolved_path:
        print(error(f"Error: Transformer not found: {transformer}"))
        print(
            info(f"Available transformers: {', '.join(registry.list_transformers())}")
        )
        return 1

    print(info(f"Using transformer: {resolved_name}"))
    if verbose:
        print(dim(f"Transformer path: {resolved_path}"))

    # Guard: the target must resolve to a generator class.
    generator_cls = registry.resolve_generator(target)
    if not generator_cls:
        print(error(f"Error: Generator not found: {target}"))
        print(info(f"Available generators: {', '.join(registry.list_generators())}"))
        return 1

    # Output lands in <base>/<platform>/<transformer>, e.g. a target of
    # "pyspark.pandas_udf" contributes the platform "pyspark".
    platform_name = target.split(".")[0]
    base_dir = output if output else "build"
    destination = f"{base_dir}/{platform_name}/{resolved_name}"

    udf_generator = generator_cls(
        template_dir=Path(template_dir),
        output_dir=Path(destination),
        verbose=verbose,
    )

    try:
        outcome = udf_generator.generate(
            resolved_name, force=False, transformer_dir=resolved_path
        )

        if not outcome.get("skipped"):
            print(success(f"✓ UDF generated: {outcome['output_path']}"))
            print(success(f"✓ Test created: {outcome['test_path']}"))
            print(highlight(f"Function name: {outcome['function_name']}"))
            if verbose:
                print(dim(f" Target: {target}"))
                print(highlight("\nGenerated package contents:"))
                print(f" - UDF code: {outcome['output_path']}")
                print(f" - Test file: {outcome['test_path']}")
        else:
            print(info(f"UDF already exists: {outcome['output_path']}"))
            print(dim("No changes needed (hash matches)"))
            if verbose:
                print(dim(f" Hash: {outcome.get('hash', 'N/A')}"))

        return 0

    except Exception as exc:
        # Top-level CLI boundary: report the failure instead of crashing.
        print(error(f"Add failed: {exc}"))
        if verbose:
            import traceback

            traceback.print_exc()
        return 1
202
+
203
+
204
def _load_config() -> dict:
    """Load ``datacompose.json`` from the current working directory.

    Returns:
        The parsed top-level JSON object. An empty dict is returned when the
        file is missing or unreadable, does not contain valid JSON, or its
        top-level value is not a JSON object — so callers can always treat
        the result as a mapping.
    """
    config_path = Path("datacompose.json")
    try:
        # JSON text is UTF-8 by specification; do not depend on the locale.
        with config_path.open("r", encoding="utf-8") as config_file:
            loaded = json.load(config_file)
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file. ValueError: malformed JSON or
        # undecodable bytes. RecursionError: pathologically nested document.
        # Loading stays best-effort, but unrelated bugs are no longer hidden.
        return {}
    return loaded if isinstance(loaded, dict) else {}
214
+
215
+