datacompose 0.2.4 (datacompose-0.2.4-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. Click here for more details.

Files changed (31) hide show
  1. datacompose/__init__.py +1 -0
  2. datacompose/cli/__init__.py +5 -0
  3. datacompose/cli/colors.py +80 -0
  4. datacompose/cli/commands/__init__.py +3 -0
  5. datacompose/cli/commands/add.py +215 -0
  6. datacompose/cli/commands/init.py +451 -0
  7. datacompose/cli/commands/list.py +118 -0
  8. datacompose/cli/commands/upgrade.py +7 -0
  9. datacompose/cli/main.py +59 -0
  10. datacompose/cli/validation.py +72 -0
  11. datacompose/generators/__init__.py +3 -0
  12. datacompose/generators/base.py +193 -0
  13. datacompose/generators/pyspark/__init__.py +1 -0
  14. datacompose/generators/pyspark/generator.py +51 -0
  15. datacompose/operators/__init__.py +21 -0
  16. datacompose/operators/primitives.py +595 -0
  17. datacompose/transformers/__init__.py +0 -0
  18. datacompose/transformers/discovery.py +186 -0
  19. datacompose/transformers/text/__init__.py +1 -0
  20. datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  21. datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  22. datacompose/transformers/text/clean_emails/__init__.py +1 -0
  23. datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  24. datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  25. datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  26. datacompose-0.2.4.dist-info/METADATA +431 -0
  27. datacompose-0.2.4.dist-info/RECORD +31 -0
  28. datacompose-0.2.4.dist-info/WHEEL +5 -0
  29. datacompose-0.2.4.dist-info/entry_points.txt +2 -0
  30. datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
  31. datacompose-0.2.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,451 @@
1
+ """
2
+ Init command for initializing a Datacompose project configuration.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import sys
8
+ import termios
9
+ import tty
10
+ from pathlib import Path
11
+ from typing import Any, Dict
12
+
13
+ import click
14
+
15
+ from datacompose.cli.colors import dim, error, highlight, info, success
16
+
17
# Baseline project configuration written by `datacompose init` when no
# interactive customisation is applied.
DEFAULT_CONFIG: Dict[str, Any] = {
    "version": "1.0",
    "aliases": {"utils": "./src/utils"},
    "targets": {"pyspark": {"output": "./build/pyspark"}},
}
28
+
29
+
30
@click.command()
@click.option(
    "--force",
    "-f",
    is_flag=True,
    help="Overwrite existing datacompose.json if it exists",
)
@click.option(
    "--output",
    "-o",
    default="./datacompose.json",
    help="Output path for the config file (default: ./datacompose.json)",
)
@click.option("--verbose", "-v", is_flag=True, help="Verbose output")
@click.option(
    "--yes",
    "-y",
    is_flag=True,
    help="Skip interactive prompts and use defaults",
)
@click.option("--skip-completion", is_flag=True, help="Skip shell completion setup")
@click.pass_context
def init(ctx, force, output, verbose, yes, skip_completion):
    """Initialize project configuration."""
    # The docstring above is the text click shows in `--help`; the real work
    # lives in _run_init, which reports its outcome as a process exit status.
    status = _run_init(
        force=force,
        output=output,
        verbose=verbose,
        yes=yes,
        skip_completion=skip_completion,
    )
    if status:
        ctx.exit(status)
51
+
52
+
53
class InitCommand:
    """Command to initialize a Datacompose project configuration.

    All members are static helpers used by the ``init`` CLI command: template
    lookup, the interactive target picker, directory creation and optional
    shell-completion installation.
    """

    @staticmethod
    def get_config_template(template_name: str) -> Dict[str, Any]:
        """Return a fresh configuration dict for *template_name*.

        ``"minimal"`` and ``"advanced"`` select dedicated templates; any other
        name yields the default template.  The result never shares nested
        dicts with ``DEFAULT_CONFIG``, so callers may mutate it freely.
        """
        # Local import keeps this block self-contained; a deep copy is needed
        # because DEFAULT_CONFIG contains nested dicts that a shallow
        # ``.copy()`` would hand out by reference.
        import copy

        if template_name == "minimal":
            return {"version": "1.0", "targets": {"pyspark": {"output": "./build/pyspark"}}}

        config = copy.deepcopy(DEFAULT_CONFIG)
        if template_name == "advanced":
            config.update(
                {
                    "style": "custom",
                    "aliases": {
                        "utils": "./src/utils",
                        "build": "./build",
                    },
                    "include": ["src/**/*"],
                    "exclude": ["__pycache__", "build", "*.pyc", ".pytest_cache"],
                    "testing": {"framework": "pytest", "test_dir": "./tests"},
                }
            )
        return config

    @staticmethod
    def get_key():
        """Read a single key press from stdin and return it as a string.

        Arrow keys arrive as a three-character escape sequence, so after an
        ESC two further characters are read.  (Consequence: a lone ESC press
        is only returned once two more characters have been typed.)

        If raw-mode input is not possible for any reason, falls back to a
        line-based ``input()``.
        """
        try:
            fd = sys.stdin.fileno()
            old_settings = termios.tcgetattr(fd)
            try:
                tty.setraw(fd)
                key = sys.stdin.read(1)

                # Handle arrow keys (escape sequences)
                if key == "\x1b":
                    key += sys.stdin.read(2)
            finally:
                # Always leave the terminal in its original mode, even when
                # the read above fails; otherwise the shell stays in raw mode.
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
            return key
        except Exception:
            # Fallback when raw terminal access is unavailable
            return input()

    @staticmethod
    def _initial_selection(target_keys) -> list:
        """Return the initial checkbox state: only the first target is ticked."""
        return [index == 0 for index in range(len(target_keys))]

    @staticmethod
    def prompt_for_targets(available_targets: Dict[str, Dict]) -> Dict[str, Dict]:
        """Interactive multi-select for choosing targets with arrow key navigation.

        Each value of *available_targets* must provide ``"name"`` and
        ``"output"`` entries.  Returns ``{key: {"output": path}}`` for every
        confirmed target, or ``{}`` when the user quits, nothing is selected,
        or no targets are available.
        """
        target_keys = list(available_targets)
        if not target_keys:
            # Nothing to choose from; also avoids a modulo-by-zero below.
            return {}

        # The first entry starts ticked so that pressing ENTER straight away
        # accepts a sensible default instead of an empty selection.
        selected = InitCommand._initial_selection(target_keys)
        current_pos = 0  # Current cursor position

        while True:
            print("\033[2J\033[H", end="")  # Clear screen, move cursor to top
            print(highlight("Platform Selection"))
            print(dim("Choose which platforms you'd like to generate UDFs for:\n"))

            for i, target_info in enumerate(available_targets.values()):
                if selected[i]:
                    marker = "[✓]"
                    name = success(target_info["name"])
                else:
                    marker = "[ ]"
                    name = target_info["name"]

                is_current = i == current_pos
                # Two-character cursor column keeps all rows aligned.
                cursor = "> " if is_current else "  "
                line = f"{cursor}{marker} {name} {dim('-> ' + target_info['output'])}"
                # Reverse video highlights the row under the cursor.
                print(f"\033[7m{line}\033[0m" if is_current else line)

            selected_names = [key for key, is_on in zip(target_keys, selected) if is_on]
            if selected_names:
                summary = highlight(f"Selected: {', '.join(selected_names)}")
            else:
                summary = dim("Selected: None")

            print(f"\n{summary}")
            print(
                f"\n{dim('Controls:')} ↑/↓ navigate • SPACE toggle • ENTER confirm • q/ESC quit"
            )

            key = InitCommand.get_key()

            if key == "\x1b[A":  # Up arrow
                current_pos = (current_pos - 1) % len(target_keys)
            elif key == "\x1b[B":  # Down arrow
                current_pos = (current_pos + 1) % len(target_keys)
            elif key == " ":  # Space to toggle
                selected[current_pos] = not selected[current_pos]
            elif key in ("\r", "\n"):  # Enter to confirm
                break
            elif key in ("q", "Q", "\x1b", "\x03"):
                # q / ESC quit.  Ctrl-C is delivered as "\x03" rather than a
                # signal while the terminal is in raw mode, so honour it too.
                return {}

        # Build selected targets with custom output paths
        print("\033[2J\033[H", end="")  # Clear screen
        print(highlight("Output Directory Configuration"))
        print(dim("Configure output directories for your selected platforms:\n"))

        result = {}
        for i, (key, target_info) in enumerate(available_targets.items()):
            if not selected[i]:
                continue
            prompt = f"{success('[✓]')} {target_info['name']} output directory? {dim('(default: ' + target_info['output'] + ')')} "
            # Empty answer means "keep the default".
            output_path = input(prompt).strip() or target_info["output"]

            result[key] = {"output": output_path}
            print(dim(f" -> Set to: {output_path}\n"))

        return result

    @staticmethod
    def prompt_for_config(template_config: Dict[str, Any]) -> "Dict[str, Any] | None":
        """Interactively prompt user for configuration options.

        Returns a copy of *template_config* whose ``"targets"`` entry is
        replaced by the user's selection, or ``None`` when the user cancels
        (or confirms with nothing selected).
        """
        # The return annotation is quoted so it is not evaluated at definition
        # time, where ``X | None`` on typing generics fails before Python 3.10.
        print(highlight("Setting up your Datacompose project configuration..."))
        print(dim("Press Enter to use the default value shown in brackets.\n"))

        print()

        # Select targets with multi-select
        available_targets = {
            "pyspark": {"output": "./build/pyspark", "name": "PySpark (Apache Spark)"},
        }

        selected_targets = InitCommand.prompt_for_targets(available_targets)

        # Check if user quit the selection
        if not selected_targets:
            print(dim("\nConfiguration cancelled."))
            return None

        # Only the top-level "targets" key is replaced, so a shallow copy is
        # enough to leave the caller's template untouched.
        config = template_config.copy()
        config["targets"] = selected_targets

        print()  # Add spacing
        return config

    @staticmethod
    def create_directory_structure(config: Dict[str, Any], verbose: bool = False):
        """Create the basic directory structure based on config.

        For every target that declares an ``"output"`` path, the *parent* of
        that path is created if missing; the output directory itself is not
        created here.
        """
        directories_to_create = [
            Path(target_config["output"]).parent
            for target_config in config.get("targets", {}).values()
            if "output" in target_config
        ]

        for directory in directories_to_create:
            if directory.exists():
                continue
            directory.mkdir(parents=True, exist_ok=True)
            if verbose:
                print(f"Created directory: {directory}")

    @staticmethod
    def setup_shell_completion(verbose: bool = False) -> bool:
        """Set up shell completion for datacompose commands. Returns True if successful.

        Appends an argcomplete registration line to the rc file of the shell
        named by ``$SHELL`` (bash or zsh), after writing a backup copy next to
        it.  Returns ``True`` when the line was added or is already present,
        ``False`` on any failure; diagnostics are printed only when *verbose*.
        """

        def note(message: str) -> None:
            # Diagnostics are opt-in so the default run stays quiet.
            if verbose:
                print(dim(message))

        try:
            # Detect current shell
            shell = os.environ.get("SHELL", "").lower()

            if "bash" in shell:
                config_file = Path.home() / ".bashrc"
                # Also check .bash_profile as fallback
                if not config_file.exists():
                    config_file = Path.home() / ".bash_profile"
            elif "zsh" in shell:
                config_file = Path.home() / ".zshrc"
            else:
                note(f"Shell not detected or not supported: {shell}")
                note("Supported shells: bash, zsh")
                return False

            completion_line = 'eval "$(register-python-argcomplete datacompose)"'

            if not config_file.exists():
                note(f"Shell config file not found: {config_file}")
                return False

            # Read existing config
            try:
                with open(config_file, "r") as f:
                    content = f.read()
            except PermissionError:
                note(f"Permission denied reading: {config_file}")
                return False

            # Already configured?  The command substring also matches the
            # full completion line, so a single check covers both spellings.
            if "register-python-argcomplete datacompose" in content:
                if verbose:
                    print(success("✓ Shell completion already configured"))
                return True

            # Create backup (best effort: a failed backup does not abort)
            backup_file = config_file.with_suffix(config_file.suffix + ".datacompose-backup")
            try:
                with open(backup_file, "w") as f:
                    f.write(content)
                note(f"Created backup: {backup_file}")
            except PermissionError:
                note("Warning: Could not create backup file")

            # Add completion line
            try:
                with open(config_file, "a") as f:
                    f.write(f"\n# Datacompose CLI completion\n{completion_line}\n")
            except PermissionError:
                note(f"Permission denied writing to: {config_file}")
                return False

            print(success(f"✓ Added tab completion to {config_file}"))
            print(
                info(
                    f"Run 'source {config_file}' or restart your terminal to enable completion"
                )
            )
            return True

        except Exception as e:
            # Completion is a convenience; never let it fail the init command.
            note(f"Completion setup failed: {e}")
            return False

    @staticmethod
    def prompt_completion_setup(verbose: bool = False) -> bool:
        """Prompt user to set up shell completion and do it if they agree.

        Returns ``True`` only when completion was (or already is) configured.
        Ctrl-C / EOF at the prompt is treated as "skip".
        """
        try:
            print()  # Add some spacing
            response = (
                input(highlight("Set up tab completion for datacompose commands? (Y/n): "))
                .strip()
                .lower()
            )

            if response not in ("", "y", "yes"):
                print(dim("Skipped shell completion setup"))
                print(dim("You can set it up later with:"))
                print(
                    dim(
                        " echo 'eval \"$(register-python-argcomplete datacompose)\"' >> ~/.bashrc"
                    )
                )
                return False

            success_setup = InitCommand.setup_shell_completion(verbose)
            if not success_setup:
                print()
                print(dim("Manual setup instructions:"))
                print(
                    dim(
                        " bash: echo 'eval \"$(register-python-argcomplete datacompose)\"' >> ~/.bashrc"
                    )
                )
                print(
                    dim(
                        " zsh: echo 'eval \"$(register-python-argcomplete datacompose)\"' >> ~/.zshrc"
                    )
                )
            return success_setup

        except (KeyboardInterrupt, EOFError):
            print(dim("\nSkipped shell completion setup"))
            return False
348
+
349
+
350
def _print_init_summary(config_path, verbose, yes, skip_completion, completion_setup) -> None:
    """Print the post-init status lines and suggested next steps."""
    add_command = "datacompose add clean_emails --target pyspark"
    manual_completion = (
        "echo 'eval \"$(register-python-argcomplete datacompose)\"' >> ~/.bashrc"
    )

    if verbose:
        print(success("✓ Used template: default"))
        print(success("✓ Created directory structure"))
        if completion_setup:
            print(success("✓ Shell completion configured"))
        print(highlight("\nNext steps:"))
        # Point at the file that was actually written (honours --output).
        print(f"1. Review the configuration in {config_path}")
        if completion_setup:
            print("2. Source your shell config or restart terminal for tab completion")
            print(f"3. Add your first transformer: {add_command}")
        else:
            print(f"2. Add your first transformer: {add_command}")
            if not skip_completion:
                # Numbered consecutively after step 2.
                print(f"3. Set up tab completion: {manual_completion}")
        return

    print(success("✓ Directory structure created"))
    if completion_setup:
        print(success("✓ Tab completion configured"))
    print(highlight(f"\nRun '{add_command}' to get started"))
    if completion_setup:
        print(
            dim("Restart your terminal or run 'source ~/.bashrc' to enable tab completion")
        )
    elif not skip_completion and not yes:
        print(dim(f"Tip: Set up tab completion with: {manual_completion}"))


def _run_init(force, output, verbose, yes, skip_completion) -> int:
    """Execute the init command.

    Writes the project configuration to *output* (refusing to overwrite an
    existing file unless *force*), optionally offers shell-completion setup,
    creates the directory structure and prints a summary.

    Returns a process exit status: ``0`` on success or when the user cancels
    the interactive prompt, ``1`` when the file already exists or any step
    fails.
    """
    config_path = Path(output)

    # Check if config already exists
    if config_path.exists() and not force:
        print(error(f"Configuration file already exists: {config_path}"))
        print(dim("Use --force to overwrite"))
        return 1

    try:
        template_config = InitCommand.get_config_template("default")

        # Either prompt for interactive configuration or use defaults
        if yes:
            config = template_config
            print("Using default configuration...")
        else:
            config = InitCommand.prompt_for_config(template_config)
            if config is None:
                # User cancelled: nothing written, not an error.
                return 0

        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)

        print(success(f"✓ Configuration initialized: {config_path}"))

        # Shell completion is only offered in interactive mode and when the
        # user has not opted out.
        completion_setup = False
        if not yes and not skip_completion:
            completion_setup = InitCommand.prompt_completion_setup(verbose)
        elif skip_completion and verbose:
            print(dim("Skipped shell completion setup (--skip-completion)"))
        elif yes and verbose:
            print(dim("Skipped shell completion setup (non-interactive mode)"))

        InitCommand.create_directory_structure(config, verbose)

        _print_init_summary(config_path, verbose, yes, skip_completion, completion_setup)
        return 0

    except Exception as e:
        # Top-level boundary of the command: report and convert to exit code.
        print(error(f"Init failed: {e}"))
        if verbose:
            import traceback

            traceback.print_exc()
        return 1
@@ -0,0 +1,118 @@
1
+ """
2
+ List command for showing available targets and transformers.
3
+ """
4
+
5
+ import click
6
+
7
+ from datacompose.transformers.discovery import TransformerDiscovery
8
+
9
+
10
# Completion function for list items
def complete_list_items(ctx, param, incomplete):
    """Return shell-completion candidates for the ``list`` command's ITEM argument.

    *ctx* and *param* are supplied by click and are not used; *incomplete* is
    the partial word typed so far.  Every known item starting with that
    prefix is returned as a click ``CompletionItem``.
    """
    # Import the submodule explicitly instead of reaching for it as an
    # attribute of ``click``: a plain ``import click`` does not guarantee that
    # ``click.shell_completion`` has been loaded.
    from click.shell_completion import CompletionItem

    items = ("targets", "transformers", "generators")
    return [CompletionItem(item) for item in items if item.startswith(incomplete)]
19
+
20
+
21
@click.command(name="list")
@click.argument(
    "item",
    type=click.Choice(["targets", "transformers", "generators"]),
    shell_complete=complete_list_items,
)
@click.pass_context
def list_cmd(ctx, item):
    """List available targets, transformers, or generators.

    ITEM: What to list: targets, transformers, or generators
    """
    # The docstring above doubles as the click help text; _run_list does the
    # work and reports its outcome as a process exit status.
    status = _run_list(item)
    if status:
        ctx.exit(status)
36
+
37
+
38
def _run_list(item) -> int:
    """Dispatch the list command for *item* and return its exit status.

    ``"targets"`` and ``"generators"`` both show the generator listing;
    ``"transformers"`` shows the transformer listing.  Any other value prints
    a message and yields ``1``.
    """
    discovery = TransformerDiscovery()

    handlers = {
        "transformers": ListCommand._list_transformers,
        "generators": ListCommand._list_generators,
        "targets": ListCommand._list_generators,
    }
    handler = handlers.get(item)
    if handler is None:
        print(f"Unknown item: {item}")
        return 1
    return handler(discovery)
51
+
52
+
53
class ListCommand:
    """Command to list available targets and transformers.

    Each helper prints its listing to stdout and returns a process exit
    status (always ``0`` in the visible code paths).
    """

    @staticmethod
    def _list_targets() -> int:
        """List available target platforms."""
        # Use the package-qualified path like the rest of the CLI does;
        # the wheel ships a single top-level package, so a bare ``cli``
        # import cannot resolve once installed.
        # NOTE(review): assumes AddCommand exposes an AVAILABLE_TARGETS
        # mapping - add.py is not visible here, confirm before wiring this in.
        from datacompose.cli.commands.add import AddCommand

        print(" Available targets:")
        for target in AddCommand.AVAILABLE_TARGETS.keys():
            print(f" • {target}")

        print("\n💡 Use 'datacompose add <transformer> --target <target>' to generate UDFs")
        return 0

    @staticmethod
    def _list_transformers(discovery: TransformerDiscovery) -> int:
        """List available transformers by domain.

        The domain is taken from the grandparent directory name of each
        transformer path; paths whose grandparent is ``transformers`` itself
        are grouped under ``legacy``.
        """
        transformers = discovery.discover_transformers()

        if not transformers:
            print(" No transformers found.")
            return 0

        print(" Available transformers:")

        # Group transformers by domain (extracted from path)
        domains = {}
        for transformer_name, transformer_path in transformers.items():
            grandparent = transformer_path.parent.parent.name
            domain = grandparent if grandparent != "transformers" else "legacy"
            domains.setdefault(domain, {})[transformer_name] = transformer_path

        for domain, domain_transformers in sorted(domains.items()):
            print(f"\n {domain}/")
            # Only the names are displayed, so sort and iterate the keys.
            for transformer_name in sorted(domain_transformers):
                print(f" • {transformer_name}")

        print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
        print("Example: datacompose add clean_emails --target pyspark")
        return 0

    @staticmethod
    def _list_generators(discovery: TransformerDiscovery) -> int:
        """List available generators by platform."""
        generators = discovery.discover_generators()

        if not generators:
            print(" No generators found.")
            return 0

        print(" Available generators:")
        for platform, platform_generators in sorted(generators.items()):
            print(f"\n {platform}/")
            for gen_type, gen_class in sorted(platform_generators.items()):
                print(f" • {gen_type} ({gen_class.__name__})")

        print("\nUsage: datacompose add <transformer> --target <platform> [--type <type>]")
        print("Example: datacompose add clean_emails --target pyspark")
        return 0
@@ -0,0 +1,7 @@
1
+ """Upgrade command for upgrading a transformer to a new version."""
2
+
3
+
4
class UpgradeCommand:
    """Command to upgrade a transformer to a new version."""

    # Placeholder: no behaviour is implemented yet.
    pass
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python3
2
+ # PYTHON_ARGCOMPLETE_OK
3
+ """
4
+ Main CLI entry point for Datacompose.
5
+ """
6
+
7
+ import click
8
+ import sys
9
+
10
+ # Import argcomplete for tab completion
11
+ try:
12
+ import argcomplete
13
+ except ImportError:
14
+ argcomplete = None
15
+
16
+ from datacompose.cli.commands.add import add
17
+ from datacompose.cli.commands.init import init
18
+ from datacompose.cli.commands.list import list_cmd
19
+
20
+
21
# The reported version is resolved from the installed distribution's metadata
# rather than a literal, so `datacompose --version` cannot drift from the
# released package version.
@click.group()
@click.version_option(package_name="datacompose", prog_name="datacompose")
@click.pass_context
def cli(ctx):
    """Generate data cleaning UDFs for various platforms.

    Examples:
    datacompose init
    datacompose add clean_emails --target pyspark
    datacompose add clean_emails --target snowflake --output sql/udfs/
    datacompose list targets
    """
    # Group callback: sub-commands carry all behaviour, nothing to do here.
34
+
35
+
36
# Register every sub-command on the main CLI group, in display order.
for _command in (init, add, list_cmd):
    cli.add_command(_command)
40
+
41
+
42
def main():
    """Run the Datacompose CLI, exiting with status 1 on interrupt or error."""
    # argcomplete is optional (bound to None when the import failed above).
    if argcomplete:
        argcomplete.autocomplete(cli)

    failure = None
    try:
        cli()
    except KeyboardInterrupt:
        failure = "\nOperation cancelled by user"
    except Exception as exc:
        failure = f"Error: {exc}"

    if failure is not None:
        click.echo(failure, err=True)
        sys.exit(1)


if __name__ == "__main__":
    main()