datacompose 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. Click here for more details.

Files changed (31)
  1. datacompose/__init__.py +1 -0
  2. datacompose/cli/__init__.py +5 -0
  3. datacompose/cli/colors.py +80 -0
  4. datacompose/cli/commands/__init__.py +3 -0
  5. datacompose/cli/commands/add.py +215 -0
  6. datacompose/cli/commands/init.py +451 -0
  7. datacompose/cli/commands/list.py +118 -0
  8. datacompose/cli/commands/upgrade.py +7 -0
  9. datacompose/cli/main.py +59 -0
  10. datacompose/cli/validation.py +72 -0
  11. datacompose/generators/__init__.py +3 -0
  12. datacompose/generators/base.py +193 -0
  13. datacompose/generators/pyspark/__init__.py +1 -0
  14. datacompose/generators/pyspark/generator.py +51 -0
  15. datacompose/operators/__init__.py +21 -0
  16. datacompose/operators/primitives.py +595 -0
  17. datacompose/transformers/__init__.py +0 -0
  18. datacompose/transformers/discovery.py +186 -0
  19. datacompose/transformers/text/__init__.py +1 -0
  20. datacompose/transformers/text/clean_addresses/__init__.py +1 -0
  21. datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
  22. datacompose/transformers/text/clean_emails/__init__.py +1 -0
  23. datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
  24. datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
  25. datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
  26. datacompose-0.2.4.dist-info/METADATA +431 -0
  27. datacompose-0.2.4.dist-info/RECORD +31 -0
  28. datacompose-0.2.4.dist-info/WHEEL +5 -0
  29. datacompose-0.2.4.dist-info/entry_points.txt +2 -0
  30. datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
  31. datacompose-0.2.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,595 @@
1
+ """
2
+ datacompose/operators/primitives.py
3
+ =====================================
4
+ Simple and elegant compose decorator framework for building data pipelines.
5
+ """
6
+
7
+ import ast
8
+ import inspect
9
+ import logging
10
+ import textwrap
11
+ from dataclasses import dataclass
12
+ from functools import wraps
13
+ from typing import Any, Callable, Dict, List, Optional, Sequence
14
+
15
+ # Set up module logger
16
+ logger = logging.getLogger(__name__)
17
+
18
+ try:
19
+ from pyspark.sql import Column # type: ignore
20
+ except ImportError:
21
+ pass
22
+
23
+
24
class SmartPrimitive:
    """Wraps a PySpark column transformation function to enable partial application.

    SmartPrimitive allows column transformation functions to be:
    1. Called directly with a column: ``primitive(col)``
    2. Pre-configured with parameters: ``primitive(param=value)`` returns a
       configured one-argument function

    This enables building reusable, parameterized transformations that can be
    composed into data pipelines.

    Example:
        >>> def trim_spaces(col, chars=' '):
        ...     return f.trim(col, chars)
        >>>
        >>> trim = SmartPrimitive(trim_spaces)
        >>>
        >>> # Direct usage
        >>> df.select(trim(f.col("text")))
        >>>
        >>> # Pre-configured usage
        >>> trim_tabs = trim(chars='\\t')
        >>> df.select(trim_tabs(f.col("text")))

    Please note that you will not use this directly. It will be used in the
    PrimitiveRegistry class.
    """

    def __init__(self, func: Callable, name: Optional[str] = None):
        """Initialize a SmartPrimitive.

        Args:
            func: The column transformation function to wrap
            name: Optional name for the primitive (defaults to func.__name__)
        """
        self.func = func
        self.name = name or func.__name__
        # Surface the wrapped function's docstring on the primitive itself.
        self.__doc__ = func.__doc__

    # NOTE: the ``Column`` annotations are strings on purpose. The module's
    # pyspark import is wrapped in try/except ImportError, so an unquoted
    # annotation would be evaluated at class-definition time and raise
    # NameError whenever pyspark is not installed.
    def __call__(self, col: Optional["Column"] = None, **kwargs):
        """Apply the transformation or return a configured version.

        Args:
            col: Optional PySpark Column to transform. If provided, applies the
                transformation immediately. If None, returns a configured function.
            **kwargs: Parameters to pass to the transformation function

        Returns:
            If col is provided: the transformed Column.
            If col is None: a configured one-argument function that takes a Column.
        """
        if col is not None:
            return self.func(col, **kwargs)

        @wraps(self.func)
        def configured(col: "Column"):
            return self.func(col, **kwargs)

        # Give the configured function a descriptive name for debugging and
        # pipeline logs, e.g. "trim(chars='\t')".
        configured.__name__ = (
            f"{self.name}({', '.join(f'{k}={v}' for k, v in kwargs.items())})"
        )
        return configured
86
+
87
+
88
class PrimitiveRegistry:
    """Container for organizing related column transformation primitives.

    PrimitiveRegistry groups related SmartPrimitive transformations under a common
    namespace, making them accessible as attributes. This provides a clean API for
    organizing and accessing transformation functions.

    Example:
        >>> # Create a registry for string operations
        >>> string = PrimitiveRegistry("string")
        >>>
        >>> # Register transformations
        >>> @string.register()
        >>> def lowercase(col):
        ...     return f.lower(col)
        >>>
        >>> @string.register()
        >>> def trim(col, chars=' '):
        ...     return f.trim(col, chars)
        >>>
        >>> # Use the transformations
        >>> df.select(string.lowercase(f.col("text")))
        >>> df.select(string.trim(chars='\\t')(f.col("text")))
    """

    def __init__(self, namespace_name: str):
        """Initialize a PrimitiveRegistry.

        Args:
            namespace_name: Name for this namespace (used in error messages)
        """
        self.namespace_name = namespace_name
        self._primitives = {}
        self._conditionals = {}

    def register(self, name: Optional[str] = None, is_conditional: bool = False):
        """Decorator to register a function as a SmartPrimitive in this namespace.

        Args:
            name: Optional name for the primitive (defaults to function name)
            is_conditional: If True, the primitive is tracked in the separate
                conditionals table (used for condition expressions in compose)

        Returns:
            Decorator function that wraps the target function as a SmartPrimitive
            and returns the original function unchanged.

        Example:
            >>> ns = PrimitiveRegistry("text")
            >>> @ns.register()
            >>> def clean(col):
            ...     return f.trim(f.lower(col))
        """

        def decorator(func: Callable):
            primitive_name = name or func.__name__
            target = self._conditionals if is_conditional else self._primitives
            target[primitive_name] = SmartPrimitive(func, primitive_name)
            # Also expose the primitive as a plain attribute for fast access.
            setattr(self, primitive_name, target[primitive_name])
            return func

        return decorator

    def __getattr__(self, name):
        # Read the tables through __dict__ instead of attribute access:
        # touching self._primitives here would re-enter __getattr__ and
        # recurse infinitely when the instance dict is not populated yet
        # (e.g. during unpickling or copy.copy).
        primitives = self.__dict__.get("_primitives", {})
        conditionals = self.__dict__.get("_conditionals", {})
        if name in primitives:
            return primitives[name]
        if name in conditionals:
            return conditionals[name]
        namespace = self.__dict__.get("namespace_name", "<unnamed>")
        raise AttributeError(f"No primitive '{name}' in {namespace}")

    def compose(
        self,
        func: Optional[Callable] = None,
        *,
        debug: bool = False,
        steps: Optional[list] = None,
        **namespaces,
    ):
        """Decorator that converts a function body into a composed transformation pipeline.

        The compose decorator analyzes the AST of a function and extracts a sequence of
        transformation calls, creating a pipeline that applies them in order. This allows
        declarative pipeline definitions using natural function call syntax.

        Args:
            func: Function to convert into a pipeline
            debug: If True, logs each transformation as it's applied
            steps: Optional list of pre-configured steps (bypasses AST parsing)
            **namespaces: Namespace objects to use for resolving transformations

        Returns:
            A composed function that applies all transformations in sequence

        Example:
            >>> @compose(string=string)
            >>> def clean_text():
            ...     string.trim()
            ...     string.lowercase()
            >>>
            >>> df.select(clean_text(f.col("text")))
        """

        def decorator(func: Callable):
            # Explicit steps bypass AST parsing entirely (old style).
            if steps is not None:
                # Quoted annotations: Column may be undefined without pyspark,
                # and this def executes at decoration time.
                def pipeline(col: "Column") -> "Column":
                    result = col
                    for step in steps:
                        result = step(result)
                    return result

                pipeline.__name__ = func.__name__
                pipeline.__doc__ = func.__doc__
                return pipeline

            # Preferred path: compile the function body (supports if/else).
            try:
                compiler = PipelineCompiler(namespaces, debug)
                pipeline = compiler.compile(func)

                if debug and pipeline.steps:
                    logger.info(
                        f"Successfully compiled '{func.__name__}' with {len(pipeline.steps)} steps"
                    )

                return pipeline
            except Exception as e:
                logger.warning(
                    f"Advanced compilation failed for '{func.__name__}': {e}. "
                    f"Falling back to sequential extraction."
                )
                if debug:
                    logger.debug("Compilation error details:", exc_info=True)

                # Fallback: extract just the sequential function calls.
                # This maintains backward compatibility.
                return _fallback_compose(func, namespaces, debug)

        if func is None:
            # Called with arguments: @compose(debug=True, email=email_namespace)
            return decorator
        else:
            # Called without arguments: @compose
            return decorator(func)
249
+
250
+
251
+ def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable:
252
+ """Fallback for when compilation fails - extracts sequential calls only"""
253
+ try:
254
+ source = inspect.getsource(func)
255
+ source = textwrap.dedent(source)
256
+ tree = ast.parse(source)
257
+ func_def = tree.body[0]
258
+
259
+ # Extract only simple function calls (old behavior)
260
+ steps = []
261
+ if isinstance(func_def, ast.FunctionDef):
262
+ for node in func_def.body:
263
+ if isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
264
+ if isinstance(node.value.func, ast.Attribute):
265
+ namespace_name = (
266
+ node.value.func.value.id
267
+ if isinstance(node.value.func.value, ast.Name)
268
+ else None
269
+ )
270
+ method_name = node.value.func.attr
271
+ namespace = (
272
+ namespaces.get(namespace_name) if namespace_name else None
273
+ ) or (globals().get(namespace_name) if namespace_name else None)
274
+ if namespace and hasattr(namespace, method_name):
275
+ method = getattr(namespace, method_name)
276
+
277
+ kwargs = {}
278
+ for keyword in node.value.keywords:
279
+ try:
280
+ kwargs[keyword.arg] = ast.literal_eval(
281
+ keyword.value
282
+ )
283
+ except Exception:
284
+ pass
285
+
286
+ steps.append(method(**kwargs) if kwargs else method)
287
+
288
+ def pipeline(col: Column) -> Column: # type: ignore
289
+ result = col # type: ignore
290
+ for step in steps:
291
+ if debug:
292
+ logger.debug(f"Executing step: {getattr(step, '__name__', step)}")
293
+ result = step(result) # type: ignore
294
+ return result
295
+
296
+ pipeline.__name__ = func.__name__
297
+ pipeline.__doc__ = func.__doc__
298
+ return pipeline
299
+
300
+ except Exception as e:
301
+ logger.error(
302
+ f"Failed to create pipeline for '{func.__name__}': {e}. "
303
+ f"Returning identity function."
304
+ )
305
+
306
+ # Ultimate fallback - return identity function
307
+ def pipeline(col: Column) -> Column: # type: ignore
308
+ return col # type: ignore
309
+
310
+ pipeline.__name__ = func.__name__
311
+ pipeline.__doc__ = f"Failed to compile {func.__name__}"
312
+ return pipeline
313
+
314
+
315
# Re-import PySpark symbols for the compiler/runtime section below; ``F`` is
# used by StablePipeline to build when/otherwise expressions. The failure is
# tolerated so this module stays importable without PySpark installed.
try:
    from pyspark.sql import Column
    from pyspark.sql import functions as F
except ImportError:
    logging.debug("PySpark not available")

# Set up module logger (re-binding is harmless: logging.getLogger returns
# the same logger object for a given name)
logger = logging.getLogger(__name__)
323
+
324
+
325
@dataclass
class CompiledStep:
    """A compiled pipeline step"""

    step_type: str
    action: Optional[Callable] = None
    condition: Optional[Callable] = None
    then_branch: Optional[List["CompiledStep"]] = None
    else_branch: Optional[List["CompiledStep"]] = None

    def __post_init__(self):
        """Run validation as soon as the dataclass is constructed"""
        self.validate()

    def validate(self):
        """Check that this step's fields are consistent with its step_type"""
        valid_types = {"transform", "conditional"}
        if self.step_type not in valid_types:
            raise ValueError(
                f"Invalid step_type '{self.step_type}'. "
                f"Must be one of {valid_types}"
            )
        if self.step_type == "transform":
            self._validate_transform()
        else:
            self._validate_conditional()

    def _validate_transform(self):
        # A transform carries exactly one callable action; other fields are
        # not used at runtime, so flag them if set.
        if not callable(self.action):
            raise ValueError(
                f"Transform step requires a callable action, "
                f"got {type(self.action).__name__}"
            )
        if self.condition is not None:
            logger.warning("Transform step has condition which will be ignored")
        if self.then_branch is not None or self.else_branch is not None:
            logger.warning("Transform step has branches which will be ignored")

    def _validate_conditional(self):
        # A conditional needs a callable condition plus at least a then-branch
        # of nested CompiledStep instances.
        if not callable(self.condition):
            raise ValueError(
                f"Conditional step requires a callable condition, "
                f"got {type(self.condition).__name__ if self.condition else 'None'}"
            )
        if not self.then_branch:
            raise ValueError("Conditional step requires at least a then_branch")
        if self.action is not None:
            logger.warning("Conditional step has action which will be ignored")

        # Validate nested steps in both branches.
        branches = (
            ("then_branch", self.then_branch),
            ("else_branch", self.else_branch or []),
        )
        for branch_name, branch in branches:
            for nested in branch:
                if not isinstance(nested, CompiledStep):
                    raise TypeError(
                        f"{branch_name} must contain CompiledStep instances, "
                        f"got {type(nested).__name__}"
                    )
386
+
387
+
388
class StablePipeline:
    """Stable runtime pipeline executor.

    Holds a list of CompiledStep objects and applies them in order when
    called with a PySpark Column. Conditional steps build both branch
    expressions eagerly and combine them with ``F.when(...).otherwise(...)``.

    NOTE: the ``Column``/``CompiledStep`` annotations are quoted so this
    class can be defined even when pyspark is absent (the module imports
    pyspark inside try/except ImportError).
    """

    def __init__(self, steps: Optional[List["CompiledStep"]] = None, debug=False):
        """Initialize the pipeline.

        Args:
            steps: Compiled steps to execute in order (defaults to empty).
            debug: If True, logs each step as it executes.

        Raises:
            TypeError: If any element of steps is not a CompiledStep.
        """
        self.steps = steps or []
        self.debug = debug
        # Mimic a plain function so callers can introspect the pipeline as if
        # it were the decorated function.
        self.__name__ = "pipeline"
        self.__doc__ = "Compiled pipeline"
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Fail fast on malformed pipelines rather than at execution time.
        self._validate_pipeline()

    def __call__(self, col: "Column") -> "Column":
        """Execute the pipeline"""
        return self._execute_steps(self.steps, col)

    def _execute_steps(self, steps: List["CompiledStep"], col: "Column") -> "Column":
        """Apply steps sequentially, recursing into conditional branches."""
        result = col

        for step in steps:
            if self.debug:
                step_name = (
                    getattr(step.action, "__name__", step.step_type)
                    if step.action
                    else step.step_type
                )
                self.logger.debug(f"Executing step: {step_name}")

            if step.step_type == "transform":
                if callable(step.action):
                    result = step.action(result)

            elif step.step_type == "conditional":
                if step.then_branch:
                    # Both branch expressions are computed over the current
                    # result; the condition only selects per row which
                    # expression applies.
                    then_result = self._execute_steps(step.then_branch, result)
                    otherwise = (
                        self._execute_steps(step.else_branch, result)
                        if step.else_branch
                        else result
                    )
                    result = F.when(step.condition(result), then_result).otherwise(
                        otherwise
                    )

        return result

    def _validate_pipeline(self):
        """Validate all steps in the pipeline.

        Only type-checks the step list here; per-step field validation
        happens in CompiledStep.__post_init__.
        """
        if not self.steps:
            self.logger.debug("Empty pipeline - no steps to validate")
            return

        for i, step in enumerate(self.steps):
            if not isinstance(step, CompiledStep):
                raise TypeError(
                    f"Pipeline step {i} must be a CompiledStep instance, "
                    f"got {type(step).__name__}"
                )

        self.logger.debug(f"Pipeline validated with {len(self.steps)} steps")
452
+
453
+
454
class PipelineCompiler:
    """Compiles a pipeline-definition function's AST into a StablePipeline.

    Top-level ``namespace.method(...)`` expression statements become transform
    steps; ``if``/``else`` statements become conditional steps. Anything that
    cannot be resolved is skipped, and a total compile failure yields an
    empty pipeline rather than an exception.
    """

    def __init__(self, namespaces: Dict[str, Any], debug: bool = False):
        """
        Args:
            namespaces: Mapping of namespace name -> registry object used to
                resolve ``namespace.method`` references (module globals are
                consulted as a secondary lookup).
            debug: If True, logs compilation diagnostics.
        """
        self.namespaces = namespaces
        self.debug = debug

    def compile(self, func: Callable) -> "StablePipeline":
        """Parse *func*'s source and return the compiled pipeline.

        Returns an empty StablePipeline (with a warning) if anything goes
        wrong, so callers never have to handle compile exceptions.
        """
        try:
            source = textwrap.dedent(inspect.getsource(func))
            tree = ast.parse(source)
            func_def = tree.body[0]

            # Ensure we actually parsed a function definition.
            if not isinstance(func_def, ast.FunctionDef):
                raise ValueError(f"Expected FunctionDef, got {type(func_def).__name__}")

            steps = self._compile_body(func_def.body)
            pipeline = StablePipeline(steps, self.debug)
            pipeline.__name__ = func.__name__
            pipeline.__doc__ = func.__doc__

            return pipeline

        except Exception as e:
            logger.warning(
                f"Failed to compile '{func.__name__}': {e}. "
                f"Creating empty pipeline as fallback."
            )
            if self.debug:
                logger.debug(f"Compilation error details: {e}", exc_info=True)
            # Return empty pipeline on failure.
            return StablePipeline([], self.debug)

    def _compile_body(self, nodes: Sequence[ast.AST]) -> List["CompiledStep"]:
        """Compile AST statements to steps, silently skipping unsupported ones."""
        steps = []

        for node in nodes:
            step = None
            if isinstance(node, ast.If):
                step = self._compile_if(node)
            elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
                step = self._compile_call(node.value)
            if step:
                steps.append(step)

        return steps

    def _compile_if(self, node: ast.If) -> Optional["CompiledStep"]:
        """Compile an if/else statement into a conditional step (or None)."""
        condition = self._compile_condition(node.test)
        then_branch = self._compile_body(node.body)
        else_branch = self._compile_body(node.orelse) if node.orelse else None

        try:
            return CompiledStep(
                step_type="conditional",
                condition=condition,
                then_branch=then_branch,
                else_branch=else_branch,
            )
        except (ValueError, TypeError) as e:
            logger.warning(f"Failed to compile conditional: {e}")
            if self.debug:
                logger.debug("Conditional compilation error details:", exc_info=True)
            return None

    def _resolve_method(self, node: ast.Call) -> Optional[Callable]:
        """Resolve a ``namespace.method(...)`` call into a configured callable.

        Shared by _compile_condition and _compile_call (previously duplicated).
        Returns None when the call cannot be resolved against the registered
        namespaces or module globals.
        """
        if not isinstance(node.func, ast.Attribute):
            return None

        namespace_name = (
            node.func.value.id if isinstance(node.func.value, ast.Name) else None
        )
        method_name = node.func.attr

        namespace = (
            self.namespaces.get(namespace_name) if namespace_name else None
        ) or (globals().get(namespace_name) if namespace_name else None)
        if not (namespace and hasattr(namespace, method_name)):
            return None

        method = getattr(namespace, method_name)

        kwargs = {}
        for keyword in node.keywords:
            if keyword.arg is None:
                # ``**kwargs`` unpacking cannot be resolved statically; a None
                # key would make method(**kwargs) raise TypeError and abort
                # the whole compilation.
                continue
            kwargs[keyword.arg] = self._get_value(keyword.value)

        return method(**kwargs) if kwargs else method

    def _compile_condition(self, node: ast.AST) -> Callable:
        """Compile a condition expression; defaults to always-true."""
        if isinstance(node, ast.Call):
            method = self._resolve_method(node)
            if method is not None:
                return method

        return lambda col: True

    def _compile_call(self, node: ast.Call) -> Optional["CompiledStep"]:
        """Compile a function call into a transform step (or None)."""
        action = self._resolve_method(node)
        if action is None:
            return None

        try:
            return CompiledStep(step_type="transform", action=action)
        except (ValueError, TypeError) as e:
            logger.warning(f"Failed to compile transform: {e}")
            if self.debug:
                logger.debug(
                    "Transform compilation error details:", exc_info=True
                )
            return None

    def _get_value(self, node: ast.AST) -> Any:
        """Extract a literal value from an AST node, or None if not literal."""
        if isinstance(node, ast.Constant):
            return node.value
        # Python 3.7 compatibility - handle legacy literal nodes
        elif hasattr(ast, "Num") and isinstance(node, (ast.Num, ast.Str, ast.Bytes, ast.NameConstant)):  # type: ignore
            return node.value  # type: ignore
        else:
            try:
                return ast.literal_eval(node)
            except Exception as e:
                logger.debug(f"Failed to extract value from AST node: {e}")
                return None
590
+
591
+
592
# Explicit public API for ``from datacompose.operators.primitives import *``:
# only the primitive wrapper and the registry are re-exported; the pipeline
# compiler/runtime classes defined above are not listed here.
__all__ = [
    "SmartPrimitive",
    "PrimitiveRegistry",
]
File without changes