datacompose 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic; see the registry's advisory page for more details.
- datacompose/__init__.py +1 -0
- datacompose/cli/__init__.py +5 -0
- datacompose/cli/colors.py +80 -0
- datacompose/cli/commands/__init__.py +3 -0
- datacompose/cli/commands/add.py +215 -0
- datacompose/cli/commands/init.py +451 -0
- datacompose/cli/commands/list.py +118 -0
- datacompose/cli/commands/upgrade.py +7 -0
- datacompose/cli/main.py +59 -0
- datacompose/cli/validation.py +72 -0
- datacompose/generators/__init__.py +3 -0
- datacompose/generators/base.py +193 -0
- datacompose/generators/pyspark/__init__.py +1 -0
- datacompose/generators/pyspark/generator.py +51 -0
- datacompose/operators/__init__.py +21 -0
- datacompose/operators/primitives.py +595 -0
- datacompose/transformers/__init__.py +0 -0
- datacompose/transformers/discovery.py +186 -0
- datacompose/transformers/text/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/__init__.py +1 -0
- datacompose/transformers/text/clean_addresses/pyspark/pyspark_primitives.py +1967 -0
- datacompose/transformers/text/clean_emails/__init__.py +1 -0
- datacompose/transformers/text/clean_emails/pyspark/pyspark_primitives.py +781 -0
- datacompose/transformers/text/clean_phone_numbers/__init__.py +0 -0
- datacompose/transformers/text/clean_phone_numbers/pyspark/pyspark_primitives.py +941 -0
- datacompose-0.2.4.dist-info/METADATA +431 -0
- datacompose-0.2.4.dist-info/RECORD +31 -0
- datacompose-0.2.4.dist-info/WHEEL +5 -0
- datacompose-0.2.4.dist-info/entry_points.txt +2 -0
- datacompose-0.2.4.dist-info/licenses/LICENSE +21 -0
- datacompose-0.2.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,595 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datacompose/operators/primitives.py
|
|
3
|
+
=====================================
|
|
4
|
+
Simple and elegant compose decorator framework for building data pipelines.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import ast
|
|
8
|
+
import inspect
|
|
9
|
+
import logging
|
|
10
|
+
import textwrap
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from functools import wraps
|
|
13
|
+
from typing import Any, Callable, Dict, List, Optional, Sequence
|
|
14
|
+
|
|
15
|
+
# Set up module logger
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
from pyspark.sql import Column # type: ignore
|
|
20
|
+
except ImportError:
|
|
21
|
+
pass
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SmartPrimitive:
    """Wraps a PySpark column transformation function to enable partial application.

    SmartPrimitive allows column transformation functions to be:
    1. Called directly with a column: ``primitive(col)``
    2. Pre-configured with parameters: ``primitive(param=value)`` returns a
       configured single-argument function.

    This enables building reusable, parameterized transformations that can be
    composed into data pipelines.

    Example:
        >>> def trim_spaces(col, chars=' '):
        ...     return f.trim(col, chars)
        >>>
        >>> trim = SmartPrimitive(trim_spaces)
        >>>
        >>> # Direct usage
        >>> df.select(trim(f.col("text")))
        >>>
        >>> # Pre-configured usage
        >>> trim_tabs = trim(chars='\t')
        >>> df.select(trim_tabs(f.col("text")))

    Please note that you will not use this directly. It will be used in the
    PrimitiveRegistry class.
    """

    def __init__(self, func: Callable, name: Optional[str] = None):
        """Initialize a SmartPrimitive.

        Args:
            func: The column transformation function to wrap.
            name: Optional name for the primitive (defaults to func.__name__).
        """
        self.func = func
        self.name = name or func.__name__
        # Surface the wrapped function's docstring on the primitive itself.
        self.__doc__ = func.__doc__

    # "Column" is quoted: the module imports it inside try/except, so the name
    # may be undefined when PySpark is absent; an unquoted annotation would
    # raise NameError at class-definition time.
    def __call__(self, col: Optional["Column"] = None, **kwargs):
        """Apply the transformation or return a configured version.

        Args:
            col: Optional PySpark Column to transform. If provided, applies the
                transformation immediately. If None, returns a configured
                function.
            **kwargs: Parameters to pass to the transformation function.

        Returns:
            If col is provided: the transformed Column.
            If col is None: a configured function that takes a Column.
        """
        if col is not None:
            return self.func(col, **kwargs)

        @wraps(self.func)
        def configured(col: "Column"):
            return self.func(col, **kwargs)

        # Encode the bound parameters into the name for debug readability,
        # e.g. "trim(chars=\t)".
        configured.__name__ = (
            f"{self.name}({', '.join(f'{k}={v}' for k, v in kwargs.items())})"
        )
        return configured
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class PrimitiveRegistry:
    """Container for organizing related column transformation primitives.

    PrimitiveRegistry groups related SmartPrimitive transformations under a
    common namespace, making them accessible as attributes. This provides a
    clean API for organizing and accessing transformation functions.

    Example:
        >>> # Create a registry for string operations
        >>> string = PrimitiveRegistry("string")
        >>>
        >>> # Register transformations
        >>> @string.register()
        >>> def lowercase(col):
        ...     return f.lower(col)
        >>>
        >>> @string.register()
        >>> def trim(col, chars=' '):
        ...     return f.trim(col, chars)
        >>>
        >>> # Use the transformations
        >>> df.select(string.lowercase(f.col("text")))
        >>> df.select(string.trim(chars='\t')(f.col("text")))
    """

    def __init__(self, namespace_name: str):
        """Initialize a PrimitiveRegistry.

        Args:
            namespace_name: Name for this namespace (used in error messages).
        """
        self.namespace_name = namespace_name
        self._primitives = {}     # name -> SmartPrimitive (regular transforms)
        self._conditionals = {}   # name -> SmartPrimitive (condition predicates)

    def register(self, name: Optional[str] = None, is_conditional: bool = False):
        """Decorator to register a function as a SmartPrimitive in this namespace.

        Args:
            name: Optional name for the primitive (defaults to function name).
            is_conditional: If True, store the primitive in the conditional
                registry (used by pipeline compilation for if-conditions).

        Returns:
            Decorator that wraps the target function as a SmartPrimitive and
            returns the original function unchanged.

        Example:
            >>> ns = PrimitiveRegistry("text")
            >>> @ns.register()
            >>> def clean(col):
            ...     return f.trim(f.lower(col))
        """

        def decorator(func: Callable):
            primitive_name = name or func.__name__
            registry = self._conditionals if is_conditional else self._primitives
            registry[primitive_name] = SmartPrimitive(func, primitive_name)
            # Also expose directly as an instance attribute for fast access.
            setattr(self, primitive_name, registry[primitive_name])
            return func

        return decorator

    def __getattr__(self, name):
        # Read registries via __dict__ instead of attribute access: __getattr__
        # can be invoked before __init__ has run (e.g. during copy/pickle), and
        # touching self._primitives here would recurse infinitely in that case.
        primitives = self.__dict__.get("_primitives", {})
        if name in primitives:
            return primitives[name]
        conditionals = self.__dict__.get("_conditionals", {})
        if name in conditionals:
            return conditionals[name]
        namespace = self.__dict__.get("namespace_name", type(self).__name__)
        raise AttributeError(f"No primitive '{name}' in {namespace}")

    def compose(
        self,
        func: Optional[Callable] = None,
        *,
        debug: bool = False,
        steps: Optional[list] = None,
        **namespaces,
    ):
        """Decorator that converts a function body into a composed pipeline.

        The compose decorator analyzes the AST of a function and extracts a
        sequence of transformation calls, creating a pipeline that applies
        them in order. This allows declarative pipeline definitions using
        natural function call syntax.

        Args:
            func: Function to convert into a pipeline (bare @compose usage).
            debug: If True, logs each transformation as it's applied.
            steps: Optional list of pre-configured steps (bypasses AST parsing).
            **namespaces: Namespace objects used for resolving transformations.

        Returns:
            A composed function that applies all transformations in sequence.

        Example:
            >>> @compose(string=string)
            >>> def clean_text():
            ...     string.trim()
            ...     string.lowercase()
            >>>
            >>> df.select(clean_text(f.col("text")))
        """

        def decorator(func: Callable):
            # Explicit steps bypass AST compilation entirely.
            if steps is not None:
                # "Column" quoted so this works when PySpark is not installed.
                def pipeline(col: "Column") -> "Column":
                    result = col
                    for step in steps:
                        result = step(result)
                    return result

                pipeline.__name__ = func.__name__
                pipeline.__doc__ = func.__doc__
                return pipeline

            # Preferred path: compile the function body into a StablePipeline.
            try:
                compiler = PipelineCompiler(namespaces, debug)
                pipeline = compiler.compile(func)

                if debug and pipeline.steps:
                    logger.info(
                        f"Successfully compiled '{func.__name__}' with {len(pipeline.steps)} steps"
                    )

                return pipeline
            except Exception as e:
                logger.warning(
                    f"Advanced compilation failed for '{func.__name__}': {e}. "
                    f"Falling back to sequential extraction."
                )
                if debug:
                    logger.debug("Compilation error details:", exc_info=True)

                # Fallback keeps backward compatibility: extract only the
                # simple sequential calls.
                return _fallback_compose(func, namespaces, debug)

        if func is None:
            # Called with arguments: @compose(debug=True, email=email_namespace)
            return decorator
        # Called bare: @compose
        return decorator(func)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _fallback_compose(func: Callable, namespaces: Dict, debug: bool) -> Callable:
|
|
252
|
+
"""Fallback for when compilation fails - extracts sequential calls only"""
|
|
253
|
+
try:
|
|
254
|
+
source = inspect.getsource(func)
|
|
255
|
+
source = textwrap.dedent(source)
|
|
256
|
+
tree = ast.parse(source)
|
|
257
|
+
func_def = tree.body[0]
|
|
258
|
+
|
|
259
|
+
# Extract only simple function calls (old behavior)
|
|
260
|
+
steps = []
|
|
261
|
+
if isinstance(func_def, ast.FunctionDef):
|
|
262
|
+
for node in func_def.body:
|
|
263
|
+
if isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
|
|
264
|
+
if isinstance(node.value.func, ast.Attribute):
|
|
265
|
+
namespace_name = (
|
|
266
|
+
node.value.func.value.id
|
|
267
|
+
if isinstance(node.value.func.value, ast.Name)
|
|
268
|
+
else None
|
|
269
|
+
)
|
|
270
|
+
method_name = node.value.func.attr
|
|
271
|
+
namespace = (
|
|
272
|
+
namespaces.get(namespace_name) if namespace_name else None
|
|
273
|
+
) or (globals().get(namespace_name) if namespace_name else None)
|
|
274
|
+
if namespace and hasattr(namespace, method_name):
|
|
275
|
+
method = getattr(namespace, method_name)
|
|
276
|
+
|
|
277
|
+
kwargs = {}
|
|
278
|
+
for keyword in node.value.keywords:
|
|
279
|
+
try:
|
|
280
|
+
kwargs[keyword.arg] = ast.literal_eval(
|
|
281
|
+
keyword.value
|
|
282
|
+
)
|
|
283
|
+
except Exception:
|
|
284
|
+
pass
|
|
285
|
+
|
|
286
|
+
steps.append(method(**kwargs) if kwargs else method)
|
|
287
|
+
|
|
288
|
+
def pipeline(col: Column) -> Column: # type: ignore
|
|
289
|
+
result = col # type: ignore
|
|
290
|
+
for step in steps:
|
|
291
|
+
if debug:
|
|
292
|
+
logger.debug(f"Executing step: {getattr(step, '__name__', step)}")
|
|
293
|
+
result = step(result) # type: ignore
|
|
294
|
+
return result
|
|
295
|
+
|
|
296
|
+
pipeline.__name__ = func.__name__
|
|
297
|
+
pipeline.__doc__ = func.__doc__
|
|
298
|
+
return pipeline
|
|
299
|
+
|
|
300
|
+
except Exception as e:
|
|
301
|
+
logger.error(
|
|
302
|
+
f"Failed to create pipeline for '{func.__name__}': {e}. "
|
|
303
|
+
f"Returning identity function."
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
# Ultimate fallback - return identity function
|
|
307
|
+
def pipeline(col: Column) -> Column: # type: ignore
|
|
308
|
+
return col # type: ignore
|
|
309
|
+
|
|
310
|
+
pipeline.__name__ = func.__name__
|
|
311
|
+
pipeline.__doc__ = f"Failed to compile {func.__name__}"
|
|
312
|
+
return pipeline
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
try:
|
|
316
|
+
from pyspark.sql import Column
|
|
317
|
+
from pyspark.sql import functions as F
|
|
318
|
+
except ImportError:
|
|
319
|
+
logging.debug("PySpark not available")
|
|
320
|
+
|
|
321
|
+
# Set up module logger
|
|
322
|
+
logger = logging.getLogger(__name__)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
@dataclass
class CompiledStep:
    """A compiled pipeline step.

    A step is either a plain ``"transform"`` (applies ``action``) or a
    ``"conditional"`` (evaluates ``condition`` and recurses into
    ``then_branch`` / ``else_branch``). Validation runs on construction.
    """

    step_type: str                        # "transform" or "conditional"
    action: Optional[Callable] = None     # used by transform steps
    condition: Optional[Callable] = None  # used by conditional steps
    then_branch: Optional[List["CompiledStep"]] = None
    else_branch: Optional[List["CompiledStep"]] = None

    def __post_init__(self):
        """Validate the compiled step after initialization."""
        self.validate()

    def validate(self):
        """Validate that the step is properly configured."""
        valid_types = {"transform", "conditional"}

        if self.step_type not in valid_types:
            raise ValueError(
                f"Invalid step_type '{self.step_type}'. "
                f"Must be one of {valid_types}"
            )

        if self.step_type == "transform":
            self._validate_transform()
        else:
            self._validate_conditional()

    def _validate_transform(self):
        """Check a transform step: needs an action; other fields are noise."""
        if not callable(self.action):
            raise ValueError(
                f"Transform step requires a callable action, "
                f"got {type(self.action).__name__}"
            )
        if self.condition is not None:
            logger.warning("Transform step has condition which will be ignored")
        if self.then_branch is not None or self.else_branch is not None:
            logger.warning("Transform step has branches which will be ignored")

    def _validate_conditional(self):
        """Check a conditional step: needs a condition and a then_branch."""
        if not callable(self.condition):
            raise ValueError(
                f"Conditional step requires a callable condition, "
                f"got {type(self.condition).__name__ if self.condition else 'None'}"
            )
        if not self.then_branch:
            raise ValueError("Conditional step requires at least a then_branch")
        if self.action is not None:
            logger.warning("Conditional step has action which will be ignored")

        # Both branches may only contain CompiledStep instances.
        self._check_branch(self.then_branch, "then_branch")
        if self.else_branch:
            self._check_branch(self.else_branch, "else_branch")

    @staticmethod
    def _check_branch(branch, label):
        """Reject any branch entry that is not a CompiledStep."""
        for entry in branch:
            if not isinstance(entry, CompiledStep):
                raise TypeError(
                    f"{label} must contain CompiledStep instances, "
                    f"got {type(entry).__name__}"
                )
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
class StablePipeline:
    """Stable runtime pipeline executor.

    Holds a validated list of CompiledStep objects and, when called with a
    column, applies them in order. Conditional steps are evaluated via
    ``F.when(...)`` so they compose into a single column expression.
    """

    # "CompiledStep" is quoted deliberately (cheap forward-ref style); the
    # class itself is defined earlier in this module.
    def __init__(self, steps: Optional[List["CompiledStep"]] = None, debug=False):
        """
        Args:
            steps: Compiled steps to execute in order (defaults to empty).
            debug: If True, logs each step as it executes.
        """
        self.steps = steps or []
        self.debug = debug
        # Mimic a plain function so callers can introspect the pipeline.
        self.__name__ = "pipeline"
        self.__doc__ = "Compiled pipeline"
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Fail fast on malformed steps.
        self._validate_pipeline()

    # "Column" quoted: the optional PySpark import may leave the name
    # undefined, and an unquoted annotation would raise NameError at
    # class-definition time.
    def __call__(self, col: "Column") -> "Column":
        """Execute the pipeline."""
        return self._execute_steps(self.steps, col)

    def _execute_steps(self, steps: List["CompiledStep"], col: "Column") -> "Column":
        """Apply each step in order; recurses into conditional branches."""
        result = col

        for step in steps:
            if self.debug:
                step_name = (
                    getattr(step.action, "__name__", step.step_type)
                    if step.action
                    else step.step_type
                )
                self.logger.debug(f"Executing step: {step_name}")

            if step.step_type == "transform":
                if callable(step.action):
                    result = step.action(result)

            elif step.step_type == "conditional":
                # Guard against a missing then_branch: the previous version
                # left `then_result` unbound in that case and crashed with
                # UnboundLocalError. With no then_branch there is nothing to
                # apply, so the column passes through unchanged.
                if step.then_branch:
                    then_result = self._execute_steps(step.then_branch, result)
                    else_result = (
                        self._execute_steps(step.else_branch, result)
                        if step.else_branch
                        else result
                    )
                    result = F.when(step.condition(result), then_result).otherwise(
                        else_result
                    )

        return result

    def _validate_pipeline(self):
        """Validate all steps in the pipeline."""
        if not self.steps:
            self.logger.debug("Empty pipeline - no steps to validate")
            return

        for i, step in enumerate(self.steps):
            if not isinstance(step, CompiledStep):
                raise TypeError(
                    f"Pipeline step {i} must be a CompiledStep instance, "
                    f"got {type(step).__name__}"
                )
            # Per-step validation happens in CompiledStep.__post_init__.

        self.logger.debug(f"Pipeline validated with {len(self.steps)} steps")
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
class PipelineCompiler:
    """Compiles a pipeline-declaration function's AST into a StablePipeline.

    Each top-level `namespace.method(...)` expression in the function body
    becomes a transform step; each `if namespace.cond(...): ... else: ...`
    becomes a conditional step with nested branches.
    """

    def __init__(self, namespaces: Dict[str, Any], debug: bool = False):
        """
        Args:
            namespaces: Mapping of namespace name -> registry object used to
                resolve `namespace.method` calls.
            debug: If True, emit detailed logs on compilation failures.
        """
        self.namespaces = namespaces
        self.debug = debug

    def compile(self, func: Callable) -> StablePipeline:
        """Parse *func*'s source and build a StablePipeline from its body.

        On any failure an EMPTY pipeline (identity behavior) is returned
        rather than raising — callers fall back separately if needed.
        """
        try:
            source = inspect.getsource(func)
            # dedent handles functions defined at non-zero indentation
            # (methods, nested defs).
            source = textwrap.dedent(source)
            tree = ast.parse(source)
            func_def = tree.body[0]

            # Ensure we have a function definition
            if not isinstance(func_def, ast.FunctionDef):
                raise ValueError(f"Expected FunctionDef, got {type(func_def).__name__}")

            steps = self._compile_body(func_def.body)
            pipeline = StablePipeline(steps, self.debug)
            # Make the pipeline introspect like the original function.
            pipeline.__name__ = func.__name__
            pipeline.__doc__ = func.__doc__

            return pipeline

        except Exception as e:
            logger.warning(
                f"Failed to compile '{func.__name__}': {e}. "
                f"Creating empty pipeline as fallback."
            )
            if self.debug:
                logger.debug(f"Compilation error details: {e}", exc_info=True)
            # Return empty pipeline on failure
            return StablePipeline([], self.debug)

    def _compile_body(self, nodes: Sequence[ast.AST]) -> List[CompiledStep]:
        """Compile AST nodes to steps.

        Only `if` statements and bare call expressions are compiled; all
        other statement kinds (assignments, returns, ...) are ignored.
        """
        steps = []

        for node in nodes:
            if isinstance(node, ast.If):
                step = self._compile_if(node)
                if step:
                    steps.append(step)

            elif isinstance(node, ast.Expr) and isinstance(node.value, ast.Call):
                step = self._compile_call(node.value)
                if step:
                    steps.append(step)

        return steps

    def _compile_if(self, node: ast.If) -> Optional[CompiledStep]:
        """Compile an if/else statement into a conditional step (or None)."""
        condition = self._compile_condition(node.test)
        then_branch = self._compile_body(node.body)
        else_branch = self._compile_body(node.orelse) if node.orelse else None

        try:
            return CompiledStep(
                step_type="conditional",
                condition=condition,
                then_branch=then_branch,
                else_branch=else_branch,
            )
        except (ValueError, TypeError) as e:
            # CompiledStep validation can reject e.g. an empty then_branch;
            # the malformed conditional is dropped from the pipeline.
            logger.warning(f"Failed to compile conditional: {e}")
            if self.debug:
                logger.debug("Conditional compilation error details:", exc_info=True)
            return None

    def _compile_condition(self, node: ast.AST) -> Callable:
        """Compile a condition expression to a callable predicate.

        Resolves `namespace.method(...)` tests; anything unresolvable falls
        through to an always-True lambda. NOTE(review): that lambda returns a
        Python bool, not a Column — confirm downstream F.when() usage accepts
        it or treats the branch as unconditional.
        """
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
            namespace_name = (
                node.func.value.id if isinstance(node.func.value, ast.Name) else None
            )
            method_name = node.func.attr

            # Resolve the namespace from the compose() kwargs first, then
            # from this module's globals.
            namespace = (
                self.namespaces.get(namespace_name) if namespace_name else None
            ) or (globals().get(namespace_name) if namespace_name else None)
            if namespace and hasattr(namespace, method_name):
                method = getattr(namespace, method_name)

                kwargs = {}
                for keyword in node.keywords:
                    kwargs[keyword.arg] = self._get_value(keyword.value)

                # With kwargs, pre-configure the primitive; otherwise use it
                # directly as the predicate.
                return method(**kwargs) if kwargs else method

        return lambda col: True

    def _compile_call(self, node: ast.Call) -> Optional[CompiledStep]:
        """Compile a `namespace.method(...)` call into a transform step.

        Returns None when the call cannot be resolved against the known
        namespaces, silently skipping that statement.
        """
        if isinstance(node.func, ast.Attribute):
            namespace_name = (
                node.func.value.id if isinstance(node.func.value, ast.Name) else None
            )
            method_name = node.func.attr

            namespace = (
                self.namespaces.get(namespace_name) if namespace_name else None
            ) or (globals().get(namespace_name) if namespace_name else None)
            if namespace and hasattr(namespace, method_name):
                method = getattr(namespace, method_name)

                kwargs = {}
                for keyword in node.keywords:
                    kwargs[keyword.arg] = self._get_value(keyword.value)

                action = method(**kwargs) if kwargs else method

                try:
                    return CompiledStep(step_type="transform", action=action)
                except (ValueError, TypeError) as e:
                    logger.warning(f"Failed to compile transform: {e}")
                    if self.debug:
                        logger.debug(
                            "Transform compilation error details:", exc_info=True
                        )
                    return None

        return None

    def _get_value(self, node: ast.AST) -> Any:
        """Extract a literal value from an AST node (None on failure)."""
        if isinstance(node, ast.Constant):
            return node.value
        # Python 3.7 compatibility - handle legacy literal nodes
        # NOTE(review): ast.Num/Str/etc. are deprecated aliases in modern
        # Pythons; on 3.8+ parsing yields ast.Constant, so this branch is
        # effectively dead there.
        elif hasattr(ast, "Num") and isinstance(node, (ast.Num, ast.Str, ast.Bytes, ast.NameConstant)):  # type: ignore
            return node.value  # type: ignore
        else:
            try:
                return ast.literal_eval(node)
            except Exception as e:
                logger.debug(f"Failed to extract value from AST node: {e}")
                return None
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
# Explicit public API: only the primitive wrapper and its registry are
# exported; the pipeline/compiler machinery above is module-internal.
__all__ = [
    "SmartPrimitive",
    "PrimitiveRegistry",
]
|
|
File without changes
|