additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,551 +1,328 @@
1
- # polars_expression_engine.py
2
- # Polars-only expression processing engine for enhanced expressions system
1
+ """
2
+ Core expression evaluation engine for Additory.
3
3
 
4
- import polars as pl
5
- from typing import Any, Dict, Optional, Union
6
- from dataclasses import dataclass
7
- from datetime import datetime
8
-
9
- from .enhanced_arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
10
- from .ast_builder import build_ast_from_expression
11
- from .logging import log_info, log_warning
12
- from .memory_manager import get_memory_manager
4
+ Main engine that ties together parser, compiler, loader, resolver, and integrity.
5
+ """
13
6
 
7
+ import polars as pl
8
+ from typing import Dict, List, Optional, Tuple
9
+ from pathlib import Path
14
10
 
15
- @dataclass
16
- class ExpressionResult:
17
- """Result of expression execution"""
18
- dataframe: Any
19
- execution_time_ms: float
20
- rows_processed: int
21
- columns_processed: int
22
- backend_type: str
23
- memory_used_mb: float
11
+ from additory.expressions.parser import ExpressionParser
12
+ from additory.expressions.compiler import ExpressionCompiler
13
+ from additory.expressions.loader import load_expressions_from_file
14
+ from additory.expressions.resolver import resolve_dependencies, check_circular_dependencies
15
+ from additory.expressions.integrity import verify_sha
16
+ from additory.core.logging import Logger
24
17
 
25
18
 
26
- class PolarsExpressionError(Exception):
27
- """Raised when Polars expression processing fails"""
28
- pass
19
+ # Global engine instance
20
+ _engine_instance: Optional['ExpressionEngine'] = None
29
21
 
30
22
 
31
- class PolarsExpressionEngine:
32
- """Exclusive Polars-based expression processing engine"""
23
+ class ExpressionEngine:
24
+ """
25
+ Main expression evaluation engine.
26
+
27
+ Singleton class that manages expression loading, parsing, compilation, and evaluation.
28
+ """
33
29
 
34
30
  def __init__(self):
35
- self.arrow_bridge = EnhancedArrowBridge()
36
- self.execution_stats = {
37
- "total_executions": 0,
38
- "total_time_ms": 0.0,
39
- "total_rows_processed": 0,
40
- "errors": 0
41
- }
31
+ """Initialize expression engine."""
32
+ self.loaded_expressions: Dict[str, Dict] = {}
33
+ self.parser = ExpressionParser()
34
+ self.compiler = ExpressionCompiler()
35
+ self.logger = Logger()
42
36
 
43
- # Register with memory manager for cleanup
44
- self.memory_manager = get_memory_manager()
45
- self.memory_manager.register_cleanup_callback(self._cleanup_callback)
37
+ # Load built-in expressions
38
+ self._load_inbuilt_expressions()
46
39
 
47
- def execute_expression(self, df: Any, expression: str, output_column: str,
48
- backend_type: Optional[str] = None) -> ExpressionResult:
49
- """
50
- Execute expression using Polars exclusively
51
-
52
- Args:
53
- df: Input dataframe (any supported backend)
54
- expression: Expression string to execute
55
- output_column: Name for the output column
56
- backend_type: Source backend type (auto-detected if None)
57
-
58
- Returns:
59
- ExpressionResult with processed dataframe and statistics
60
-
61
- Raises:
62
- PolarsExpressionError: If expression execution fails
63
- """
64
- start_time = datetime.now()
40
+ def _load_inbuilt_expressions(self):
41
+ """Load built-in expressions from bundled .add files."""
42
+ # Get the inbuilt expressions directory
43
+ # This would be in the package: additory/inbuilt_expressions/
44
+ inbuilt_dir = Path(__file__).parent.parent / 'inbuilt_expressions'
65
45
 
66
- # Use memory context for monitoring
67
- with self.memory_manager.memory_context(f"expression: {expression[:50]}..."):
68
- try:
69
- # Auto-detect backend if not specified
70
- if backend_type is None:
71
- backend_type = self.arrow_bridge.detect_backend(df)
72
-
73
- # Get memory usage before processing
74
- memory_before = self.arrow_bridge._get_memory_usage_mb()
75
-
76
- # 1. Convert input to Arrow
77
- log_info(f"[polars_engine] Converting {backend_type} to Arrow")
78
- arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
79
-
80
- # 2. Convert Arrow to Polars
81
- log_info("[polars_engine] Converting Arrow to Polars")
82
- polars_df = pl.from_arrow(arrow_table)
83
-
84
- # 3. Execute expression in Polars
85
- log_info(f"[polars_engine] Executing expression: {expression}")
86
- result_df = self._execute_polars_expression(
87
- polars_df, expression, output_column
88
- )
89
-
90
- # 4. Convert back to Arrow
91
- log_info("[polars_engine] Converting result to Arrow")
92
- result_arrow = result_df.to_arrow()
93
-
94
- # 5. Convert to original backend format
95
- log_info(f"[polars_engine] Converting Arrow to {backend_type}")
96
- final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
97
-
98
- # Calculate execution statistics
99
- execution_time = (datetime.now() - start_time).total_seconds() * 1000
100
- memory_after = self.arrow_bridge._get_memory_usage_mb()
101
- memory_used = max(0, memory_after - memory_before)
102
-
103
- # Update global statistics
104
- self.execution_stats["total_executions"] += 1
105
- self.execution_stats["total_time_ms"] += execution_time
106
- self.execution_stats["total_rows_processed"] += result_df.height
107
-
108
- log_info(f"[polars_engine] Expression executed successfully in {execution_time:.1f}ms")
109
-
110
- return ExpressionResult(
111
- dataframe=final_result,
112
- execution_time_ms=execution_time,
113
- rows_processed=result_df.height,
114
- columns_processed=result_df.width,
115
- backend_type=backend_type,
116
- memory_used_mb=memory_used
117
- )
118
-
119
- except Exception as e:
120
- self.execution_stats["errors"] += 1
121
- raise PolarsExpressionError(f"Expression execution failed: {e}")
122
-
123
- finally:
124
- # 6. Always cleanup Arrow memory
125
- self.arrow_bridge.cleanup_arrow_memory()
46
+ if inbuilt_dir.exists():
47
+ self.load_namespace('inbuilt', str(inbuilt_dir))
126
48
 
127
- def _execute_polars_expression(self, polars_df: pl.DataFrame,
128
- expression: str, output_column: str) -> pl.DataFrame:
49
+ def evaluate(self, df: pl.DataFrame, expression: str) -> pl.Series:
129
50
  """
130
- Execute expression AST in Polars
51
+ Evaluate expression and return result.
131
52
 
132
53
  Args:
133
- polars_df: Input Polars DataFrame
134
- expression: Expression string
135
- output_column: Name for output column
54
+ df: DataFrame to evaluate expression on
55
+ expression: Expression string (inline or reference)
136
56
 
137
57
  Returns:
138
- Polars DataFrame with new column
139
-
140
- Raises:
141
- PolarsExpressionError: If expression execution fails
142
- """
143
- try:
144
- # Clean up multiline expressions
145
- cleaned_expression = ' '.join(line.strip() for line in expression.strip().split('\n') if line.strip())
146
-
147
- # Build AST from expression
148
- ast_tree = build_ast_from_expression(cleaned_expression)
149
-
150
- if ast_tree is None:
151
- raise PolarsExpressionError(f"Failed to parse expression: {expression}")
58
+ Polars Series with result
152
59
 
153
- # Convert AST to Polars expression
154
- polars_expr = self._ast_to_polars_expr(ast_tree)
60
+ Example:
61
+ # Inline expression
62
+ result = engine.evaluate(df, 'weight / (height ** 2)')
155
63
 
156
- # Execute expression and add as new column
157
- result_df = polars_df.with_columns([
158
- polars_expr.alias(output_column)
159
- ])
160
-
161
- return result_df
162
-
163
- except Exception as e:
164
- raise PolarsExpressionError(f"Polars expression execution failed: {e}")
165
-
166
- def _ast_to_polars_expr(self, ast_node: Dict[str, Any]) -> pl.Expr:
64
+ # Reference expression
65
+ result = engine.evaluate(df, 'inbuilt:bmi')
167
66
  """
168
- Convert expression AST to Polars expression
67
+ # Check if this is a reference or inline expression
68
+ if is_reference(expression):
69
+ # Parse reference
70
+ namespace, name = parse_expression_reference(expression)
71
+
72
+ # Get expression definition
73
+ expr_def = self.get_expression(f"{namespace}:{name}")
74
+
75
+ # Get expression string
76
+ expr_string = expr_def['expression']
77
+
78
+ # Verify SHA integrity
79
+ if 'sha' in expr_def and expr_def['sha']:
80
+ is_valid = verify_sha(expr_string, expr_def['sha'])
81
+ if not is_valid:
82
+ self.logger.warning(
83
+ f"Expression '{name}' in namespace '{namespace}' failed integrity check"
84
+ )
85
+
86
+ # Log evaluation
87
+ self.logger.info(f"Evaluating expression: {namespace}:{name}")
88
+ else:
89
+ # Inline expression
90
+ expr_string = expression
91
+ self.logger.info(f"Evaluating inline expression")
169
92
 
170
- Args:
171
- ast_node: AST node dictionary
172
-
173
- Returns:
174
- Polars expression
175
-
176
- Raises:
177
- PolarsExpressionError: If AST conversion fails
178
- """
179
- try:
180
- node_type = ast_node.get("type")
181
-
182
- if node_type == "column":
183
- return pl.col(ast_node["name"])
184
-
185
- elif node_type == "literal":
186
- return pl.lit(ast_node["value"])
187
-
188
- elif node_type == "binary":
189
- left = self._ast_to_polars_expr(ast_node["left"])
190
- right = self._ast_to_polars_expr(ast_node["right"])
191
- op = ast_node["op"]
192
-
193
- if op == "+":
194
- return left + right
195
- elif op == "-":
196
- return left - right
197
- elif op == "*":
198
- return left * right
199
- elif op == "/":
200
- return left / right
201
- elif op == "**":
202
- return left ** right
203
- elif op == "%":
204
- return left % right
205
- elif op == "//":
206
- return left // right
207
- else:
208
- raise PolarsExpressionError(f"Unsupported binary operator: {op}")
209
-
210
- elif node_type == "cmp":
211
- left = self._ast_to_polars_expr(ast_node["left"])
212
- right = self._ast_to_polars_expr(ast_node["right"])
213
- op = ast_node["op"]
214
-
215
- if op == "==":
216
- return left == right
217
- elif op == "!=":
218
- return left != right
219
- elif op == ">":
220
- return left > right
221
- elif op == "<":
222
- return left < right
223
- elif op == ">=":
224
- return left >= right
225
- elif op == "<=":
226
- return left <= right
227
- else:
228
- raise PolarsExpressionError(f"Unsupported comparison operator: {op}")
229
-
230
- elif node_type == "bool_op":
231
- op = ast_node["op"]
232
- values = [self._ast_to_polars_expr(v) for v in ast_node["values"]]
233
-
234
- if op == "and":
235
- result = values[0]
236
- for v in values[1:]:
237
- result = result & v
238
- return result
239
- elif op == "or":
240
- result = values[0]
241
- for v in values[1:]:
242
- result = result | v
243
- return result
244
- else:
245
- raise PolarsExpressionError(f"Unsupported boolean operator: {op}")
246
-
247
- elif node_type == "unary_bool":
248
- op = ast_node["op"]
249
- value = self._ast_to_polars_expr(ast_node["value"])
250
-
251
- if op == "not":
252
- return ~value
253
- else:
254
- raise PolarsExpressionError(f"Unsupported unary boolean operator: {op}")
255
-
256
- elif node_type == "if_expr":
257
- # Ternary: a if cond else b
258
- cond = self._ast_to_polars_expr(ast_node["cond"])
259
- then_expr = self._ast_to_polars_expr(ast_node["then"])
260
- else_expr = self._ast_to_polars_expr(ast_node["else"])
261
-
262
- return pl.when(cond).then(then_expr).otherwise(else_expr)
263
-
264
- elif node_type == "call":
265
- # Function calls
266
- func_name = ast_node["name"]
267
- args = [self._ast_to_polars_expr(arg) for arg in ast_node["args"]]
268
-
269
- return self._handle_function_call(func_name, args)
270
-
271
- else:
272
- raise PolarsExpressionError(f"Unsupported AST node type: {node_type}")
273
-
274
- except Exception as e:
275
- raise PolarsExpressionError(f"AST to Polars conversion failed: {e}")
93
+ # Parse expression to AST
94
+ ast = self.parser.parse(expr_string)
95
+
96
+ # Compile AST to Polars expression
97
+ polars_expr = self.compiler.compile(ast, df)
98
+
99
+ # Execute and return result
100
+ result = df.select(polars_expr.alias('result'))['result']
101
+
102
+ return result
276
103
 
277
- def _handle_function_call(self, func_name: str, args: list) -> pl.Expr:
104
+ def load_namespace(self, namespace: str, folder_path: str):
278
105
  """
279
- Handle function calls in expressions
106
+ Load expressions from a namespace folder.
280
107
 
281
108
  Args:
282
- func_name: Name of the function
283
- args: List of Polars expressions as arguments
284
-
285
- Returns:
286
- Polars expression for the function call
109
+ namespace: Namespace name
110
+ folder_path: Path to folder containing .add files
287
111
 
288
- Raises:
289
- PolarsExpressionError: If function is not supported
112
+ Example:
113
+ engine.load_namespace('inbuilt', '/path/to/inbuilt_expressions')
290
114
  """
291
- if func_name == "min":
292
- if len(args) == 1:
293
- return args[0].min()
294
- else:
295
- # Element-wise minimum of multiple expressions
296
- result = args[0]
297
- for arg in args[1:]:
298
- result = pl.min_horizontal([result, arg])
299
- return result
300
-
301
- elif func_name == "max":
302
- if len(args) == 1:
303
- return args[0].max()
304
- else:
305
- # Element-wise maximum of multiple expressions
306
- result = args[0]
307
- for arg in args[1:]:
308
- result = pl.max_horizontal([result, arg])
309
- return result
310
-
311
- elif func_name == "abs":
312
- if len(args) != 1:
313
- raise PolarsExpressionError("abs() requires exactly 1 argument")
314
- return args[0].abs()
315
-
316
- elif func_name == "log":
317
- if len(args) == 1:
318
- return args[0].log()
319
- elif len(args) == 2:
320
- # log(value, base)
321
- return args[0].log() / args[1].log()
322
- else:
323
- raise PolarsExpressionError("log() requires 1 or 2 arguments")
115
+ folder = Path(folder_path)
324
116
 
325
- elif func_name == "exp":
326
- if len(args) != 1:
327
- raise PolarsExpressionError("exp() requires exactly 1 argument")
328
- return args[0].exp()
117
+ if not folder.exists():
118
+ self.logger.warning(f"Namespace folder not found: {folder_path}")
119
+ return
329
120
 
330
- elif func_name == "sqrt":
331
- if len(args) != 1:
332
- raise PolarsExpressionError("sqrt() requires exactly 1 argument")
333
- return args[0].sqrt()
121
+ # Find all .add files
122
+ add_files = list(folder.glob('*.add'))
334
123
 
335
- elif func_name == "pow":
336
- if len(args) != 2:
337
- raise PolarsExpressionError("pow() requires exactly 2 arguments")
338
- return args[0] ** args[1]
124
+ if not add_files:
125
+ self.logger.info(f"No .add files found in {folder_path}")
126
+ return
339
127
 
340
- elif func_name == "round":
341
- if len(args) == 1:
342
- return args[0].round(0)
343
- elif len(args) == 2:
344
- # For round with decimals, the second argument must be a literal integer
345
- if hasattr(args[1], 'meta') and hasattr(args[1].meta, 'output_name'):
346
- # This is a column reference, not a literal
347
- raise PolarsExpressionError("round() decimals parameter must be a literal integer")
348
- return args[0].round(args[1])
349
- else:
350
- raise PolarsExpressionError("round() requires 1 or 2 arguments")
351
-
352
- elif func_name == "floor":
353
- if len(args) != 1:
354
- raise PolarsExpressionError("floor() requires exactly 1 argument")
355
- return args[0].floor()
356
-
357
- elif func_name == "ceil":
358
- if len(args) != 1:
359
- raise PolarsExpressionError("ceil() requires exactly 1 argument")
360
- return args[0].ceil()
128
+ # Load expressions from each file
129
+ loaded_count = 0
130
+ for add_file in add_files:
131
+ try:
132
+ expressions = load_expressions_from_file(str(add_file), namespace)
133
+
134
+ # Store expressions
135
+ for name, expr_def in expressions.items():
136
+ # Create full reference
137
+ full_ref = f"{namespace}:{name}"
138
+
139
+ # Check for duplicates
140
+ if full_ref in self.loaded_expressions:
141
+ self.logger.warning(
142
+ f"Duplicate expression '{name}' in namespace '{namespace}' "
143
+ f"(from {add_file.name})"
144
+ )
145
+ continue
146
+
147
+ # Add source file info
148
+ expr_def['source_file'] = add_file.name
149
+
150
+ # Store expression
151
+ self.loaded_expressions[full_ref] = expr_def
152
+ loaded_count += 1
153
+
154
+ except Exception as e:
155
+ self.logger.error(f"Error loading {add_file.name}: {str(e)}")
361
156
 
362
- else:
363
- raise PolarsExpressionError(f"Unsupported function: {func_name}")
157
+ self.logger.info(
158
+ f"Loaded {loaded_count} expressions from namespace '{namespace}'"
159
+ )
364
160
 
365
- def execute_with_ast(self, df: Any, ast_tree: Dict[str, Any], output_column: str,
366
- backend_type: Optional[str] = None) -> ExpressionResult:
161
+ def get_expression(self, reference: str) -> Dict:
367
162
  """
368
- Execute expression using pre-built AST
163
+ Get expression definition from reference.
369
164
 
370
165
  Args:
371
- df: Input dataframe
372
- ast_tree: Pre-built AST tree
373
- output_column: Name for output column
374
- backend_type: Source backend type
166
+ reference: Expression reference ('inbuilt:bmi', 'myfolder:roi')
375
167
 
376
168
  Returns:
377
- ExpressionResult with processed dataframe
378
- """
379
- start_time = datetime.now()
380
-
381
- try:
382
- # Auto-detect backend if not specified
383
- if backend_type is None:
384
- backend_type = self.arrow_bridge.detect_backend(df)
385
-
386
- # Get memory usage before processing
387
- memory_before = self.arrow_bridge._get_memory_usage_mb()
388
-
389
- # Convert to Polars via Arrow
390
- arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
391
- polars_df = pl.from_arrow(arrow_table)
392
-
393
- # Execute using AST
394
- polars_expr = self._ast_to_polars_expr(ast_tree)
395
- result_df = polars_df.with_columns([polars_expr.alias(output_column)])
169
+ Dictionary with expression definition
396
170
 
397
- # Convert back to original format
398
- result_arrow = result_df.to_arrow()
399
- final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
400
-
401
- # Calculate statistics
402
- execution_time = (datetime.now() - start_time).total_seconds() * 1000
403
- memory_after = self.arrow_bridge._get_memory_usage_mb()
404
- memory_used = max(0, memory_after - memory_before)
405
-
406
- # Update statistics
407
- self.execution_stats["total_executions"] += 1
408
- self.execution_stats["total_time_ms"] += execution_time
409
- self.execution_stats["total_rows_processed"] += result_df.height
171
+ Raises:
172
+ ValueError: If expression not found
410
173
 
411
- return ExpressionResult(
412
- dataframe=final_result,
413
- execution_time_ms=execution_time,
414
- rows_processed=result_df.height,
415
- columns_processed=result_df.width,
416
- backend_type=backend_type,
417
- memory_used_mb=memory_used
174
+ Example:
175
+ expr_def = engine.get_expression('inbuilt:bmi')
176
+ """
177
+ if reference not in self.loaded_expressions:
178
+ raise ValueError(
179
+ f"Expression '{reference}' not found. "
180
+ f"Available expressions: {list(self.loaded_expressions.keys())}"
418
181
  )
419
-
420
- except Exception as e:
421
- self.execution_stats["errors"] += 1
422
- raise PolarsExpressionError(f"AST execution failed: {e}")
423
-
424
- finally:
425
- self.arrow_bridge.cleanup_arrow_memory()
182
+
183
+ return self.loaded_expressions[reference]
426
184
 
427
- def validate_expression(self, expression: str) -> bool:
185
+ def list_expressions(self, namespace: Optional[str] = None) -> List[Dict]:
428
186
  """
429
- Validate expression syntax without executing
187
+ List all available expressions.
430
188
 
431
189
  Args:
432
- expression: Expression string to validate
190
+ namespace: Filter by namespace (None = all)
433
191
 
434
192
  Returns:
435
- True if expression is valid
436
- """
437
- try:
438
- # Clean up multiline expressions
439
- cleaned_expression = ' '.join(line.strip() for line in expression.strip().split('\n') if line.strip())
193
+ List of expression dictionaries
440
194
 
441
- ast_tree = build_ast_from_expression(cleaned_expression)
442
- if ast_tree is None:
443
- return False
195
+ Example:
196
+ # List all expressions
197
+ all_exprs = engine.list_expressions()
444
198
 
445
- # Try to convert AST to Polars expression (dry run)
446
- # This will catch unsupported functions and operators
447
- self._ast_to_polars_expr(ast_tree)
448
- return True
199
+ # List only inbuilt
200
+ inbuilt = engine.list_expressions('inbuilt')
201
+ """
202
+ if namespace is None:
203
+ return list(self.loaded_expressions.values())
204
+
205
+ # Filter by namespace
206
+ return [
207
+ expr_def for ref, expr_def in self.loaded_expressions.items()
208
+ if ref.startswith(f"{namespace}:")
209
+ ]
210
+
211
+ def reload_custom_namespace(self):
212
+ """
213
+ Reload custom namespace expressions.
214
+
215
+ Reloads all .add files from custom folder.
216
+ """
217
+ from additory.core.config import Config
218
+
219
+ config = Config()
220
+ custom_folder = config.get_expressions_folder()
221
+
222
+ if custom_folder:
223
+ # Clear existing custom expressions
224
+ self.clear_custom_namespace()
449
225
 
450
- except Exception as e:
451
- log_warning(f"[polars_engine] Expression validation failed: {e}")
452
- return False
226
+ # Reload
227
+ self.load_namespace('user', custom_folder)
453
228
 
454
- def get_execution_stats(self) -> Dict[str, Any]:
455
- """Get execution statistics"""
456
- stats = self.execution_stats.copy()
229
+ def clear_custom_namespace(self):
230
+ """Clear custom namespace."""
231
+ # Remove all expressions that don't start with 'inbuilt:'
232
+ to_remove = [
233
+ ref for ref in self.loaded_expressions.keys()
234
+ if not ref.startswith('inbuilt:')
235
+ ]
457
236
 
458
- if stats["total_executions"] > 0:
459
- stats["avg_time_ms"] = stats["total_time_ms"] / stats["total_executions"]
460
- stats["avg_rows_per_execution"] = stats["total_rows_processed"] / stats["total_executions"]
461
- else:
462
- stats["avg_time_ms"] = 0.0
463
- stats["avg_rows_per_execution"] = 0
237
+ for ref in to_remove:
238
+ del self.loaded_expressions[ref]
239
+
240
+ self.logger.info(f"Cleared {len(to_remove)} custom expressions")
241
+
242
+
243
+ def get_engine() -> ExpressionEngine:
244
+ """
245
+ Get the global expression engine instance.
246
+
247
+ Returns:
248
+ Global ExpressionEngine instance
464
249
 
465
- return stats
250
+ Example:
251
+ engine = get_engine()
252
+ result = engine.evaluate(df, 'inbuilt:bmi')
253
+ """
254
+ global _engine_instance
466
255
 
467
- def reset_stats(self):
468
- """Reset execution statistics"""
469
- self.execution_stats = {
470
- "total_executions": 0,
471
- "total_time_ms": 0.0,
472
- "total_rows_processed": 0,
473
- "errors": 0
474
- }
475
- log_info("[polars_engine] Statistics reset")
256
+ if _engine_instance is None:
257
+ _engine_instance = ExpressionEngine()
476
258
 
477
- def benchmark_expression(self, df: Any, expression: str, output_column: str,
478
- iterations: int = 3) -> Dict[str, Any]:
479
- """
480
- Benchmark expression execution performance
259
+ return _engine_instance
260
+
261
+
262
+ def parse_expression_reference(expression: str) -> Tuple[str, str]:
263
+ """
264
+ Parse expression reference into namespace and name.
265
+
266
+ Args:
267
+ expression: Expression string
481
268
 
482
- Args:
483
- df: Input dataframe
484
- expression: Expression to benchmark
485
- output_column: Output column name
486
- iterations: Number of iterations
487
-
488
- Returns:
489
- Benchmark results
490
- """
491
- times = []
492
- backend_type = self.arrow_bridge.detect_backend(df)
269
+ Returns:
270
+ Tuple of (namespace, name)
493
271
 
494
- for i in range(iterations):
495
- try:
496
- result = self.execute_expression(df, expression, output_column, backend_type)
497
- times.append(result.execution_time_ms)
498
- except Exception as e:
499
- log_warning(f"[polars_engine] Benchmark iteration {i+1} failed: {e}")
500
- continue
272
+ Raises:
273
+ ValueError: If not a valid reference
501
274
 
502
- if not times:
503
- return {"error": "All benchmark iterations failed"}
275
+ Example:
276
+ namespace, name = parse_expression_reference('inbuilt:bmi')
277
+ # Returns: ('inbuilt', 'bmi')
278
+ """
279
+ if ':' not in expression:
280
+ raise ValueError(f"Invalid expression reference: {expression}")
281
+
282
+ parts = expression.split(':', 1)
283
+ namespace = parts[0]
284
+ name = parts[1]
285
+
286
+ return namespace, name
287
+
288
+
289
+ def is_reference(expression: str) -> bool:
290
+ """
291
+ Check if expression is a reference (not inline).
292
+
293
+ Args:
294
+ expression: Expression string
504
295
 
505
- return {
506
- "expression": expression,
507
- "backend_type": backend_type,
508
- "iterations": len(times),
509
- "min_time_ms": min(times),
510
- "max_time_ms": max(times),
511
- "avg_time_ms": sum(times) / len(times),
512
- "total_time_ms": sum(times)
513
- }
296
+ Returns:
297
+ True if reference, False if inline
298
+
299
+ Example:
300
+ is_reference('inbuilt:bmi') # True
301
+ is_reference('weight / height') # False
302
+ """
303
+ # A reference has the format: namespace:name
304
+ # It should have exactly one colon and no spaces before the colon
305
+ if ':' not in expression:
306
+ return False
514
307
 
515
- def get_supported_functions(self) -> list:
516
- """Get list of supported functions"""
517
- return [
518
- "min", "max", "abs", "log", "exp", "sqrt", "pow",
519
- "round", "floor", "ceil"
520
- ]
308
+ # Check if it looks like a reference (namespace:name)
309
+ parts = expression.split(':', 1)
310
+ if len(parts) != 2:
311
+ return False
521
312
 
522
- def get_supported_operators(self) -> Dict[str, list]:
523
- """Get list of supported operators by category"""
524
- return {
525
- "arithmetic": ["+", "-", "*", "/", "**", "%", "//"],
526
- "comparison": ["==", "!=", ">", "<", ">=", "<="],
527
- "boolean": ["and", "or", "not"],
528
- "conditional": ["if_else"]
529
- }
313
+ namespace = parts[0].strip()
314
+ name = parts[1].strip()
530
315
 
531
- def _cleanup_callback(self):
532
- """Cleanup callback for memory manager"""
533
- try:
534
- # Cleanup Arrow bridge memory
535
- self.arrow_bridge.cleanup_arrow_memory()
536
-
537
- # Reset statistics if they get too large
538
- if self.execution_stats["total_executions"] > 10000:
539
- log_info("[polars_engine] Resetting statistics due to high execution count")
540
- self.reset_stats()
541
-
542
- except Exception as e:
543
- log_warning(f"[polars_engine] Cleanup callback failed: {e}")
316
+ # Namespace and name should be valid identifiers (no spaces, operators, etc.)
317
+ if not namespace or not name:
318
+ return False
319
+
320
+ # Check if namespace looks like an identifier
321
+ if not namespace.replace('_', '').isalnum():
322
+ return False
323
+
324
+ # Check if name looks like an identifier
325
+ if not name.replace('_', '').isalnum():
326
+ return False
544
327
 
545
- def __del__(self):
546
- """Cleanup when engine is destroyed"""
547
- try:
548
- if hasattr(self, 'memory_manager'):
549
- self.memory_manager.unregister_cleanup_callback(self._cleanup_callback)
550
- except Exception:
551
- pass
328
+ return True