additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/expressions/engine.py
CHANGED
|
@@ -1,551 +1,328 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
"""
|
|
2
|
+
Core expression evaluation engine for Additory.
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from dataclasses import dataclass
|
|
7
|
-
from datetime import datetime
|
|
8
|
-
|
|
9
|
-
from .enhanced_arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
|
|
10
|
-
from .ast_builder import build_ast_from_expression
|
|
11
|
-
from .logging import log_info, log_warning
|
|
12
|
-
from .memory_manager import get_memory_manager
|
|
4
|
+
Main engine that ties together parser, compiler, loader, resolver, and integrity.
|
|
5
|
+
"""
|
|
13
6
|
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import Dict, List, Optional, Tuple
|
|
9
|
+
from pathlib import Path
|
|
14
10
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
columns_processed: int
|
|
22
|
-
backend_type: str
|
|
23
|
-
memory_used_mb: float
|
|
11
|
+
from additory.expressions.parser import ExpressionParser
|
|
12
|
+
from additory.expressions.compiler import ExpressionCompiler
|
|
13
|
+
from additory.expressions.loader import load_expressions_from_file
|
|
14
|
+
from additory.expressions.resolver import resolve_dependencies, check_circular_dependencies
|
|
15
|
+
from additory.expressions.integrity import verify_sha
|
|
16
|
+
from additory.core.logging import Logger
|
|
24
17
|
|
|
25
18
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
pass
|
|
19
|
+
# Global engine instance
|
|
20
|
+
_engine_instance: Optional['ExpressionEngine'] = None
|
|
29
21
|
|
|
30
22
|
|
|
31
|
-
class
|
|
32
|
-
"""
|
|
23
|
+
class ExpressionEngine:
|
|
24
|
+
"""
|
|
25
|
+
Main expression evaluation engine.
|
|
26
|
+
|
|
27
|
+
Singleton class that manages expression loading, parsing, compilation, and evaluation.
|
|
28
|
+
"""
|
|
33
29
|
|
|
34
30
|
def __init__(self):
|
|
35
|
-
|
|
36
|
-
self.
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
"errors": 0
|
|
41
|
-
}
|
|
31
|
+
"""Initialize expression engine."""
|
|
32
|
+
self.loaded_expressions: Dict[str, Dict] = {}
|
|
33
|
+
self.parser = ExpressionParser()
|
|
34
|
+
self.compiler = ExpressionCompiler()
|
|
35
|
+
self.logger = Logger()
|
|
42
36
|
|
|
43
|
-
#
|
|
44
|
-
self.
|
|
45
|
-
self.memory_manager.register_cleanup_callback(self._cleanup_callback)
|
|
37
|
+
# Load built-in expressions
|
|
38
|
+
self._load_inbuilt_expressions()
|
|
46
39
|
|
|
47
|
-
def
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
Args:
|
|
53
|
-
df: Input dataframe (any supported backend)
|
|
54
|
-
expression: Expression string to execute
|
|
55
|
-
output_column: Name for the output column
|
|
56
|
-
backend_type: Source backend type (auto-detected if None)
|
|
57
|
-
|
|
58
|
-
Returns:
|
|
59
|
-
ExpressionResult with processed dataframe and statistics
|
|
60
|
-
|
|
61
|
-
Raises:
|
|
62
|
-
PolarsExpressionError: If expression execution fails
|
|
63
|
-
"""
|
|
64
|
-
start_time = datetime.now()
|
|
40
|
+
def _load_inbuilt_expressions(self):
|
|
41
|
+
"""Load built-in expressions from bundled .add files."""
|
|
42
|
+
# Get the inbuilt expressions directory
|
|
43
|
+
# This would be in the package: additory/inbuilt_expressions/
|
|
44
|
+
inbuilt_dir = Path(__file__).parent.parent / 'inbuilt_expressions'
|
|
65
45
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
try:
|
|
69
|
-
# Auto-detect backend if not specified
|
|
70
|
-
if backend_type is None:
|
|
71
|
-
backend_type = self.arrow_bridge.detect_backend(df)
|
|
72
|
-
|
|
73
|
-
# Get memory usage before processing
|
|
74
|
-
memory_before = self.arrow_bridge._get_memory_usage_mb()
|
|
75
|
-
|
|
76
|
-
# 1. Convert input to Arrow
|
|
77
|
-
log_info(f"[polars_engine] Converting {backend_type} to Arrow")
|
|
78
|
-
arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
|
|
79
|
-
|
|
80
|
-
# 2. Convert Arrow to Polars
|
|
81
|
-
log_info("[polars_engine] Converting Arrow to Polars")
|
|
82
|
-
polars_df = pl.from_arrow(arrow_table)
|
|
83
|
-
|
|
84
|
-
# 3. Execute expression in Polars
|
|
85
|
-
log_info(f"[polars_engine] Executing expression: {expression}")
|
|
86
|
-
result_df = self._execute_polars_expression(
|
|
87
|
-
polars_df, expression, output_column
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
# 4. Convert back to Arrow
|
|
91
|
-
log_info("[polars_engine] Converting result to Arrow")
|
|
92
|
-
result_arrow = result_df.to_arrow()
|
|
93
|
-
|
|
94
|
-
# 5. Convert to original backend format
|
|
95
|
-
log_info(f"[polars_engine] Converting Arrow to {backend_type}")
|
|
96
|
-
final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
|
|
97
|
-
|
|
98
|
-
# Calculate execution statistics
|
|
99
|
-
execution_time = (datetime.now() - start_time).total_seconds() * 1000
|
|
100
|
-
memory_after = self.arrow_bridge._get_memory_usage_mb()
|
|
101
|
-
memory_used = max(0, memory_after - memory_before)
|
|
102
|
-
|
|
103
|
-
# Update global statistics
|
|
104
|
-
self.execution_stats["total_executions"] += 1
|
|
105
|
-
self.execution_stats["total_time_ms"] += execution_time
|
|
106
|
-
self.execution_stats["total_rows_processed"] += result_df.height
|
|
107
|
-
|
|
108
|
-
log_info(f"[polars_engine] Expression executed successfully in {execution_time:.1f}ms")
|
|
109
|
-
|
|
110
|
-
return ExpressionResult(
|
|
111
|
-
dataframe=final_result,
|
|
112
|
-
execution_time_ms=execution_time,
|
|
113
|
-
rows_processed=result_df.height,
|
|
114
|
-
columns_processed=result_df.width,
|
|
115
|
-
backend_type=backend_type,
|
|
116
|
-
memory_used_mb=memory_used
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
except Exception as e:
|
|
120
|
-
self.execution_stats["errors"] += 1
|
|
121
|
-
raise PolarsExpressionError(f"Expression execution failed: {e}")
|
|
122
|
-
|
|
123
|
-
finally:
|
|
124
|
-
# 6. Always cleanup Arrow memory
|
|
125
|
-
self.arrow_bridge.cleanup_arrow_memory()
|
|
46
|
+
if inbuilt_dir.exists():
|
|
47
|
+
self.load_namespace('inbuilt', str(inbuilt_dir))
|
|
126
48
|
|
|
127
|
-
def
|
|
128
|
-
expression: str, output_column: str) -> pl.DataFrame:
|
|
49
|
+
def evaluate(self, df: pl.DataFrame, expression: str) -> pl.Series:
|
|
129
50
|
"""
|
|
130
|
-
|
|
51
|
+
Evaluate expression and return result.
|
|
131
52
|
|
|
132
53
|
Args:
|
|
133
|
-
|
|
134
|
-
expression: Expression string
|
|
135
|
-
output_column: Name for output column
|
|
54
|
+
df: DataFrame to evaluate expression on
|
|
55
|
+
expression: Expression string (inline or reference)
|
|
136
56
|
|
|
137
57
|
Returns:
|
|
138
|
-
Polars
|
|
139
|
-
|
|
140
|
-
Raises:
|
|
141
|
-
PolarsExpressionError: If expression execution fails
|
|
142
|
-
"""
|
|
143
|
-
try:
|
|
144
|
-
# Clean up multiline expressions
|
|
145
|
-
cleaned_expression = ' '.join(line.strip() for line in expression.strip().split('\n') if line.strip())
|
|
146
|
-
|
|
147
|
-
# Build AST from expression
|
|
148
|
-
ast_tree = build_ast_from_expression(cleaned_expression)
|
|
149
|
-
|
|
150
|
-
if ast_tree is None:
|
|
151
|
-
raise PolarsExpressionError(f"Failed to parse expression: {expression}")
|
|
58
|
+
Polars Series with result
|
|
152
59
|
|
|
153
|
-
|
|
154
|
-
|
|
60
|
+
Example:
|
|
61
|
+
# Inline expression
|
|
62
|
+
result = engine.evaluate(df, 'weight / (height ** 2)')
|
|
155
63
|
|
|
156
|
-
#
|
|
157
|
-
|
|
158
|
-
polars_expr.alias(output_column)
|
|
159
|
-
])
|
|
160
|
-
|
|
161
|
-
return result_df
|
|
162
|
-
|
|
163
|
-
except Exception as e:
|
|
164
|
-
raise PolarsExpressionError(f"Polars expression execution failed: {e}")
|
|
165
|
-
|
|
166
|
-
def _ast_to_polars_expr(self, ast_node: Dict[str, Any]) -> pl.Expr:
|
|
64
|
+
# Reference expression
|
|
65
|
+
result = engine.evaluate(df, 'inbuilt:bmi')
|
|
167
66
|
"""
|
|
168
|
-
|
|
67
|
+
# Check if this is a reference or inline expression
|
|
68
|
+
if is_reference(expression):
|
|
69
|
+
# Parse reference
|
|
70
|
+
namespace, name = parse_expression_reference(expression)
|
|
71
|
+
|
|
72
|
+
# Get expression definition
|
|
73
|
+
expr_def = self.get_expression(f"{namespace}:{name}")
|
|
74
|
+
|
|
75
|
+
# Get expression string
|
|
76
|
+
expr_string = expr_def['expression']
|
|
77
|
+
|
|
78
|
+
# Verify SHA integrity
|
|
79
|
+
if 'sha' in expr_def and expr_def['sha']:
|
|
80
|
+
is_valid = verify_sha(expr_string, expr_def['sha'])
|
|
81
|
+
if not is_valid:
|
|
82
|
+
self.logger.warning(
|
|
83
|
+
f"Expression '{name}' in namespace '{namespace}' failed integrity check"
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Log evaluation
|
|
87
|
+
self.logger.info(f"Evaluating expression: {namespace}:{name}")
|
|
88
|
+
else:
|
|
89
|
+
# Inline expression
|
|
90
|
+
expr_string = expression
|
|
91
|
+
self.logger.info(f"Evaluating inline expression")
|
|
169
92
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
node_type = ast_node.get("type")
|
|
181
|
-
|
|
182
|
-
if node_type == "column":
|
|
183
|
-
return pl.col(ast_node["name"])
|
|
184
|
-
|
|
185
|
-
elif node_type == "literal":
|
|
186
|
-
return pl.lit(ast_node["value"])
|
|
187
|
-
|
|
188
|
-
elif node_type == "binary":
|
|
189
|
-
left = self._ast_to_polars_expr(ast_node["left"])
|
|
190
|
-
right = self._ast_to_polars_expr(ast_node["right"])
|
|
191
|
-
op = ast_node["op"]
|
|
192
|
-
|
|
193
|
-
if op == "+":
|
|
194
|
-
return left + right
|
|
195
|
-
elif op == "-":
|
|
196
|
-
return left - right
|
|
197
|
-
elif op == "*":
|
|
198
|
-
return left * right
|
|
199
|
-
elif op == "/":
|
|
200
|
-
return left / right
|
|
201
|
-
elif op == "**":
|
|
202
|
-
return left ** right
|
|
203
|
-
elif op == "%":
|
|
204
|
-
return left % right
|
|
205
|
-
elif op == "//":
|
|
206
|
-
return left // right
|
|
207
|
-
else:
|
|
208
|
-
raise PolarsExpressionError(f"Unsupported binary operator: {op}")
|
|
209
|
-
|
|
210
|
-
elif node_type == "cmp":
|
|
211
|
-
left = self._ast_to_polars_expr(ast_node["left"])
|
|
212
|
-
right = self._ast_to_polars_expr(ast_node["right"])
|
|
213
|
-
op = ast_node["op"]
|
|
214
|
-
|
|
215
|
-
if op == "==":
|
|
216
|
-
return left == right
|
|
217
|
-
elif op == "!=":
|
|
218
|
-
return left != right
|
|
219
|
-
elif op == ">":
|
|
220
|
-
return left > right
|
|
221
|
-
elif op == "<":
|
|
222
|
-
return left < right
|
|
223
|
-
elif op == ">=":
|
|
224
|
-
return left >= right
|
|
225
|
-
elif op == "<=":
|
|
226
|
-
return left <= right
|
|
227
|
-
else:
|
|
228
|
-
raise PolarsExpressionError(f"Unsupported comparison operator: {op}")
|
|
229
|
-
|
|
230
|
-
elif node_type == "bool_op":
|
|
231
|
-
op = ast_node["op"]
|
|
232
|
-
values = [self._ast_to_polars_expr(v) for v in ast_node["values"]]
|
|
233
|
-
|
|
234
|
-
if op == "and":
|
|
235
|
-
result = values[0]
|
|
236
|
-
for v in values[1:]:
|
|
237
|
-
result = result & v
|
|
238
|
-
return result
|
|
239
|
-
elif op == "or":
|
|
240
|
-
result = values[0]
|
|
241
|
-
for v in values[1:]:
|
|
242
|
-
result = result | v
|
|
243
|
-
return result
|
|
244
|
-
else:
|
|
245
|
-
raise PolarsExpressionError(f"Unsupported boolean operator: {op}")
|
|
246
|
-
|
|
247
|
-
elif node_type == "unary_bool":
|
|
248
|
-
op = ast_node["op"]
|
|
249
|
-
value = self._ast_to_polars_expr(ast_node["value"])
|
|
250
|
-
|
|
251
|
-
if op == "not":
|
|
252
|
-
return ~value
|
|
253
|
-
else:
|
|
254
|
-
raise PolarsExpressionError(f"Unsupported unary boolean operator: {op}")
|
|
255
|
-
|
|
256
|
-
elif node_type == "if_expr":
|
|
257
|
-
# Ternary: a if cond else b
|
|
258
|
-
cond = self._ast_to_polars_expr(ast_node["cond"])
|
|
259
|
-
then_expr = self._ast_to_polars_expr(ast_node["then"])
|
|
260
|
-
else_expr = self._ast_to_polars_expr(ast_node["else"])
|
|
261
|
-
|
|
262
|
-
return pl.when(cond).then(then_expr).otherwise(else_expr)
|
|
263
|
-
|
|
264
|
-
elif node_type == "call":
|
|
265
|
-
# Function calls
|
|
266
|
-
func_name = ast_node["name"]
|
|
267
|
-
args = [self._ast_to_polars_expr(arg) for arg in ast_node["args"]]
|
|
268
|
-
|
|
269
|
-
return self._handle_function_call(func_name, args)
|
|
270
|
-
|
|
271
|
-
else:
|
|
272
|
-
raise PolarsExpressionError(f"Unsupported AST node type: {node_type}")
|
|
273
|
-
|
|
274
|
-
except Exception as e:
|
|
275
|
-
raise PolarsExpressionError(f"AST to Polars conversion failed: {e}")
|
|
93
|
+
# Parse expression to AST
|
|
94
|
+
ast = self.parser.parse(expr_string)
|
|
95
|
+
|
|
96
|
+
# Compile AST to Polars expression
|
|
97
|
+
polars_expr = self.compiler.compile(ast, df)
|
|
98
|
+
|
|
99
|
+
# Execute and return result
|
|
100
|
+
result = df.select(polars_expr.alias('result'))['result']
|
|
101
|
+
|
|
102
|
+
return result
|
|
276
103
|
|
|
277
|
-
def
|
|
104
|
+
def load_namespace(self, namespace: str, folder_path: str):
|
|
278
105
|
"""
|
|
279
|
-
|
|
106
|
+
Load expressions from a namespace folder.
|
|
280
107
|
|
|
281
108
|
Args:
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
Returns:
|
|
286
|
-
Polars expression for the function call
|
|
109
|
+
namespace: Namespace name
|
|
110
|
+
folder_path: Path to folder containing .add files
|
|
287
111
|
|
|
288
|
-
|
|
289
|
-
|
|
112
|
+
Example:
|
|
113
|
+
engine.load_namespace('inbuilt', '/path/to/inbuilt_expressions')
|
|
290
114
|
"""
|
|
291
|
-
|
|
292
|
-
if len(args) == 1:
|
|
293
|
-
return args[0].min()
|
|
294
|
-
else:
|
|
295
|
-
# Element-wise minimum of multiple expressions
|
|
296
|
-
result = args[0]
|
|
297
|
-
for arg in args[1:]:
|
|
298
|
-
result = pl.min_horizontal([result, arg])
|
|
299
|
-
return result
|
|
300
|
-
|
|
301
|
-
elif func_name == "max":
|
|
302
|
-
if len(args) == 1:
|
|
303
|
-
return args[0].max()
|
|
304
|
-
else:
|
|
305
|
-
# Element-wise maximum of multiple expressions
|
|
306
|
-
result = args[0]
|
|
307
|
-
for arg in args[1:]:
|
|
308
|
-
result = pl.max_horizontal([result, arg])
|
|
309
|
-
return result
|
|
310
|
-
|
|
311
|
-
elif func_name == "abs":
|
|
312
|
-
if len(args) != 1:
|
|
313
|
-
raise PolarsExpressionError("abs() requires exactly 1 argument")
|
|
314
|
-
return args[0].abs()
|
|
315
|
-
|
|
316
|
-
elif func_name == "log":
|
|
317
|
-
if len(args) == 1:
|
|
318
|
-
return args[0].log()
|
|
319
|
-
elif len(args) == 2:
|
|
320
|
-
# log(value, base)
|
|
321
|
-
return args[0].log() / args[1].log()
|
|
322
|
-
else:
|
|
323
|
-
raise PolarsExpressionError("log() requires 1 or 2 arguments")
|
|
115
|
+
folder = Path(folder_path)
|
|
324
116
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
return args[0].exp()
|
|
117
|
+
if not folder.exists():
|
|
118
|
+
self.logger.warning(f"Namespace folder not found: {folder_path}")
|
|
119
|
+
return
|
|
329
120
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
raise PolarsExpressionError("sqrt() requires exactly 1 argument")
|
|
333
|
-
return args[0].sqrt()
|
|
121
|
+
# Find all .add files
|
|
122
|
+
add_files = list(folder.glob('*.add'))
|
|
334
123
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
return args[0] ** args[1]
|
|
124
|
+
if not add_files:
|
|
125
|
+
self.logger.info(f"No .add files found in {folder_path}")
|
|
126
|
+
return
|
|
339
127
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
128
|
+
# Load expressions from each file
|
|
129
|
+
loaded_count = 0
|
|
130
|
+
for add_file in add_files:
|
|
131
|
+
try:
|
|
132
|
+
expressions = load_expressions_from_file(str(add_file), namespace)
|
|
133
|
+
|
|
134
|
+
# Store expressions
|
|
135
|
+
for name, expr_def in expressions.items():
|
|
136
|
+
# Create full reference
|
|
137
|
+
full_ref = f"{namespace}:{name}"
|
|
138
|
+
|
|
139
|
+
# Check for duplicates
|
|
140
|
+
if full_ref in self.loaded_expressions:
|
|
141
|
+
self.logger.warning(
|
|
142
|
+
f"Duplicate expression '{name}' in namespace '{namespace}' "
|
|
143
|
+
f"(from {add_file.name})"
|
|
144
|
+
)
|
|
145
|
+
continue
|
|
146
|
+
|
|
147
|
+
# Add source file info
|
|
148
|
+
expr_def['source_file'] = add_file.name
|
|
149
|
+
|
|
150
|
+
# Store expression
|
|
151
|
+
self.loaded_expressions[full_ref] = expr_def
|
|
152
|
+
loaded_count += 1
|
|
153
|
+
|
|
154
|
+
except Exception as e:
|
|
155
|
+
self.logger.error(f"Error loading {add_file.name}: {str(e)}")
|
|
361
156
|
|
|
362
|
-
|
|
363
|
-
|
|
157
|
+
self.logger.info(
|
|
158
|
+
f"Loaded {loaded_count} expressions from namespace '{namespace}'"
|
|
159
|
+
)
|
|
364
160
|
|
|
365
|
-
def
|
|
366
|
-
backend_type: Optional[str] = None) -> ExpressionResult:
|
|
161
|
+
def get_expression(self, reference: str) -> Dict:
|
|
367
162
|
"""
|
|
368
|
-
|
|
163
|
+
Get expression definition from reference.
|
|
369
164
|
|
|
370
165
|
Args:
|
|
371
|
-
|
|
372
|
-
ast_tree: Pre-built AST tree
|
|
373
|
-
output_column: Name for output column
|
|
374
|
-
backend_type: Source backend type
|
|
166
|
+
reference: Expression reference ('inbuilt:bmi', 'myfolder:roi')
|
|
375
167
|
|
|
376
168
|
Returns:
|
|
377
|
-
|
|
378
|
-
"""
|
|
379
|
-
start_time = datetime.now()
|
|
380
|
-
|
|
381
|
-
try:
|
|
382
|
-
# Auto-detect backend if not specified
|
|
383
|
-
if backend_type is None:
|
|
384
|
-
backend_type = self.arrow_bridge.detect_backend(df)
|
|
385
|
-
|
|
386
|
-
# Get memory usage before processing
|
|
387
|
-
memory_before = self.arrow_bridge._get_memory_usage_mb()
|
|
388
|
-
|
|
389
|
-
# Convert to Polars via Arrow
|
|
390
|
-
arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
|
|
391
|
-
polars_df = pl.from_arrow(arrow_table)
|
|
392
|
-
|
|
393
|
-
# Execute using AST
|
|
394
|
-
polars_expr = self._ast_to_polars_expr(ast_tree)
|
|
395
|
-
result_df = polars_df.with_columns([polars_expr.alias(output_column)])
|
|
169
|
+
Dictionary with expression definition
|
|
396
170
|
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
|
|
400
|
-
|
|
401
|
-
# Calculate statistics
|
|
402
|
-
execution_time = (datetime.now() - start_time).total_seconds() * 1000
|
|
403
|
-
memory_after = self.arrow_bridge._get_memory_usage_mb()
|
|
404
|
-
memory_used = max(0, memory_after - memory_before)
|
|
405
|
-
|
|
406
|
-
# Update statistics
|
|
407
|
-
self.execution_stats["total_executions"] += 1
|
|
408
|
-
self.execution_stats["total_time_ms"] += execution_time
|
|
409
|
-
self.execution_stats["total_rows_processed"] += result_df.height
|
|
171
|
+
Raises:
|
|
172
|
+
ValueError: If expression not found
|
|
410
173
|
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
174
|
+
Example:
|
|
175
|
+
expr_def = engine.get_expression('inbuilt:bmi')
|
|
176
|
+
"""
|
|
177
|
+
if reference not in self.loaded_expressions:
|
|
178
|
+
raise ValueError(
|
|
179
|
+
f"Expression '{reference}' not found. "
|
|
180
|
+
f"Available expressions: {list(self.loaded_expressions.keys())}"
|
|
418
181
|
)
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
self.execution_stats["errors"] += 1
|
|
422
|
-
raise PolarsExpressionError(f"AST execution failed: {e}")
|
|
423
|
-
|
|
424
|
-
finally:
|
|
425
|
-
self.arrow_bridge.cleanup_arrow_memory()
|
|
182
|
+
|
|
183
|
+
return self.loaded_expressions[reference]
|
|
426
184
|
|
|
427
|
-
def
|
|
185
|
+
def list_expressions(self, namespace: Optional[str] = None) -> List[Dict]:
|
|
428
186
|
"""
|
|
429
|
-
|
|
187
|
+
List all available expressions.
|
|
430
188
|
|
|
431
189
|
Args:
|
|
432
|
-
|
|
190
|
+
namespace: Filter by namespace (None = all)
|
|
433
191
|
|
|
434
192
|
Returns:
|
|
435
|
-
|
|
436
|
-
"""
|
|
437
|
-
try:
|
|
438
|
-
# Clean up multiline expressions
|
|
439
|
-
cleaned_expression = ' '.join(line.strip() for line in expression.strip().split('\n') if line.strip())
|
|
193
|
+
List of expression dictionaries
|
|
440
194
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
195
|
+
Example:
|
|
196
|
+
# List all expressions
|
|
197
|
+
all_exprs = engine.list_expressions()
|
|
444
198
|
|
|
445
|
-
#
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
199
|
+
# List only inbuilt
|
|
200
|
+
inbuilt = engine.list_expressions('inbuilt')
|
|
201
|
+
"""
|
|
202
|
+
if namespace is None:
|
|
203
|
+
return list(self.loaded_expressions.values())
|
|
204
|
+
|
|
205
|
+
# Filter by namespace
|
|
206
|
+
return [
|
|
207
|
+
expr_def for ref, expr_def in self.loaded_expressions.items()
|
|
208
|
+
if ref.startswith(f"{namespace}:")
|
|
209
|
+
]
|
|
210
|
+
|
|
211
|
+
def reload_custom_namespace(self):
|
|
212
|
+
"""
|
|
213
|
+
Reload custom namespace expressions.
|
|
214
|
+
|
|
215
|
+
Reloads all .add files from custom folder.
|
|
216
|
+
"""
|
|
217
|
+
from additory.core.config import Config
|
|
218
|
+
|
|
219
|
+
config = Config()
|
|
220
|
+
custom_folder = config.get_expressions_folder()
|
|
221
|
+
|
|
222
|
+
if custom_folder:
|
|
223
|
+
# Clear existing custom expressions
|
|
224
|
+
self.clear_custom_namespace()
|
|
449
225
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
return False
|
|
226
|
+
# Reload
|
|
227
|
+
self.load_namespace('user', custom_folder)
|
|
453
228
|
|
|
454
|
-
def
|
|
455
|
-
"""
|
|
456
|
-
|
|
229
|
+
def clear_custom_namespace(self):
|
|
230
|
+
"""Clear custom namespace."""
|
|
231
|
+
# Remove all expressions that don't start with 'inbuilt:'
|
|
232
|
+
to_remove = [
|
|
233
|
+
ref for ref in self.loaded_expressions.keys()
|
|
234
|
+
if not ref.startswith('inbuilt:')
|
|
235
|
+
]
|
|
457
236
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
237
|
+
for ref in to_remove:
|
|
238
|
+
del self.loaded_expressions[ref]
|
|
239
|
+
|
|
240
|
+
self.logger.info(f"Cleared {len(to_remove)} custom expressions")
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def get_engine() -> ExpressionEngine:
|
|
244
|
+
"""
|
|
245
|
+
Get the global expression engine instance.
|
|
246
|
+
|
|
247
|
+
Returns:
|
|
248
|
+
Global ExpressionEngine instance
|
|
464
249
|
|
|
465
|
-
|
|
250
|
+
Example:
|
|
251
|
+
engine = get_engine()
|
|
252
|
+
result = engine.evaluate(df, 'inbuilt:bmi')
|
|
253
|
+
"""
|
|
254
|
+
global _engine_instance
|
|
466
255
|
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
self.execution_stats = {
|
|
470
|
-
"total_executions": 0,
|
|
471
|
-
"total_time_ms": 0.0,
|
|
472
|
-
"total_rows_processed": 0,
|
|
473
|
-
"errors": 0
|
|
474
|
-
}
|
|
475
|
-
log_info("[polars_engine] Statistics reset")
|
|
256
|
+
if _engine_instance is None:
|
|
257
|
+
_engine_instance = ExpressionEngine()
|
|
476
258
|
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
259
|
+
return _engine_instance
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def parse_expression_reference(expression: str) -> Tuple[str, str]:
|
|
263
|
+
"""
|
|
264
|
+
Parse expression reference into namespace and name.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
expression: Expression string
|
|
481
268
|
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
expression: Expression to benchmark
|
|
485
|
-
output_column: Output column name
|
|
486
|
-
iterations: Number of iterations
|
|
487
|
-
|
|
488
|
-
Returns:
|
|
489
|
-
Benchmark results
|
|
490
|
-
"""
|
|
491
|
-
times = []
|
|
492
|
-
backend_type = self.arrow_bridge.detect_backend(df)
|
|
269
|
+
Returns:
|
|
270
|
+
Tuple of (namespace, name)
|
|
493
271
|
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
result = self.execute_expression(df, expression, output_column, backend_type)
|
|
497
|
-
times.append(result.execution_time_ms)
|
|
498
|
-
except Exception as e:
|
|
499
|
-
log_warning(f"[polars_engine] Benchmark iteration {i+1} failed: {e}")
|
|
500
|
-
continue
|
|
272
|
+
Raises:
|
|
273
|
+
ValueError: If not a valid reference
|
|
501
274
|
|
|
502
|
-
|
|
503
|
-
|
|
275
|
+
Example:
|
|
276
|
+
namespace, name = parse_expression_reference('inbuilt:bmi')
|
|
277
|
+
# Returns: ('inbuilt', 'bmi')
|
|
278
|
+
"""
|
|
279
|
+
if ':' not in expression:
|
|
280
|
+
raise ValueError(f"Invalid expression reference: {expression}")
|
|
281
|
+
|
|
282
|
+
parts = expression.split(':', 1)
|
|
283
|
+
namespace = parts[0]
|
|
284
|
+
name = parts[1]
|
|
285
|
+
|
|
286
|
+
return namespace, name
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def is_reference(expression: str) -> bool:
|
|
290
|
+
"""
|
|
291
|
+
Check if expression is a reference (not inline).
|
|
292
|
+
|
|
293
|
+
Args:
|
|
294
|
+
expression: Expression string
|
|
504
295
|
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
296
|
+
Returns:
|
|
297
|
+
True if reference, False if inline
|
|
298
|
+
|
|
299
|
+
Example:
|
|
300
|
+
is_reference('inbuilt:bmi') # True
|
|
301
|
+
is_reference('weight / height') # False
|
|
302
|
+
"""
|
|
303
|
+
# A reference has the format: namespace:name
|
|
304
|
+
# It should have exactly one colon and no spaces before the colon
|
|
305
|
+
if ':' not in expression:
|
|
306
|
+
return False
|
|
514
307
|
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
"round", "floor", "ceil"
|
|
520
|
-
]
|
|
308
|
+
# Check if it looks like a reference (namespace:name)
|
|
309
|
+
parts = expression.split(':', 1)
|
|
310
|
+
if len(parts) != 2:
|
|
311
|
+
return False
|
|
521
312
|
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
return {
|
|
525
|
-
"arithmetic": ["+", "-", "*", "/", "**", "%", "//"],
|
|
526
|
-
"comparison": ["==", "!=", ">", "<", ">=", "<="],
|
|
527
|
-
"boolean": ["and", "or", "not"],
|
|
528
|
-
"conditional": ["if_else"]
|
|
529
|
-
}
|
|
313
|
+
namespace = parts[0].strip()
|
|
314
|
+
name = parts[1].strip()
|
|
530
315
|
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
except Exception as e:
|
|
543
|
-
log_warning(f"[polars_engine] Cleanup callback failed: {e}")
|
|
316
|
+
# Namespace and name should be valid identifiers (no spaces, operators, etc.)
|
|
317
|
+
if not namespace or not name:
|
|
318
|
+
return False
|
|
319
|
+
|
|
320
|
+
# Check if namespace looks like an identifier
|
|
321
|
+
if not namespace.replace('_', '').isalnum():
|
|
322
|
+
return False
|
|
323
|
+
|
|
324
|
+
# Check if name looks like an identifier
|
|
325
|
+
if not name.replace('_', '').isalnum():
|
|
326
|
+
return False
|
|
544
327
|
|
|
545
|
-
|
|
546
|
-
"""Cleanup when engine is destroyed"""
|
|
547
|
-
try:
|
|
548
|
-
if hasattr(self, 'memory_manager'):
|
|
549
|
-
self.memory_manager.unregister_cleanup_callback(self._cleanup_callback)
|
|
550
|
-
except Exception:
|
|
551
|
-
pass
|
|
328
|
+
return True
|