additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,601 +0,0 @@
1
- # polars_expression_engine.py
2
- # Polars-only expression processing engine for enhanced expressions system
3
-
4
- import polars as pl
5
- from typing import Any, Dict, Optional, Union
6
- from dataclasses import dataclass
7
- from datetime import datetime
8
-
9
- from .backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
10
- from .ast_builder import build_ast_from_expression
11
- from .logging import log_info, log_warning
12
- from .memory_manager import get_memory_manager
13
-
14
-
15
@dataclass
class ExpressionResult:
    """Outcome of a single expression execution.

    Attributes:
        dataframe: Resulting dataframe, converted back to the caller's backend.
        execution_time_ms: Elapsed time of the execution in milliseconds.
        rows_processed: Row count (height) of the result frame.
        columns_processed: Column count (width) of the result frame.
        backend_type: Name of the backend the input dataframe came from.
        memory_used_mb: Growth in reported memory usage during the run, in MB
            (never negative).
    """

    dataframe: Any
    execution_time_ms: float
    rows_processed: int
    columns_processed: int
    backend_type: str
    memory_used_mb: float
24
-
25
-
26
class PolarsExpressionError(Exception):
    """Signals a failure while parsing, converting or executing an expression in Polars."""
29
-
30
-
31
class PolarsExpressionEngine:
    """Expression engine that evaluates every expression with Polars.

    An input dataframe is converted to a Polars DataFrame (through the Arrow
    bridge when one could be created, otherwise directly from pandas), the
    expression is evaluated as a Polars expression and appended as a new
    column, and the result is converted back to the source backend format.
    Running totals are kept in ``execution_stats``.
    """

    # Dispatch tables mapping AST operator tokens to their implementation.
    _BINARY_OPS = {
        "+": lambda a, b: a + b,
        "-": lambda a, b: a - b,
        "*": lambda a, b: a * b,
        "/": lambda a, b: a / b,
        "**": lambda a, b: a ** b,
        "%": lambda a, b: a % b,
        "//": lambda a, b: a // b,
    }
    _COMPARISON_OPS = {
        "==": lambda a, b: a == b,
        "!=": lambda a, b: a != b,
        ">": lambda a, b: a > b,
        "<": lambda a, b: a < b,
        ">=": lambda a, b: a >= b,
        "<=": lambda a, b: a <= b,
    }
    # Functions that take exactly one argument and map to the Polars
    # expression method of the same name.
    _SINGLE_ARG_FUNCTIONS = ("abs", "exp", "sqrt", "floor", "ceil")

    def __init__(self):
        """Create the Arrow bridge, zero the statistics and register cleanup."""
        try:
            self.arrow_bridge = EnhancedArrowBridge()
        except ArrowBridgeError:
            # Without a bridge the engine falls back to pandas-only conversion.
            self.arrow_bridge = None
        self.execution_stats = self._new_stats()

        # Register with memory manager for cleanup
        self.memory_manager = get_memory_manager()
        self.memory_manager.register_cleanup_callback(self._cleanup_callback)

    # ------------------------------------------------------------------
    # Private helpers shared by the execution entry points
    # ------------------------------------------------------------------

    @staticmethod
    def _new_stats() -> Dict[str, Any]:
        """Return a fresh, zeroed statistics dictionary."""
        return {
            "total_executions": 0,
            "total_time_ms": 0.0,
            "total_rows_processed": 0,
            "errors": 0,
        }

    @staticmethod
    def _clean_expression(expression: str) -> str:
        """Collapse a multiline expression into one line, dropping blank lines."""
        return ' '.join(
            line.strip() for line in expression.strip().split('\n') if line.strip()
        )

    def _resolve_backend(self, df: Any, backend_type: Optional[str]) -> str:
        """Return ``backend_type``, auto-detecting it when it is None."""
        if backend_type is not None:
            return backend_type
        if self.arrow_bridge:
            return self.arrow_bridge.detect_backend(df)
        return "pandas"  # fallback when no bridge is available

    def _memory_usage_mb(self):
        """Return the bridge's reported memory usage in MB, or 0 without a bridge."""
        if self.arrow_bridge:
            return self.arrow_bridge._get_memory_usage_mb()
        return 0

    def _build_result(self, result_df: pl.DataFrame, final_result: Any,
                      backend_type: str, start_time: datetime,
                      memory_before) -> ExpressionResult:
        """Update the running statistics and package a successful execution."""
        execution_time = (datetime.now() - start_time).total_seconds() * 1000
        # Clamp at zero: usage may have dropped while the expression ran.
        memory_used = max(0, self._memory_usage_mb() - memory_before)

        self.execution_stats["total_executions"] += 1
        self.execution_stats["total_time_ms"] += execution_time
        self.execution_stats["total_rows_processed"] += result_df.height

        return ExpressionResult(
            dataframe=final_result,
            execution_time_ms=execution_time,
            rows_processed=result_df.height,
            columns_processed=result_df.width,
            backend_type=backend_type,
            memory_used_mb=memory_used,
        )

    # ------------------------------------------------------------------
    # Execution
    # ------------------------------------------------------------------

    def execute_expression(self, df: Any, expression: str, output_column: str,
                           backend_type: Optional[str] = None) -> ExpressionResult:
        """
        Execute expression using Polars exclusively

        Args:
            df: Input dataframe (any supported backend)
            expression: Expression string to execute
            output_column: Name for the output column
            backend_type: Source backend type (auto-detected if None)

        Returns:
            ExpressionResult with processed dataframe and statistics

        Raises:
            PolarsExpressionError: If expression execution fails; the
                underlying exception is attached as ``__cause__``
        """
        start_time = datetime.now()

        # Use memory context for monitoring
        with self.memory_manager.memory_context(f"expression: {expression[:50]}..."):
            try:
                backend_type = self._resolve_backend(df, backend_type)
                memory_before = self._memory_usage_mb()

                # 1. Convert input to Arrow
                log_info(f"[polars_engine] Converting {backend_type} to Arrow")
                if self.arrow_bridge:
                    arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
                else:
                    # Fallback: assume pandas and convert directly
                    import pandas as pd
                    if not isinstance(df, pd.DataFrame):
                        raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
                    arrow_table = pl.from_pandas(df).to_arrow()

                # 2. Convert Arrow to Polars
                log_info("[polars_engine] Converting Arrow to Polars")
                polars_df = pl.from_arrow(arrow_table)

                # 3. Execute expression in Polars
                log_info(f"[polars_engine] Executing expression: {expression}")
                result_df = self._execute_polars_expression(
                    polars_df, expression, output_column
                )

                # 4. Convert back to Arrow
                log_info("[polars_engine] Converting result to Arrow")
                result_arrow = result_df.to_arrow()

                # 5. Convert to original backend format
                log_info(f"[polars_engine] Converting Arrow to {backend_type}")
                if self.arrow_bridge:
                    final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
                else:
                    # Fallback: convert back to pandas
                    final_result = pl.from_arrow(result_arrow).to_pandas()

                result = self._build_result(
                    result_df, final_result, backend_type, start_time, memory_before
                )
                log_info(
                    f"[polars_engine] Expression executed successfully in {result.execution_time_ms:.1f}ms"
                )
                return result

            except Exception as e:
                self.execution_stats["errors"] += 1
                raise PolarsExpressionError(f"Expression execution failed: {e}") from e

            finally:
                # 6. Always cleanup Arrow memory
                if self.arrow_bridge:
                    self.arrow_bridge.cleanup_arrow_memory()

    def _execute_polars_expression(self, polars_df: pl.DataFrame,
                                   expression: str, output_column: str) -> pl.DataFrame:
        """
        Parse an expression string and evaluate it on a Polars DataFrame

        Args:
            polars_df: Input Polars DataFrame
            expression: Expression string
            output_column: Name for output column

        Returns:
            Polars DataFrame with new column

        Raises:
            PolarsExpressionError: If parsing, conversion or execution fails
        """
        try:
            # Build AST from the single-line form of the expression
            ast_tree = build_ast_from_expression(self._clean_expression(expression))

            if ast_tree is None:
                raise PolarsExpressionError(f"Failed to parse expression: {expression}")

            # Convert AST to Polars expression
            polars_expr = self._ast_to_polars_expr(ast_tree)

            # Execute expression and add as new column
            return polars_df.with_columns([polars_expr.alias(output_column)])

        except Exception as e:
            raise PolarsExpressionError(f"Polars expression execution failed: {e}") from e

    def _ast_to_polars_expr(self, ast_node: Dict[str, Any]) -> pl.Expr:
        """
        Convert expression AST to Polars expression

        Args:
            ast_node: AST node dictionary

        Returns:
            Polars expression

        Raises:
            PolarsExpressionError: If AST conversion fails
        """
        # The recursion lives in _convert_node so that a failure deep in the
        # tree is wrapped exactly once here, instead of once per nesting level.
        try:
            return self._convert_node(ast_node)
        except Exception as e:
            raise PolarsExpressionError(f"AST to Polars conversion failed: {e}") from e

    def _convert_node(self, ast_node: Dict[str, Any]) -> pl.Expr:
        """Recursively translate one AST node; errors propagate unwrapped."""
        node_type = ast_node.get("type")

        if node_type == "column":
            return pl.col(ast_node["name"])

        if node_type == "literal":
            return pl.lit(ast_node["value"])

        if node_type == "binary":
            left = self._convert_node(ast_node["left"])
            right = self._convert_node(ast_node["right"])
            op = ast_node["op"]
            if op not in self._BINARY_OPS:
                raise PolarsExpressionError(f"Unsupported binary operator: {op}")
            return self._BINARY_OPS[op](left, right)

        if node_type == "cmp":
            left = self._convert_node(ast_node["left"])
            right = self._convert_node(ast_node["right"])
            op = ast_node["op"]
            if op not in self._COMPARISON_OPS:
                raise PolarsExpressionError(f"Unsupported comparison operator: {op}")
            return self._COMPARISON_OPS[op](left, right)

        if node_type == "bool_op":
            op = ast_node["op"]
            values = [self._convert_node(v) for v in ast_node["values"]]
            if op not in ("and", "or"):
                raise PolarsExpressionError(f"Unsupported boolean operator: {op}")
            # Fold left-to-right with the element-wise & / | operators.
            combined = values[0]
            for operand in values[1:]:
                combined = (combined & operand) if op == "and" else (combined | operand)
            return combined

        if node_type == "unary_bool":
            op = ast_node["op"]
            value = self._convert_node(ast_node["value"])
            if op != "not":
                raise PolarsExpressionError(f"Unsupported unary boolean operator: {op}")
            return ~value

        if node_type == "if_expr":
            # Ternary: a if cond else b
            cond = self._convert_node(ast_node["cond"])
            then_expr = self._convert_node(ast_node["then"])
            else_expr = self._convert_node(ast_node["else"])
            return pl.when(cond).then(then_expr).otherwise(else_expr)

        if node_type == "call":
            func_name = ast_node["name"]
            arg_nodes = ast_node["args"]
            args = [self._convert_node(arg) for arg in arg_nodes]

            # round() needs its decimals as a plain Python int. Once the
            # literal has been wrapped in pl.lit() the value is no longer
            # directly usable, so hand the raw AST literal through instead.
            if func_name == "round" and len(arg_nodes) == 2:
                decimals_node = arg_nodes[1]
                if isinstance(decimals_node, dict) and decimals_node.get("type") == "literal":
                    args[1] = decimals_node["value"]

            return self._handle_function_call(func_name, args)

        raise PolarsExpressionError(f"Unsupported AST node type: {node_type}")

    def _handle_function_call(self, func_name: str, args: list) -> pl.Expr:
        """
        Handle function calls in expressions

        Args:
            func_name: Name of the function
            args: List of Polars expressions as arguments. For ``round`` the
                optional second item must be a plain ``int`` (the number of
                decimals), not an expression.

        Returns:
            Polars expression for the function call

        Raises:
            PolarsExpressionError: If the function is not supported or is
                called with the wrong number or kind of arguments
        """
        if func_name in ("min", "max"):
            if not args:
                raise PolarsExpressionError(f"{func_name}() requires at least 1 argument")
            if len(args) == 1:
                # Single argument: aggregate over the column.
                return args[0].min() if func_name == "min" else args[0].max()
            # Several arguments: element-wise minimum/maximum across them.
            if func_name == "min":
                return pl.min_horizontal(args)
            return pl.max_horizontal(args)

        if func_name in self._SINGLE_ARG_FUNCTIONS:
            if len(args) != 1:
                raise PolarsExpressionError(f"{func_name}() requires exactly 1 argument")
            return getattr(args[0], func_name)()

        if func_name == "log":
            if len(args) == 1:
                return args[0].log()
            if len(args) == 2:
                # log(value, base) via change of base
                return args[0].log() / args[1].log()
            raise PolarsExpressionError("log() requires 1 or 2 arguments")

        if func_name == "pow":
            if len(args) != 2:
                raise PolarsExpressionError("pow() requires exactly 2 arguments")
            return args[0] ** args[1]

        if func_name == "round":
            if len(args) == 1:
                return args[0].round(0)
            if len(args) == 2:
                decimals = args[1]
                # bool is a subclass of int but is not a meaningful decimals value.
                if isinstance(decimals, bool) or not isinstance(decimals, int):
                    raise PolarsExpressionError("round() decimals parameter must be a literal integer")
                return args[0].round(decimals)
            raise PolarsExpressionError("round() requires 1 or 2 arguments")

        raise PolarsExpressionError(f"Unsupported function: {func_name}")

    def execute_with_ast(self, df: Any, ast_tree: Dict[str, Any], output_column: str,
                         backend_type: Optional[str] = None) -> ExpressionResult:
        """
        Execute expression using pre-built AST

        Args:
            df: Input dataframe
            ast_tree: Pre-built AST tree
            output_column: Name for output column
            backend_type: Source backend type (auto-detected if None)

        Returns:
            ExpressionResult with processed dataframe

        Raises:
            PolarsExpressionError: If conversion or execution fails; the
                underlying exception is attached as ``__cause__``
        """
        start_time = datetime.now()

        try:
            backend_type = self._resolve_backend(df, backend_type)
            memory_before = self._memory_usage_mb()

            # Convert to Polars via Arrow
            if self.arrow_bridge:
                arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
                polars_df = pl.from_arrow(arrow_table)
            else:
                # Fallback: assume pandas
                import pandas as pd
                if not isinstance(df, pd.DataFrame):
                    raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
                polars_df = pl.from_pandas(df)

            # Execute using AST
            polars_expr = self._ast_to_polars_expr(ast_tree)
            result_df = polars_df.with_columns([polars_expr.alias(output_column)])

            # Convert back to original format
            result_arrow = result_df.to_arrow()
            if self.arrow_bridge:
                final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
            else:
                final_result = pl.from_arrow(result_arrow).to_pandas()

            return self._build_result(
                result_df, final_result, backend_type, start_time, memory_before
            )

        except Exception as e:
            self.execution_stats["errors"] += 1
            raise PolarsExpressionError(f"AST execution failed: {e}") from e

        finally:
            if self.arrow_bridge:
                self.arrow_bridge.cleanup_arrow_memory()

    # ------------------------------------------------------------------
    # Validation, statistics and introspection
    # ------------------------------------------------------------------

    def validate_expression(self, expression: str) -> bool:
        """
        Validate expression syntax without executing

        Args:
            expression: Expression string to validate

        Returns:
            True if the expression parses and converts to a Polars
            expression, False otherwise (the failure is logged as a warning)
        """
        try:
            ast_tree = build_ast_from_expression(self._clean_expression(expression))
            if ast_tree is None:
                return False

            # Dry-run conversion: catches unsupported functions and operators.
            self._ast_to_polars_expr(ast_tree)
            return True

        except Exception as e:
            log_warning(f"[polars_engine] Expression validation failed: {e}")
            return False

    def get_execution_stats(self) -> Dict[str, Any]:
        """Return a copy of the statistics extended with per-execution averages."""
        stats = self.execution_stats.copy()
        executions = stats["total_executions"]

        if executions > 0:
            stats["avg_time_ms"] = stats["total_time_ms"] / executions
            stats["avg_rows_per_execution"] = stats["total_rows_processed"] / executions
        else:
            stats["avg_time_ms"] = 0.0
            stats["avg_rows_per_execution"] = 0

        return stats

    def reset_stats(self):
        """Reset execution statistics"""
        self.execution_stats = self._new_stats()
        log_info("[polars_engine] Statistics reset")

    def benchmark_expression(self, df: Any, expression: str, output_column: str,
                             iterations: int = 3) -> Dict[str, Any]:
        """
        Benchmark expression execution performance

        Args:
            df: Input dataframe
            expression: Expression to benchmark
            output_column: Output column name
            iterations: Number of iterations

        Returns:
            Timing summary over the successful iterations, or a dictionary
            with only an ``"error"`` key when no iteration succeeded
        """
        times = []
        backend_type = self._resolve_backend(df, None)

        for i in range(iterations):
            try:
                result = self.execute_expression(df, expression, output_column, backend_type)
            except Exception as e:
                # A failed iteration is reported and left out of the timings.
                log_warning(f"[polars_engine] Benchmark iteration {i+1} failed: {e}")
                continue
            times.append(result.execution_time_ms)

        if not times:
            return {"error": "All benchmark iterations failed"}

        total = sum(times)
        return {
            "expression": expression,
            "backend_type": backend_type,
            "iterations": len(times),
            "min_time_ms": min(times),
            "max_time_ms": max(times),
            "avg_time_ms": total / len(times),
            "total_time_ms": total,
        }

    def get_supported_functions(self) -> list:
        """Get list of supported functions"""
        return [
            "min", "max", "abs", "log", "exp", "sqrt", "pow",
            "round", "floor", "ceil"
        ]

    def get_supported_operators(self) -> Dict[str, list]:
        """Get list of supported operators by category"""
        return {
            "arithmetic": ["+", "-", "*", "/", "**", "%", "//"],
            "comparison": ["==", "!=", ">", "<", ">=", "<="],
            "boolean": ["and", "or", "not"],
            "conditional": ["if_else"]
        }

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def _cleanup_callback(self):
        """Cleanup callback for memory manager"""
        try:
            # Cleanup Arrow bridge memory
            if self.arrow_bridge:
                self.arrow_bridge.cleanup_arrow_memory()

            # Reset statistics if they get too large
            if self.execution_stats["total_executions"] > 10000:
                log_info("[polars_engine] Resetting statistics due to high execution count")
                self.reset_stats()

        except Exception as e:
            log_warning(f"[polars_engine] Cleanup callback failed: {e}")

    def __del__(self):
        """Cleanup when engine is destroyed"""
        try:
            # memory_manager is missing if __init__ failed before assigning it.
            if hasattr(self, 'memory_manager'):
                self.memory_manager.unregister_cleanup_callback(self._cleanup_callback)
        except Exception:
            # Deliberately best-effort: a destructor must never raise.
            pass