additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. additory/__init__.py +15 -0
  2. additory/analysis/__init__.py +48 -0
  3. additory/analysis/cardinality.py +126 -0
  4. additory/analysis/correlations.py +124 -0
  5. additory/analysis/distributions.py +376 -0
  6. additory/analysis/quality.py +158 -0
  7. additory/analysis/scan.py +400 -0
  8. additory/augment/__init__.py +24 -0
  9. additory/augment/augmentor.py +653 -0
  10. additory/augment/builtin_lists.py +430 -0
  11. additory/augment/distributions.py +22 -0
  12. additory/augment/forecast.py +1132 -0
  13. additory/augment/list_registry.py +177 -0
  14. additory/augment/smote.py +320 -0
  15. additory/augment/strategies.py +883 -0
  16. additory/common/__init__.py +157 -0
  17. additory/common/backend.py +355 -0
  18. additory/common/column_utils.py +191 -0
  19. additory/common/distributions.py +737 -0
  20. additory/common/exceptions.py +62 -0
  21. additory/common/lists.py +229 -0
  22. additory/common/patterns.py +240 -0
  23. additory/common/resolver.py +567 -0
  24. additory/common/sample_data.py +182 -0
  25. additory/common/validation.py +197 -0
  26. additory/core/__init__.py +27 -0
  27. additory/core/ast_builder.py +165 -0
  28. additory/core/backends/__init__.py +23 -0
  29. additory/core/backends/arrow_bridge.py +476 -0
  30. additory/core/backends/cudf_bridge.py +355 -0
  31. additory/core/column_positioning.py +358 -0
  32. additory/core/compiler_polars.py +166 -0
  33. additory/core/config.py +342 -0
  34. additory/core/enhanced_cache_manager.py +1119 -0
  35. additory/core/enhanced_matchers.py +473 -0
  36. additory/core/enhanced_version_manager.py +325 -0
  37. additory/core/executor.py +59 -0
  38. additory/core/integrity_manager.py +477 -0
  39. additory/core/loader.py +190 -0
  40. additory/core/logging.py +24 -0
  41. additory/core/memory_manager.py +547 -0
  42. additory/core/namespace_manager.py +657 -0
  43. additory/core/parser.py +176 -0
  44. additory/core/polars_expression_engine.py +551 -0
  45. additory/core/registry.py +176 -0
  46. additory/core/sample_data_manager.py +492 -0
  47. additory/core/user_namespace.py +751 -0
  48. additory/core/validator.py +27 -0
  49. additory/dynamic_api.py +308 -0
  50. additory/expressions/__init__.py +26 -0
  51. additory/expressions/engine.py +551 -0
  52. additory/expressions/parser.py +176 -0
  53. additory/expressions/proxy.py +546 -0
  54. additory/expressions/registry.py +313 -0
  55. additory/expressions/samples.py +492 -0
  56. additory/synthetic/__init__.py +101 -0
  57. additory/synthetic/api.py +220 -0
  58. additory/synthetic/common_integration.py +314 -0
  59. additory/synthetic/config.py +262 -0
  60. additory/synthetic/engines.py +529 -0
  61. additory/synthetic/exceptions.py +180 -0
  62. additory/synthetic/file_managers.py +518 -0
  63. additory/synthetic/generator.py +702 -0
  64. additory/synthetic/generator_parser.py +68 -0
  65. additory/synthetic/integration.py +319 -0
  66. additory/synthetic/models.py +241 -0
  67. additory/synthetic/pattern_resolver.py +573 -0
  68. additory/synthetic/performance.py +469 -0
  69. additory/synthetic/polars_integration.py +464 -0
  70. additory/synthetic/proxy.py +60 -0
  71. additory/synthetic/schema_parser.py +685 -0
  72. additory/synthetic/validator.py +553 -0
  73. additory/utilities/__init__.py +53 -0
  74. additory/utilities/encoding.py +600 -0
  75. additory/utilities/games.py +300 -0
  76. additory/utilities/keys.py +8 -0
  77. additory/utilities/lookup.py +103 -0
  78. additory/utilities/matchers.py +216 -0
  79. additory/utilities/resolvers.py +286 -0
  80. additory/utilities/settings.py +167 -0
  81. additory/utilities/units.py +746 -0
  82. additory/utilities/validators.py +153 -0
  83. additory-0.1.0a1.dist-info/METADATA +293 -0
  84. additory-0.1.0a1.dist-info/RECORD +87 -0
  85. additory-0.1.0a1.dist-info/WHEEL +5 -0
  86. additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
  87. additory-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,551 @@
1
+ # polars_expression_engine.py
2
+ # Polars-only expression processing engine for enhanced expressions system
3
+
4
+ import polars as pl
5
+ from typing import Any, Dict, Optional, Union
6
+ from dataclasses import dataclass
7
+ from datetime import datetime
8
+
9
+ from .backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
10
+ from .ast_builder import build_ast_from_expression
11
+ from .logging import log_info, log_warning
12
+ from .memory_manager import get_memory_manager
13
+
14
+
15
@dataclass
class ExpressionResult:
    """Outcome of a single expression run together with its metrics"""
    # Dataframe produced by the run
    dataframe: Any
    # Elapsed time of the run, in milliseconds
    execution_time_ms: float
    # Number of rows in the result
    rows_processed: int
    # Number of columns in the result
    columns_processed: int
    # Name of the backend the input dataframe came from
    backend_type: str
    # Memory growth observed across the run, in megabytes
    memory_used_mb: float
24
+
25
+
26
class PolarsExpressionError(Exception):
    """Signals that processing an expression with Polars failed"""
29
+
30
+
31
+ class PolarsExpressionEngine:
32
+ """Exclusive Polars-based expression processing engine"""
33
+
34
def __init__(self):
    """Create the Arrow bridge, zero the counters and hook into the memory manager"""
    self.arrow_bridge = EnhancedArrowBridge()

    # Running totals updated by every execution
    self.execution_stats = dict(
        total_executions=0,
        total_time_ms=0.0,
        total_rows_processed=0,
        errors=0,
    )

    # The memory manager gets a callback so it can trigger cleanup on this engine
    manager = get_memory_manager()
    self.memory_manager = manager
    manager.register_cleanup_callback(self._cleanup_callback)
46
+
47
def execute_expression(self, df: Any, expression: str, output_column: str,
                       backend_type: Optional[str] = None) -> ExpressionResult:
    """
    Execute expression using Polars exclusively

    The input is converted to Arrow, loaded into Polars, extended with the
    computed column, converted back to Arrow and finally back to the source
    backend format. Arrow memory is cleaned up whether or not the run succeeds.

    Args:
        df: Input dataframe (any supported backend)
        expression: Expression string to execute
        output_column: Name for the output column
        backend_type: Source backend type (auto-detected if None)

    Returns:
        ExpressionResult with processed dataframe and statistics

    Raises:
        PolarsExpressionError: If expression execution fails; the original
            exception is attached as the cause
    """
    # Function-scope import: a monotonic clock is the right tool for elapsed
    # time, wall-clock differences can jump when the system clock is adjusted
    from time import perf_counter

    started = perf_counter()

    # Use memory context for monitoring
    with self.memory_manager.memory_context(f"expression: {expression[:50]}..."):
        try:
            # Auto-detect backend if not specified
            if backend_type is None:
                backend_type = self.arrow_bridge.detect_backend(df)

            # Get memory usage before processing
            memory_before = self.arrow_bridge._get_memory_usage_mb()

            # 1. Convert input to Arrow
            log_info(f"[polars_engine] Converting {backend_type} to Arrow")
            arrow_table = self.arrow_bridge.to_arrow(df, backend_type)

            # 2. Convert Arrow to Polars
            log_info("[polars_engine] Converting Arrow to Polars")
            polars_df = pl.from_arrow(arrow_table)

            # 3. Execute expression in Polars
            log_info(f"[polars_engine] Executing expression: {expression}")
            result_df = self._execute_polars_expression(
                polars_df, expression, output_column
            )

            # 4. Convert back to Arrow
            log_info("[polars_engine] Converting result to Arrow")
            result_arrow = result_df.to_arrow()

            # 5. Convert to original backend format
            log_info(f"[polars_engine] Converting Arrow to {backend_type}")
            final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)

            # Calculate execution statistics
            execution_time = (perf_counter() - started) * 1000
            memory_after = self.arrow_bridge._get_memory_usage_mb()
            # Clamp at zero: usage can shrink during the run
            memory_used = max(0.0, memory_after - memory_before)

            # Update global statistics
            self.execution_stats["total_executions"] += 1
            self.execution_stats["total_time_ms"] += execution_time
            self.execution_stats["total_rows_processed"] += result_df.height

            log_info(f"[polars_engine] Expression executed successfully in {execution_time:.1f}ms")

            return ExpressionResult(
                dataframe=final_result,
                execution_time_ms=execution_time,
                rows_processed=result_df.height,
                columns_processed=result_df.width,
                backend_type=backend_type,
                memory_used_mb=memory_used
            )

        except Exception as e:
            self.execution_stats["errors"] += 1
            raise PolarsExpressionError(f"Expression execution failed: {e}") from e

        finally:
            # 6. Always cleanup Arrow memory
            self.arrow_bridge.cleanup_arrow_memory()
126
+
127
def _execute_polars_expression(self, polars_df: pl.DataFrame,
                               expression: str, output_column: str) -> pl.DataFrame:
    """
    Parse an expression string and append its result as a new column

    Args:
        polars_df: Input Polars DataFrame
        expression: Expression string (may span several lines)
        output_column: Name for output column

    Returns:
        Polars DataFrame with new column

    Raises:
        PolarsExpressionError: If parsing, conversion or execution fails
    """
    try:
        # Collapse a multi-line expression into one line, dropping blank lines
        fragments = []
        for raw_line in expression.strip().split('\n'):
            piece = raw_line.strip()
            if piece:
                fragments.append(piece)
        single_line = ' '.join(fragments)

        tree = build_ast_from_expression(single_line)
        if tree is None:
            raise PolarsExpressionError(f"Failed to parse expression: {expression}")

        # Translate the AST and attach the result under the requested name
        new_column = self._ast_to_polars_expr(tree).alias(output_column)
        return polars_df.with_columns([new_column])

    except Exception as e:
        raise PolarsExpressionError(f"Polars expression execution failed: {e}")
165
+
166
def _ast_to_polars_expr(self, ast_node: Dict[str, Any]) -> pl.Expr:
    """
    Convert expression AST to Polars expression

    Each node is a dictionary whose "type" key selects the conversion:
    "column", "literal", "binary", "cmp", "bool_op", "unary_bool",
    "if_expr" or "call". Child nodes are converted recursively.

    Args:
        ast_node: AST node dictionary

    Returns:
        Polars expression

    Raises:
        PolarsExpressionError: If AST conversion fails. The message carries
            the "AST to Polars conversion failed" prefix exactly once, no
            matter how deeply the failing node is nested.
    """
    def convert(node):
        # Recursive worker. Failures are deliberately not wrapped here, so the
        # prefix is added once by the outer handler instead of once per level.
        node_type = node.get("type")

        if node_type == "column":
            return pl.col(node["name"])

        elif node_type == "literal":
            return pl.lit(node["value"])

        elif node_type == "binary":
            left = convert(node["left"])
            right = convert(node["right"])
            op = node["op"]

            if op == "+":
                return left + right
            elif op == "-":
                return left - right
            elif op == "*":
                return left * right
            elif op == "/":
                return left / right
            elif op == "**":
                return left ** right
            elif op == "%":
                return left % right
            elif op == "//":
                return left // right
            else:
                raise PolarsExpressionError(f"Unsupported binary operator: {op}")

        elif node_type == "cmp":
            left = convert(node["left"])
            right = convert(node["right"])
            op = node["op"]

            if op == "==":
                return left == right
            elif op == "!=":
                return left != right
            elif op == ">":
                return left > right
            elif op == "<":
                return left < right
            elif op == ">=":
                return left >= right
            elif op == "<=":
                return left <= right
            else:
                raise PolarsExpressionError(f"Unsupported comparison operator: {op}")

        elif node_type == "bool_op":
            op = node["op"]
            values = [convert(v) for v in node["values"]]

            if op not in ("and", "or"):
                raise PolarsExpressionError(f"Unsupported boolean operator: {op}")
            if not values:
                # Previously surfaced as an opaque "list index out of range"
                raise PolarsExpressionError(
                    f"Boolean operator '{op}' requires at least one operand"
                )

            # Fold the operands left to right
            result = values[0]
            for v in values[1:]:
                result = (result & v) if op == "and" else (result | v)
            return result

        elif node_type == "unary_bool":
            op = node["op"]
            value = convert(node["value"])

            if op == "not":
                return ~value
            else:
                raise PolarsExpressionError(f"Unsupported unary boolean operator: {op}")

        elif node_type == "if_expr":
            # Ternary: a if cond else b
            cond = convert(node["cond"])
            then_expr = convert(node["then"])
            else_expr = convert(node["else"])

            return pl.when(cond).then(then_expr).otherwise(else_expr)

        elif node_type == "call":
            # Function calls
            func_name = node["name"]
            args = [convert(arg) for arg in node["args"]]

            return self._handle_function_call(func_name, args)

        else:
            raise PolarsExpressionError(f"Unsupported AST node type: {node_type}")

    try:
        return convert(ast_node)
    except Exception as e:
        raise PolarsExpressionError(f"AST to Polars conversion failed: {e}") from e
276
+
277
def _handle_function_call(self, func_name: str, args: list) -> pl.Expr:
    """
    Handle function calls in expressions

    Supported names: min, max, abs, log, exp, sqrt, pow, round, floor, ceil.

    Args:
        func_name: Name of the function
        args: List of Polars expressions as arguments

    Returns:
        Polars expression for the function call

    Raises:
        PolarsExpressionError: If function is not supported or is called
            with the wrong number or kind of arguments
    """
    if func_name == "min":
        if not args:
            raise PolarsExpressionError("min() requires at least 1 argument")
        if len(args) == 1:
            return args[0].min()
        else:
            # Element-wise minimum of multiple expressions
            result = args[0]
            for arg in args[1:]:
                result = pl.min_horizontal([result, arg])
            return result

    elif func_name == "max":
        if not args:
            raise PolarsExpressionError("max() requires at least 1 argument")
        if len(args) == 1:
            return args[0].max()
        else:
            # Element-wise maximum of multiple expressions
            result = args[0]
            for arg in args[1:]:
                result = pl.max_horizontal([result, arg])
            return result

    elif func_name == "abs":
        if len(args) != 1:
            raise PolarsExpressionError("abs() requires exactly 1 argument")
        return args[0].abs()

    elif func_name == "log":
        if len(args) == 1:
            return args[0].log()
        elif len(args) == 2:
            # log(value, base) via change of base
            return args[0].log() / args[1].log()
        else:
            raise PolarsExpressionError("log() requires 1 or 2 arguments")

    elif func_name == "exp":
        if len(args) != 1:
            raise PolarsExpressionError("exp() requires exactly 1 argument")
        return args[0].exp()

    elif func_name == "sqrt":
        if len(args) != 1:
            raise PolarsExpressionError("sqrt() requires exactly 1 argument")
        return args[0].sqrt()

    elif func_name == "pow":
        if len(args) != 2:
            raise PolarsExpressionError("pow() requires exactly 2 arguments")
        return args[0] ** args[1]

    elif func_name == "round":
        if len(args) == 1:
            return args[0].round(0)
        elif len(args) == 2:
            # Expr.round needs a plain int, but every argument arrives here as
            # a pl.Expr (and every pl.Expr has .meta.output_name, so an
            # attribute check cannot tell literals from columns). Evaluate the
            # argument without an input frame instead: a literal yields its
            # value, while a column reference cannot be resolved and raises.
            try:
                decimals = pl.select(args[1]).item()
            except Exception as e:
                raise PolarsExpressionError(
                    "round() decimals parameter must be a literal integer"
                ) from e
            # bool is a subclass of int, so rule it out explicitly
            if isinstance(decimals, bool) or not isinstance(decimals, int):
                raise PolarsExpressionError("round() decimals parameter must be a literal integer")
            return args[0].round(decimals)
        else:
            raise PolarsExpressionError("round() requires 1 or 2 arguments")

    elif func_name == "floor":
        if len(args) != 1:
            raise PolarsExpressionError("floor() requires exactly 1 argument")
        return args[0].floor()

    elif func_name == "ceil":
        if len(args) != 1:
            raise PolarsExpressionError("ceil() requires exactly 1 argument")
        return args[0].ceil()

    else:
        raise PolarsExpressionError(f"Unsupported function: {func_name}")
364
+
365
def execute_with_ast(self, df: Any, ast_tree: Dict[str, Any], output_column: str,
                     backend_type: Optional[str] = None) -> ExpressionResult:
    """
    Execute expression using pre-built AST

    Same conversion pipeline as execute_expression, but the parsing step is
    skipped because the caller supplies the AST directly.

    Args:
        df: Input dataframe
        ast_tree: Pre-built AST tree
        output_column: Name for output column
        backend_type: Source backend type (auto-detected if None)

    Returns:
        ExpressionResult with processed dataframe

    Raises:
        PolarsExpressionError: If any step fails; the original exception is
            attached as the cause
    """
    # Function-scope import: a monotonic clock is the right tool for elapsed
    # time, wall-clock differences can jump when the system clock is adjusted
    from time import perf_counter

    started = perf_counter()

    try:
        # Auto-detect backend if not specified
        if backend_type is None:
            backend_type = self.arrow_bridge.detect_backend(df)

        # Get memory usage before processing
        memory_before = self.arrow_bridge._get_memory_usage_mb()

        # Convert to Polars via Arrow
        arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
        polars_df = pl.from_arrow(arrow_table)

        # Execute using AST
        polars_expr = self._ast_to_polars_expr(ast_tree)
        result_df = polars_df.with_columns([polars_expr.alias(output_column)])

        # Convert back to original format
        result_arrow = result_df.to_arrow()
        final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)

        # Calculate statistics
        execution_time = (perf_counter() - started) * 1000
        memory_after = self.arrow_bridge._get_memory_usage_mb()
        # Clamp at zero: usage can shrink during the run
        memory_used = max(0.0, memory_after - memory_before)

        # Update statistics
        self.execution_stats["total_executions"] += 1
        self.execution_stats["total_time_ms"] += execution_time
        self.execution_stats["total_rows_processed"] += result_df.height

        return ExpressionResult(
            dataframe=final_result,
            execution_time_ms=execution_time,
            rows_processed=result_df.height,
            columns_processed=result_df.width,
            backend_type=backend_type,
            memory_used_mb=memory_used
        )

    except Exception as e:
        self.execution_stats["errors"] += 1
        raise PolarsExpressionError(f"AST execution failed: {e}") from e

    finally:
        # Always release Arrow memory, on success and on failure
        self.arrow_bridge.cleanup_arrow_memory()
426
+
427
def validate_expression(self, expression: str) -> bool:
    """
    Check that an expression parses and converts, without running it

    Args:
        expression: Expression string to validate

    Returns:
        True if the expression parses and converts to a Polars expression,
        False otherwise (the failure is logged as a warning)
    """
    try:
        # Collapse a multi-line expression into one line, dropping blank lines
        stripped_lines = (part.strip() for part in expression.strip().split('\n'))
        one_liner = ' '.join(filter(None, stripped_lines))

        tree = build_ast_from_expression(one_liner)
        if tree is not None:
            # Dry-run conversion: surfaces unsupported functions and operators
            self._ast_to_polars_expr(tree)
            return True
        return False

    except Exception as e:
        log_warning(f"[polars_engine] Expression validation failed: {e}")
        return False
453
+
454
def get_execution_stats(self) -> Dict[str, Any]:
    """Return a copy of the counters with per-execution averages added"""
    snapshot = dict(self.execution_stats)
    runs = snapshot["total_executions"]

    # Averages are only meaningful once something has run
    snapshot["avg_time_ms"] = snapshot["total_time_ms"] / runs if runs > 0 else 0.0
    snapshot["avg_rows_per_execution"] = (
        snapshot["total_rows_processed"] / runs if runs > 0 else 0
    )

    return snapshot
466
+
467
def reset_stats(self):
    """Zero every execution counter and log that the reset happened"""
    self.execution_stats = dict(
        total_executions=0,
        total_time_ms=0.0,
        total_rows_processed=0,
        errors=0,
    )
    log_info("[polars_engine] Statistics reset")
476
+
477
def benchmark_expression(self, df: Any, expression: str, output_column: str,
                         iterations: int = 3) -> Dict[str, Any]:
    """
    Time repeated executions of one expression

    Args:
        df: Input dataframe
        expression: Expression to benchmark
        output_column: Output column name
        iterations: Number of iterations

    Returns:
        Timing summary over the successful iterations, or a dictionary with a
        single "error" key when no iteration succeeded
    """
    # Detect the backend once so each iteration skips auto-detection
    backend_type = self.arrow_bridge.detect_backend(df)
    durations = []

    for attempt in range(1, iterations + 1):
        try:
            outcome = self.execute_expression(df, expression, output_column, backend_type)
            durations.append(outcome.execution_time_ms)
        except Exception as e:
            # A failed iteration is logged and left out of the summary
            log_warning(f"[polars_engine] Benchmark iteration {attempt} failed: {e}")

    if len(durations) == 0:
        return {"error": "All benchmark iterations failed"}

    total = sum(durations)
    return {
        "expression": expression,
        "backend_type": backend_type,
        "iterations": len(durations),
        "min_time_ms": min(durations),
        "max_time_ms": max(durations),
        "avg_time_ms": total / len(durations),
        "total_time_ms": total
    }
514
+
515
def get_supported_functions(self) -> list:
    """Return the names of the functions usable inside expressions"""
    extrema = ["min", "max"]
    math_funcs = ["abs", "log", "exp", "sqrt", "pow"]
    rounding = ["round", "floor", "ceil"]
    return extrema + math_funcs + rounding
521
+
522
def get_supported_operators(self) -> Dict[str, list]:
    """Return the supported operators grouped by category"""
    operators = {}
    operators["arithmetic"] = ["+", "-", "*", "/", "**", "%", "//"]
    operators["comparison"] = ["==", "!=", ">", "<", ">=", "<="]
    operators["boolean"] = ["and", "or", "not"]
    operators["conditional"] = ["if_else"]
    return operators
530
+
531
def _cleanup_callback(self):
    """Release Arrow memory and reset the counters once they pass 10000 executions"""
    try:
        self.arrow_bridge.cleanup_arrow_memory()

        executions = self.execution_stats["total_executions"]
        if executions <= 10000:
            return

        # Past the threshold: start counting from zero again
        log_info("[polars_engine] Resetting statistics due to high execution count")
        self.reset_stats()

    except Exception as e:
        # Best effort: a failing cleanup is logged, never raised
        log_warning(f"[polars_engine] Cleanup callback failed: {e}")
544
+
545
def __del__(self):
    """Unregister the cleanup callback when the engine is destroyed"""
    try:
        # A missing attribute (e.g. __init__ failed early) ends up in the
        # handler below, same as any other failure
        manager = self.memory_manager
        manager.unregister_cleanup_callback(self._cleanup_callback)
    except Exception:
        # Finalizers must never raise; unregistering is best effort
        pass