PyPI - additory - Versions diffs - 0.1.0a1__py3-none-any.whl - Mend

additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

additory/__init__.py +15 -0
additory/analysis/__init__.py +48 -0
additory/analysis/cardinality.py +126 -0
additory/analysis/correlations.py +124 -0
additory/analysis/distributions.py +376 -0
additory/analysis/quality.py +158 -0
additory/analysis/scan.py +400 -0
additory/augment/__init__.py +24 -0
additory/augment/augmentor.py +653 -0
additory/augment/builtin_lists.py +430 -0
additory/augment/distributions.py +22 -0
additory/augment/forecast.py +1132 -0
additory/augment/list_registry.py +177 -0
additory/augment/smote.py +320 -0
additory/augment/strategies.py +883 -0
additory/common/__init__.py +157 -0
additory/common/backend.py +355 -0
additory/common/column_utils.py +191 -0
additory/common/distributions.py +737 -0
additory/common/exceptions.py +62 -0
additory/common/lists.py +229 -0
additory/common/patterns.py +240 -0
additory/common/resolver.py +567 -0
additory/common/sample_data.py +182 -0
additory/common/validation.py +197 -0
additory/core/__init__.py +27 -0
additory/core/ast_builder.py +165 -0
additory/core/backends/__init__.py +23 -0
additory/core/backends/arrow_bridge.py +476 -0
additory/core/backends/cudf_bridge.py +355 -0
additory/core/column_positioning.py +358 -0
additory/core/compiler_polars.py +166 -0
additory/core/config.py +342 -0
additory/core/enhanced_cache_manager.py +1119 -0
additory/core/enhanced_matchers.py +473 -0
additory/core/enhanced_version_manager.py +325 -0
additory/core/executor.py +59 -0
additory/core/integrity_manager.py +477 -0
additory/core/loader.py +190 -0
additory/core/logging.py +24 -0
additory/core/memory_manager.py +547 -0
additory/core/namespace_manager.py +657 -0
additory/core/parser.py +176 -0
additory/core/polars_expression_engine.py +551 -0
additory/core/registry.py +176 -0
additory/core/sample_data_manager.py +492 -0
additory/core/user_namespace.py +751 -0
additory/core/validator.py +27 -0
additory/dynamic_api.py +308 -0
additory/expressions/__init__.py +26 -0
additory/expressions/engine.py +551 -0
additory/expressions/parser.py +176 -0
additory/expressions/proxy.py +546 -0
additory/expressions/registry.py +313 -0
additory/expressions/samples.py +492 -0
additory/synthetic/__init__.py +101 -0
additory/synthetic/api.py +220 -0
additory/synthetic/common_integration.py +314 -0
additory/synthetic/config.py +262 -0
additory/synthetic/engines.py +529 -0
additory/synthetic/exceptions.py +180 -0
additory/synthetic/file_managers.py +518 -0
additory/synthetic/generator.py +702 -0
additory/synthetic/generator_parser.py +68 -0
additory/synthetic/integration.py +319 -0
additory/synthetic/models.py +241 -0
additory/synthetic/pattern_resolver.py +573 -0
additory/synthetic/performance.py +469 -0
additory/synthetic/polars_integration.py +464 -0
additory/synthetic/proxy.py +60 -0
additory/synthetic/schema_parser.py +685 -0
additory/synthetic/validator.py +553 -0
additory/utilities/__init__.py +53 -0
additory/utilities/encoding.py +600 -0
additory/utilities/games.py +300 -0
additory/utilities/keys.py +8 -0
additory/utilities/lookup.py +103 -0
additory/utilities/matchers.py +216 -0
additory/utilities/resolvers.py +286 -0
additory/utilities/settings.py +167 -0
additory/utilities/units.py +746 -0
additory/utilities/validators.py +153 -0
additory-0.1.0a1.dist-info/METADATA +293 -0
additory-0.1.0a1.dist-info/RECORD +87 -0
additory-0.1.0a1.dist-info/WHEEL +5 -0
additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
additory-0.1.0a1.dist-info/top_level.txt +1 -0

additory/core/polars_expression_engine.py ADDED Viewed

@@ -0,0 +1,551 @@
+# polars_expression_engine.py
+# Polars-only expression processing engine for enhanced expressions system
+import polars as pl
+from typing import Any, Dict, Optional, Union
+from dataclasses import dataclass
+from datetime import datetime
+from .backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
+from .ast_builder import build_ast_from_expression
+from .logging import log_info, log_warning
+from .memory_manager import get_memory_manager
+@dataclass
+class ExpressionResult:
+    """Result of expression execution
+    Bundles the dataframe returned by PolarsExpressionEngine with the
+    timing, size and memory figures measured for that single run.
+    """
+    # Result dataframe, converted back to the source backend's format
+    dataframe: Any
+    # Elapsed time of the whole execution, in milliseconds
+    execution_time_ms: float
+    # Height (row count) of the Polars result frame
+    rows_processed: int
+    # Width (column count) of the Polars result frame
+    columns_processed: int
+    # Backend of the input dataframe (caller-supplied or auto-detected)
+    backend_type: str
+    # Growth in reported memory usage during the run, in MB, floored at 0
+    memory_used_mb: float
+class PolarsExpressionError(Exception):
+    """Raised when Polars expression processing fails"""
+    pass
+class PolarsExpressionEngine:
+    """Exclusive Polars-based expression processing engine"""
+    def __init__(self):
+        self.arrow_bridge = EnhancedArrowBridge()
+        self.execution_stats = {
+            "total_executions": 0,
+            "total_time_ms": 0.0,
+            "total_rows_processed": 0,
+            "errors": 0
+        }
+        # Register with memory manager for cleanup
+        self.memory_manager = get_memory_manager()
+        self.memory_manager.register_cleanup_callback(self._cleanup_callback)
+    def execute_expression(self, df: Any, expression: str, output_column: str,
+                         backend_type: Optional[str] = None) -> ExpressionResult:
+        """
+        Execute expression using Polars exclusively
+        Args:
+            df: Input dataframe (any supported backend)
+            expression: Expression string to execute
+            output_column: Name for the output column
+            backend_type: Source backend type (auto-detected if None)
+        Returns:
+            ExpressionResult with processed dataframe and statistics
+        Raises:
+            PolarsExpressionError: If expression execution fails
+        """
+        start_time = datetime.now()
+        # Use memory context for monitoring
+        with self.memory_manager.memory_context(f"expression: {expression[:50]}..."):
+            try:
+                # Auto-detect backend if not specified
+                if backend_type is None:
+                    backend_type = self.arrow_bridge.detect_backend(df)
+                # Get memory usage before processing
+                memory_before = self.arrow_bridge._get_memory_usage_mb()
+                # 1. Convert input to Arrow
+                log_info(f"[polars_engine] Converting {backend_type} to Arrow")
+                arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
+                # 2. Convert Arrow to Polars
+                log_info("[polars_engine] Converting Arrow to Polars")
+                polars_df = pl.from_arrow(arrow_table)
+                # 3. Execute expression in Polars
+                log_info(f"[polars_engine] Executing expression: {expression}")
+                result_df = self._execute_polars_expression(
+                    polars_df, expression, output_column
+                )
+                # 4. Convert back to Arrow
+                log_info("[polars_engine] Converting result to Arrow")
+                result_arrow = result_df.to_arrow()
+                # 5. Convert to original backend format
+                log_info(f"[polars_engine] Converting Arrow to {backend_type}")
+                final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
+                # Calculate execution statistics
+                execution_time = (datetime.now() - start_time).total_seconds() * 1000
+                memory_after = self.arrow_bridge._get_memory_usage_mb()
+                memory_used = max(0, memory_after - memory_before)
+                # Update global statistics
+                self.execution_stats["total_executions"] += 1
+                self.execution_stats["total_time_ms"] += execution_time
+                self.execution_stats["total_rows_processed"] += result_df.height
+                log_info(f"[polars_engine] Expression executed successfully in {execution_time:.1f}ms")
+                return ExpressionResult(
+                    dataframe=final_result,
+                    execution_time_ms=execution_time,
+                    rows_processed=result_df.height,
+                    columns_processed=result_df.width,
+                    backend_type=backend_type,
+                    memory_used_mb=memory_used
+                )
+            except Exception as e:
+                self.execution_stats["errors"] += 1
+                raise PolarsExpressionError(f"Expression execution failed: {e}")
+            finally:
+                # 6. Always cleanup Arrow memory
+                self.arrow_bridge.cleanup_arrow_memory()
+    def _execute_polars_expression(self, polars_df: pl.DataFrame,
+                                 expression: str, output_column: str) -> pl.DataFrame:
+        """
+        Execute expression AST in Polars
+        Args:
+            polars_df: Input Polars DataFrame
+            expression: Expression string
+            output_column: Name for output column
+        Returns:
+            Polars DataFrame with new column
+        Raises:
+            PolarsExpressionError: If expression execution fails
+        """
+        try:
+            # Clean up multiline expressions
+            cleaned_expression = ' '.join(line.strip() for line in expression.strip().split('\n') if line.strip())
+            # Build AST from expression
+            ast_tree = build_ast_from_expression(cleaned_expression)
+            if ast_tree is None:
+                raise PolarsExpressionError(f"Failed to parse expression: {expression}")
+            # Convert AST to Polars expression
+            polars_expr = self._ast_to_polars_expr(ast_tree)
+            # Execute expression and add as new column
+            result_df = polars_df.with_columns([
+                polars_expr.alias(output_column)
+            ])
+            return result_df
+        except Exception as e:
+            raise PolarsExpressionError(f"Polars expression execution failed: {e}")
+    def _ast_to_polars_expr(self, ast_node: Dict[str, Any]) -> pl.Expr:
+        """
+        Convert expression AST to Polars expression
+        Args:
+            ast_node: AST node dictionary
+        Returns:
+            Polars expression
+        Raises:
+            PolarsExpressionError: If AST conversion fails
+        """
+        try:
+            node_type = ast_node.get("type")
+            if node_type == "column":
+                return pl.col(ast_node["name"])
+            elif node_type == "literal":
+                return pl.lit(ast_node["value"])
+            elif node_type == "binary":
+                left = self._ast_to_polars_expr(ast_node["left"])
+                right = self._ast_to_polars_expr(ast_node["right"])
+                op = ast_node["op"]
+                if op == "+":
+                    return left + right
+                elif op == "-":
+                    return left - right
+                elif op == "*":
+                    return left * right
+                elif op == "/":
+                    return left / right
+                elif op == "**":
+                    return left ** right
+                elif op == "%":
+                    return left % right
+                elif op == "//":
+                    return left // right
+                else:
+                    raise PolarsExpressionError(f"Unsupported binary operator: {op}")
+            elif node_type == "cmp":
+                left = self._ast_to_polars_expr(ast_node["left"])
+                right = self._ast_to_polars_expr(ast_node["right"])
+                op = ast_node["op"]
+                if op == "==":
+                    return left == right
+                elif op == "!=":
+                    return left != right
+                elif op == ">":
+                    return left > right
+                elif op == "<":
+                    return left < right
+                elif op == ">=":
+                    return left >= right
+                elif op == "<=":
+                    return left <= right
+                else:
+                    raise PolarsExpressionError(f"Unsupported comparison operator: {op}")
+            elif node_type == "bool_op":
+                op = ast_node["op"]
+                values = [self._ast_to_polars_expr(v) for v in ast_node["values"]]
+                if op == "and":
+                    result = values[0]
+                    for v in values[1:]:
+                        result = result & v
+                    return result
+                elif op == "or":
+                    result = values[0]
+                    for v in values[1:]:
+                        result = result | v
+                    return result
+                else:
+                    raise PolarsExpressionError(f"Unsupported boolean operator: {op}")
+            elif node_type == "unary_bool":
+                op = ast_node["op"]
+                value = self._ast_to_polars_expr(ast_node["value"])
+                if op == "not":
+                    return ~value
+                else:
+                    raise PolarsExpressionError(f"Unsupported unary boolean operator: {op}")
+            elif node_type == "if_expr":
+                # Ternary: a if cond else b
+                cond = self._ast_to_polars_expr(ast_node["cond"])
+                then_expr = self._ast_to_polars_expr(ast_node["then"])
+                else_expr = self._ast_to_polars_expr(ast_node["else"])
+                return pl.when(cond).then(then_expr).otherwise(else_expr)
+            elif node_type == "call":
+                # Function calls
+                func_name = ast_node["name"]
+                args = [self._ast_to_polars_expr(arg) for arg in ast_node["args"]]
+                return self._handle_function_call(func_name, args)
+            else:
+                raise PolarsExpressionError(f"Unsupported AST node type: {node_type}")
+        except Exception as e:
+            raise PolarsExpressionError(f"AST to Polars conversion failed: {e}")
+    def _handle_function_call(self, func_name: str, args: list) -> pl.Expr:
+        """
+        Handle function calls in expressions
+        Args:
+            func_name: Name of the function
+            args: List of Polars expressions as arguments
+        Returns:
+            Polars expression for the function call
+        Raises:
+            PolarsExpressionError: If function is not supported
+        """
+        if func_name == "min":
+            if len(args) == 1:
+                return args[0].min()
+            else:
+                # Element-wise minimum of multiple expressions
+                result = args[0]
+                for arg in args[1:]:
+                    result = pl.min_horizontal([result, arg])
+                return result
+        elif func_name == "max":
+            if len(args) == 1:
+                return args[0].max()
+            else:
+                # Element-wise maximum of multiple expressions
+                result = args[0]
+                for arg in args[1:]:
+                    result = pl.max_horizontal([result, arg])
+                return result
+        elif func_name == "abs":
+            if len(args) != 1:
+                raise PolarsExpressionError("abs() requires exactly 1 argument")
+            return args[0].abs()
+        elif func_name == "log":
+            if len(args) == 1:
+                return args[0].log()
+            elif len(args) == 2:
+                # log(value, base)
+                return args[0].log() / args[1].log()
+            else:
+                raise PolarsExpressionError("log() requires 1 or 2 arguments")
+        elif func_name == "exp":
+            if len(args) != 1:
+                raise PolarsExpressionError("exp() requires exactly 1 argument")
+            return args[0].exp()
+        elif func_name == "sqrt":
+            if len(args) != 1:
+                raise PolarsExpressionError("sqrt() requires exactly 1 argument")
+            return args[0].sqrt()
+        elif func_name == "pow":
+            if len(args) != 2:
+                raise PolarsExpressionError("pow() requires exactly 2 arguments")
+            return args[0] ** args[1]
+        elif func_name == "round":
+            if len(args) == 1:
+                return args[0].round(0)
+            elif len(args) == 2:
+                # For round with decimals, the second argument must be a literal integer
+                if hasattr(args[1], 'meta') and hasattr(args[1].meta, 'output_name'):
+                    # This is a column reference, not a literal
+                    raise PolarsExpressionError("round() decimals parameter must be a literal integer")
+                return args[0].round(args[1])
+            else:
+                raise PolarsExpressionError("round() requires 1 or 2 arguments")
+        elif func_name == "floor":
+            if len(args) != 1:
+                raise PolarsExpressionError("floor() requires exactly 1 argument")
+            return args[0].floor()
+        elif func_name == "ceil":
+            if len(args) != 1:
+                raise PolarsExpressionError("ceil() requires exactly 1 argument")
+            return args[0].ceil()
+        else:
+            raise PolarsExpressionError(f"Unsupported function: {func_name}")
+    def execute_with_ast(self, df: Any, ast_tree: Dict[str, Any], output_column: str,
+                        backend_type: Optional[str] = None) -> ExpressionResult:
+        """
+        Execute expression using pre-built AST
+        Args:
+            df: Input dataframe
+            ast_tree: Pre-built AST tree
+            output_column: Name for output column
+            backend_type: Source backend type
+        Returns:
+            ExpressionResult with processed dataframe
+        """
+        start_time = datetime.now()
+        try:
+            # Auto-detect backend if not specified
+            if backend_type is None:
+                backend_type = self.arrow_bridge.detect_backend(df)
+            # Get memory usage before processing
+            memory_before = self.arrow_bridge._get_memory_usage_mb()
+            # Convert to Polars via Arrow
+            arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
+            polars_df = pl.from_arrow(arrow_table)
+            # Execute using AST
+            polars_expr = self._ast_to_polars_expr(ast_tree)
+            result_df = polars_df.with_columns([polars_expr.alias(output_column)])
+            # Convert back to original format
+            result_arrow = result_df.to_arrow()
+            final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
+            # Calculate statistics
+            execution_time = (datetime.now() - start_time).total_seconds() * 1000
+            memory_after = self.arrow_bridge._get_memory_usage_mb()
+            memory_used = max(0, memory_after - memory_before)
+            # Update statistics
+            self.execution_stats["total_executions"] += 1
+            self.execution_stats["total_time_ms"] += execution_time
+            self.execution_stats["total_rows_processed"] += result_df.height
+            return ExpressionResult(
+                dataframe=final_result,
+                execution_time_ms=execution_time,
+                rows_processed=result_df.height,
+                columns_processed=result_df.width,
+                backend_type=backend_type,
+                memory_used_mb=memory_used
+            )
+        except Exception as e:
+            self.execution_stats["errors"] += 1
+            raise PolarsExpressionError(f"AST execution failed: {e}")
+        finally:
+            self.arrow_bridge.cleanup_arrow_memory()
+    def validate_expression(self, expression: str) -> bool:
+        """
+        Validate expression syntax without executing
+        Args:
+            expression: Expression string to validate
+        Returns:
+            True if expression is valid
+        """
+        try:
+            # Clean up multiline expressions
+            cleaned_expression = ' '.join(line.strip() for line in expression.strip().split('\n') if line.strip())
+            ast_tree = build_ast_from_expression(cleaned_expression)
+            if ast_tree is None:
+                return False
+            # Try to convert AST to Polars expression (dry run)
+            # This will catch unsupported functions and operators
+            self._ast_to_polars_expr(ast_tree)
+            return True
+        except Exception as e:
+            log_warning(f"[polars_engine] Expression validation failed: {e}")
+            return False
+    def get_execution_stats(self) -> Dict[str, Any]:
+        """Get execution statistics"""
+        stats = self.execution_stats.copy()
+        if stats["total_executions"] > 0:
+            stats["avg_time_ms"] = stats["total_time_ms"] / stats["total_executions"]
+            stats["avg_rows_per_execution"] = stats["total_rows_processed"] / stats["total_executions"]
+        else:
+            stats["avg_time_ms"] = 0.0
+            stats["avg_rows_per_execution"] = 0
+        return stats
+    def reset_stats(self):
+        """Reset execution statistics"""
+        self.execution_stats = {
+            "total_executions": 0,
+            "total_time_ms": 0.0,
+            "total_rows_processed": 0,
+            "errors": 0
+        }
+        log_info("[polars_engine] Statistics reset")
+    def benchmark_expression(self, df: Any, expression: str, output_column: str,
+                           iterations: int = 3) -> Dict[str, Any]:
+        """
+        Benchmark expression execution performance
+        Args:
+            df: Input dataframe
+            expression: Expression to benchmark
+            output_column: Output column name
+            iterations: Number of iterations
+        Returns:
+            Benchmark results
+        """
+        times = []
+        backend_type = self.arrow_bridge.detect_backend(df)
+        for i in range(iterations):
+            try:
+                result = self.execute_expression(df, expression, output_column, backend_type)
+                times.append(result.execution_time_ms)
+            except Exception as e:
+                log_warning(f"[polars_engine] Benchmark iteration {i+1} failed: {e}")
+                continue
+        if not times:
+            return {"error": "All benchmark iterations failed"}
+        return {
+            "expression": expression,
+            "backend_type": backend_type,
+            "iterations": len(times),
+            "min_time_ms": min(times),
+            "max_time_ms": max(times),
+            "avg_time_ms": sum(times) / len(times),
+            "total_time_ms": sum(times)
+        }
+    def get_supported_functions(self) -> list:
+        """Get list of supported functions"""
+        return [
+            "min", "max", "abs", "log", "exp", "sqrt", "pow",
+            "round", "floor", "ceil"
+        ]
+    def get_supported_operators(self) -> Dict[str, list]:
+        """Get list of supported operators by category"""
+        return {
+            "arithmetic": ["+", "-", "*", "/", "**", "%", "//"],
+            "comparison": ["==", "!=", ">", "<", ">=", "<="],
+            "boolean": ["and", "or", "not"],
+            "conditional": ["if_else"]
+        }
+    def _cleanup_callback(self):
+        """Cleanup callback for memory manager"""
+        try:
+            # Cleanup Arrow bridge memory
+            self.arrow_bridge.cleanup_arrow_memory()
+            # Reset statistics if they get too large
+            if self.execution_stats["total_executions"] > 10000:
+                log_info("[polars_engine] Resetting statistics due to high execution count")
+                self.reset_stats()
+        except Exception as e:
+            log_warning(f"[polars_engine] Cleanup callback failed: {e}")
+    def __del__(self):
+        """Cleanup when engine is destroyed"""
+        try:
+            if hasattr(self, 'memory_manager'):
+                self.memory_manager.unregister_cleanup_callback(self._cleanup_callback)
+        except Exception:
+            pass