additory 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +15 -0
- additory/analysis/__init__.py +48 -0
- additory/analysis/cardinality.py +126 -0
- additory/analysis/correlations.py +124 -0
- additory/analysis/distributions.py +376 -0
- additory/analysis/quality.py +158 -0
- additory/analysis/scan.py +400 -0
- additory/augment/__init__.py +24 -0
- additory/augment/augmentor.py +653 -0
- additory/augment/builtin_lists.py +430 -0
- additory/augment/distributions.py +22 -0
- additory/augment/forecast.py +1132 -0
- additory/augment/list_registry.py +177 -0
- additory/augment/smote.py +320 -0
- additory/augment/strategies.py +883 -0
- additory/common/__init__.py +157 -0
- additory/common/backend.py +355 -0
- additory/common/column_utils.py +191 -0
- additory/common/distributions.py +737 -0
- additory/common/exceptions.py +62 -0
- additory/common/lists.py +229 -0
- additory/common/patterns.py +240 -0
- additory/common/resolver.py +567 -0
- additory/common/sample_data.py +182 -0
- additory/common/validation.py +197 -0
- additory/core/__init__.py +27 -0
- additory/core/ast_builder.py +165 -0
- additory/core/backends/__init__.py +23 -0
- additory/core/backends/arrow_bridge.py +476 -0
- additory/core/backends/cudf_bridge.py +355 -0
- additory/core/column_positioning.py +358 -0
- additory/core/compiler_polars.py +166 -0
- additory/core/config.py +342 -0
- additory/core/enhanced_cache_manager.py +1119 -0
- additory/core/enhanced_matchers.py +473 -0
- additory/core/enhanced_version_manager.py +325 -0
- additory/core/executor.py +59 -0
- additory/core/integrity_manager.py +477 -0
- additory/core/loader.py +190 -0
- additory/core/logging.py +24 -0
- additory/core/memory_manager.py +547 -0
- additory/core/namespace_manager.py +657 -0
- additory/core/parser.py +176 -0
- additory/core/polars_expression_engine.py +551 -0
- additory/core/registry.py +176 -0
- additory/core/sample_data_manager.py +492 -0
- additory/core/user_namespace.py +751 -0
- additory/core/validator.py +27 -0
- additory/dynamic_api.py +308 -0
- additory/expressions/__init__.py +26 -0
- additory/expressions/engine.py +551 -0
- additory/expressions/parser.py +176 -0
- additory/expressions/proxy.py +546 -0
- additory/expressions/registry.py +313 -0
- additory/expressions/samples.py +492 -0
- additory/synthetic/__init__.py +101 -0
- additory/synthetic/api.py +220 -0
- additory/synthetic/common_integration.py +314 -0
- additory/synthetic/config.py +262 -0
- additory/synthetic/engines.py +529 -0
- additory/synthetic/exceptions.py +180 -0
- additory/synthetic/file_managers.py +518 -0
- additory/synthetic/generator.py +702 -0
- additory/synthetic/generator_parser.py +68 -0
- additory/synthetic/integration.py +319 -0
- additory/synthetic/models.py +241 -0
- additory/synthetic/pattern_resolver.py +573 -0
- additory/synthetic/performance.py +469 -0
- additory/synthetic/polars_integration.py +464 -0
- additory/synthetic/proxy.py +60 -0
- additory/synthetic/schema_parser.py +685 -0
- additory/synthetic/validator.py +553 -0
- additory/utilities/__init__.py +53 -0
- additory/utilities/encoding.py +600 -0
- additory/utilities/games.py +300 -0
- additory/utilities/keys.py +8 -0
- additory/utilities/lookup.py +103 -0
- additory/utilities/matchers.py +216 -0
- additory/utilities/resolvers.py +286 -0
- additory/utilities/settings.py +167 -0
- additory/utilities/units.py +746 -0
- additory/utilities/validators.py +153 -0
- additory-0.1.0a1.dist-info/METADATA +293 -0
- additory-0.1.0a1.dist-info/RECORD +87 -0
- additory-0.1.0a1.dist-info/WHEEL +5 -0
- additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
- additory-0.1.0a1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Centralized Sample Dataset Management
|
|
3
|
+
|
|
4
|
+
Provides sample datasets for demonstrations across all additory modules.
|
|
5
|
+
Sample datasets are stored as .add files in reference/ directories and
|
|
6
|
+
loaded on-demand using the existing .add file parser.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from additory.common.sample_data import get_sample_dataset
|
|
10
|
+
|
|
11
|
+
# For augment
|
|
12
|
+
df = get_sample_dataset("augment", "sample")
|
|
13
|
+
|
|
14
|
+
# For expressions (future)
|
|
15
|
+
df = get_sample_dataset("expressions", "sample")
|
|
16
|
+
df_unclean = get_sample_dataset("expressions", "sample_unclean")
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import polars as pl
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Optional
|
|
22
|
+
import yaml
|
|
23
|
+
|
|
24
|
+
from additory.common.exceptions import ValidationError
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_sample_dataset(
    module: str = "augment",
    block: str = "sample",
    dataset_type: str = "clean"
) -> pl.DataFrame:
    """
    Load a sample dataset from .add files.

    This function provides centralized access to sample datasets across
    all additory modules (augment, expressions, utilities). Sample datasets
    are stored as .add files (YAML syntax) in the reference/ directory
    structure.

    Args:
        module: Module name ("augment", "expressions", "utilities")
        block: Block name within the .add file ("sample" for augment)
        dataset_type: Type of sample data ("clean" or "unclean")

    Returns:
        Polars DataFrame with sample data

    Raises:
        ValidationError: If module, block, or dataset_type not found

    Examples:
        >>> df = get_sample_dataset("augment", "sample")
        >>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")
    """
    # One definitions directory per supported module; a dict lookup replaces
    # three duplicated if/elif branches.
    module_dirs = {
        "augment": "augment_definitions",
        "expressions": "expressions_definitions",
        "utilities": "utilities_definitions",
    }
    if module not in module_dirs:
        raise ValidationError(
            f"Unknown module '{module}'. "
            f"Valid modules: augment, expressions, utilities"
        )

    base_path = Path(__file__).parent.parent.parent / "reference"
    add_file_path = base_path / module_dirs[module] / f"{block}_0.1.add"

    if not add_file_path.exists():
        raise ValidationError(
            f"Sample dataset file not found: {add_file_path}\n"
            f"Module: {module}, Block: {block}"
        )

    # .add files use YAML syntax; safe_load avoids arbitrary object
    # construction from file contents.
    try:
        with open(add_file_path, 'r', encoding='utf-8') as f:
            # `or {}` guards against an empty file, where safe_load
            # returns None and the .get() below would raise AttributeError.
            content = yaml.safe_load(f) or {}
    except Exception as e:
        # Chain the cause so the original YAML parser traceback is kept.
        raise ValidationError(
            f"Failed to parse sample dataset file: {add_file_path}\n"
            f"Error: {e}"
        ) from e

    # Extract sample data
    sample_section = content.get("sample", {})
    if not sample_section:
        raise ValidationError(
            f"No 'sample' section found in {add_file_path}"
        )

    # Get the requested dataset type (clean or unclean)
    dataset = sample_section.get(dataset_type)
    if dataset is None:
        available_types = list(sample_section.keys())
        raise ValidationError(
            f"Dataset type '{dataset_type}' not found in {add_file_path}\n"
            f"Available types: {available_types}"
        )

    # Convert to Polars DataFrame
    try:
        df = pl.DataFrame(dataset)
    except Exception as e:
        raise ValidationError(
            f"Failed to create DataFrame from sample data: {e}"
        ) from e

    return df
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def list_available_samples() -> dict:
    """
    List all available sample datasets.

    Scans the reference/ directory for .add files belonging to each
    supported module and reports their base names (version suffix removed).

    Returns:
        Dictionary mapping module names ('augment', 'expressions',
        'utilities') to lists of available sample names. A module whose
        definitions directory does not exist maps to an empty list.

    Example:
        >>> samples = list_available_samples()
        >>> print(samples)
        {
            'augment': ['sample'],
            'expressions': ['sample'],
            'utilities': []
        }
    """
    base_path = Path(__file__).parent.parent.parent / "reference"
    available = {}

    # Single loop replaces three copy-pasted per-module branches.
    for module, dir_name in (
        ("augment", "augment_definitions"),
        ("expressions", "expressions_definitions"),
        ("utilities", "utilities_definitions"),
    ):
        module_path = base_path / dir_name
        if module_path.exists():
            available[module] = [
                f.stem.rsplit('_', 1)[0]  # Remove version suffix (e.g. "_0.1")
                for f in module_path.glob("*.add")
            ]
        else:
            available[module] = []

    return available
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common Validation Utilities
|
|
3
|
+
|
|
4
|
+
Provides consistent validation across all additory modules.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any, List, Union
|
|
8
|
+
from .backend import is_dataframe, detect_backend
|
|
9
|
+
from .exceptions import ValidationError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def validate_dataframe(df: Any, name: str = "dataframe") -> None:
    """
    Ensure the given object is a supported, non-empty dataframe.

    Args:
        df: Candidate object to check
        name: Label used in error messages

    Raises:
        ValidationError: If ``df`` is not a pandas/polars/cudf dataframe,
            or if it contains no rows

    Examples:
        >>> validate_dataframe(df, "input dataframe")
    """
    if is_dataframe(df):
        if len(df) == 0:
            raise ValidationError(f"{name} is empty")
        return

    raise ValidationError(
        f"{name} must be a DataFrame (pandas, polars, or cudf). "
        f"Got: {type(df)}"
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def validate_columns_exist(df: Any, columns: Union[str, List[str]],
                           df_name: str = "dataframe") -> None:
    """
    Check that every requested column is present in the dataframe.

    Args:
        df: Dataframe whose columns are inspected
        columns: A single column name or a list of names
        df_name: Label used in error messages

    Raises:
        ValidationError: If any requested column is missing

    Examples:
        >>> validate_columns_exist(df, ['col1', 'col2'], "my_dataframe")
        >>> validate_columns_exist(df, 'single_col')
    """
    # Normalise the single-string form to a list.
    requested = [columns] if isinstance(columns, str) else columns

    available = list(df.columns)
    missing = [col for col in requested if col not in available]
    if not missing:
        return

    raise ValidationError(
        f"Column(s) {missing} not found in {df_name}. "
        f"Available columns: {available}"
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def validate_positive_number(value: Union[int, float], param_name: str) -> None:
    """
    Ensure the value is a strictly positive number.

    Args:
        value: Candidate value
        param_name: Parameter label used in error messages

    Raises:
        ValidationError: If the value is not numeric or is <= 0

    Examples:
        >>> validate_positive_number(10, "max_categories")
    """
    is_numeric = isinstance(value, (int, float))
    if not is_numeric:
        raise ValidationError(f"{param_name} must be a number, got {type(value)}")
    if value <= 0:
        raise ValidationError(f"{param_name} must be positive, got {value}")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def validate_non_negative_number(value: Union[int, float], param_name: str) -> None:
    """
    Ensure the value is a number greater than or equal to zero.

    Args:
        value: Candidate value
        param_name: Parameter label used in error messages

    Raises:
        ValidationError: If the value is not numeric or is negative

    Examples:
        >>> validate_non_negative_number(0, "min_value")
    """
    is_numeric = isinstance(value, (int, float))
    if not is_numeric:
        raise ValidationError(f"{param_name} must be a number, got {type(value)}")
    if value < 0:
        raise ValidationError(f"{param_name} must be non-negative, got {value}")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def validate_parameter_choice(value: Any, choices: List[Any],
                              param_name: str) -> None:
    """
    Ensure the value is one of the allowed choices.

    Args:
        value: Candidate value
        choices: Permitted values
        param_name: Parameter label used in error messages

    Raises:
        ValidationError: If the value is not among the choices

    Examples:
        >>> validate_parameter_choice('after', ['before', 'after', 'end'], 'position')
    """
    if value in choices:
        return

    raise ValidationError(
        f"Invalid {param_name}: '{value}'. "
        f"Must be one of: {choices}"
    )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def validate_ratio(value: float, param_name: str) -> None:
    """
    Ensure the value is a ratio in the closed interval [0.0, 1.0].

    Args:
        value: Candidate value
        param_name: Parameter label used in error messages

    Raises:
        ValidationError: If the value is not numeric or lies outside [0, 1]

    Examples:
        >>> validate_ratio(0.5, "max_cardinality_ratio")
    """
    is_numeric = isinstance(value, (int, float))
    if not is_numeric:
        raise ValidationError(f"{param_name} must be a number, got {type(value)}")

    out_of_range = value < 0.0 or value > 1.0
    if out_of_range:
        raise ValidationError(f"{param_name} must be between 0.0 and 1.0, got {value}")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def validate_string_not_empty(value: str, param_name: str) -> None:
    """
    Ensure the value is a string containing at least one non-whitespace
    character.

    Args:
        value: Candidate string
        param_name: Parameter label used in error messages

    Raises:
        ValidationError: If the value is not a string, or is empty/blank

    Examples:
        >>> validate_string_not_empty("column_name", "column")
    """
    if not isinstance(value, str):
        raise ValidationError(f"{param_name} must be a string, got {type(value)}")

    # Whitespace-only strings count as empty.
    stripped = value.strip()
    if stripped == "":
        raise ValidationError(f"{param_name} cannot be empty")
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def validate_integer_in_range(value: int, param_name: str,
                              min_val: Union[int, None] = None,
                              max_val: Union[int, None] = None) -> None:
    """
    Validate that integer is within specified range.

    Args:
        value: Integer to validate
        param_name: Parameter name for error messages
        min_val: Minimum allowed value (inclusive), or None for no lower bound
        max_val: Maximum allowed value (inclusive), or None for no upper bound

    Raises:
        ValidationError: If not an integer (bool is rejected) or out of range

    Examples:
        >>> validate_integer_in_range(50, "max_categories", min_val=1, max_val=200)
    """
    # bool is a subclass of int, so a plain isinstance check would silently
    # accept True/False as 1/0; reject it explicitly.
    if not isinstance(value, int) or isinstance(value, bool):
        raise ValidationError(f"{param_name} must be an integer, got {type(value)}")

    if min_val is not None and value < min_val:
        raise ValidationError(f"{param_name} must be >= {min_val}, got {value}")

    if max_val is not None and value > max_val:
        raise ValidationError(f"{param_name} must be <= {max_val}, got {value}")
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# core/__init__.py
|
|
2
|
+
# Expose core engine components
|
|
3
|
+
|
|
4
|
+
from .executor import execute_expression
|
|
5
|
+
from .registry import (
|
|
6
|
+
resolve_formula,
|
|
7
|
+
set_formula_version,
|
|
8
|
+
set_formula_root,
|
|
9
|
+
set_custom_formula_path,
|
|
10
|
+
)
|
|
11
|
+
from .loader import load_expression
|
|
12
|
+
from .parser import parse_expression
|
|
13
|
+
from .validator import validate_expression
|
|
14
|
+
from .logging import log_info, log_warning
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"execute_expression",
|
|
18
|
+
"resolve_formula",
|
|
19
|
+
"set_formula_version",
|
|
20
|
+
"set_formula_root",
|
|
21
|
+
"set_custom_formula_path",
|
|
22
|
+
"load_expression",
|
|
23
|
+
"parse_expression",
|
|
24
|
+
"validate_expression",
|
|
25
|
+
"log_info",
|
|
26
|
+
"log_warning",
|
|
27
|
+
]
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# ast_builder.py
|
|
2
|
+
#
|
|
3
|
+
# Extended AST builder for additory DSL.
|
|
4
|
+
# Backward compatible with minimal arithmetic DSL.
|
|
5
|
+
# Adds:
|
|
6
|
+
# - comparisons
|
|
7
|
+
# - boolean logic
|
|
8
|
+
# - ternary (Python-style: a if cond else b)
|
|
9
|
+
# - function calls (min, max, abs, log, exp)
|
|
10
|
+
#
|
|
11
|
+
|
|
12
|
+
import ast
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_ast_from_expression(expr: str) -> dict:
    """
    Parse a Python-like expression string into the internal AST format.

    Empty or whitespace-only input yields None. Otherwise the expression
    is parsed with Python's ``ast`` module in ``eval`` mode and the
    resulting tree is translated node-by-node via ``_convert``.
    """
    if expr is None or expr.strip() == "":
        return None

    parsed = ast.parse(expr, mode="eval")
    return _convert(parsed.body)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _convert(node):
|
|
29
|
+
"""Convert Python AST → additory AST."""
|
|
30
|
+
|
|
31
|
+
# ------------------------------------------------------------
|
|
32
|
+
# Literals
|
|
33
|
+
# ------------------------------------------------------------
|
|
34
|
+
if isinstance(node, ast.Constant):
|
|
35
|
+
return {"type": "literal", "value": node.value}
|
|
36
|
+
|
|
37
|
+
# ------------------------------------------------------------
|
|
38
|
+
# Column reference
|
|
39
|
+
# ------------------------------------------------------------
|
|
40
|
+
if isinstance(node, ast.Name):
|
|
41
|
+
return {"type": "column", "name": node.id}
|
|
42
|
+
|
|
43
|
+
# ------------------------------------------------------------
|
|
44
|
+
# Binary arithmetic: + - * / **
|
|
45
|
+
# ------------------------------------------------------------
|
|
46
|
+
if isinstance(node, ast.BinOp):
|
|
47
|
+
return {
|
|
48
|
+
"type": "binary",
|
|
49
|
+
"op": _op_symbol(node.op),
|
|
50
|
+
"left": _convert(node.left),
|
|
51
|
+
"right": _convert(node.right),
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
# ------------------------------------------------------------
|
|
55
|
+
# Unary arithmetic: -x, +x
|
|
56
|
+
# ------------------------------------------------------------
|
|
57
|
+
if isinstance(node, ast.UnaryOp):
|
|
58
|
+
if isinstance(node.op, ast.UAdd):
|
|
59
|
+
return _convert(node.operand)
|
|
60
|
+
if isinstance(node.op, ast.USub):
|
|
61
|
+
return {
|
|
62
|
+
"type": "binary",
|
|
63
|
+
"op": "*",
|
|
64
|
+
"left": {"type": "literal", "value": -1},
|
|
65
|
+
"right": _convert(node.operand),
|
|
66
|
+
}
|
|
67
|
+
if isinstance(node.op, ast.Not):
|
|
68
|
+
return {
|
|
69
|
+
"type": "unary_bool",
|
|
70
|
+
"op": "not",
|
|
71
|
+
"value": _convert(node.operand),
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
# ------------------------------------------------------------
|
|
75
|
+
# Boolean operations: and/or
|
|
76
|
+
# ------------------------------------------------------------
|
|
77
|
+
if isinstance(node, ast.BoolOp):
|
|
78
|
+
op = "and" if isinstance(node.op, ast.And) else "or"
|
|
79
|
+
return {
|
|
80
|
+
"type": "bool_op",
|
|
81
|
+
"op": op,
|
|
82
|
+
"values": [_convert(v) for v in node.values],
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
# ------------------------------------------------------------
|
|
86
|
+
# Comparisons: == != > < >= <=
|
|
87
|
+
# ------------------------------------------------------------
|
|
88
|
+
if isinstance(node, ast.Compare):
|
|
89
|
+
# Python allows chained comparisons: a < b < c
|
|
90
|
+
# We only support simple binary comparisons
|
|
91
|
+
if len(node.ops) != 1 or len(node.comparators) != 1:
|
|
92
|
+
raise NotImplementedError("Chained comparisons not supported")
|
|
93
|
+
|
|
94
|
+
op = _cmp_symbol(node.ops[0])
|
|
95
|
+
return {
|
|
96
|
+
"type": "cmp",
|
|
97
|
+
"op": op,
|
|
98
|
+
"left": _convert(node.left),
|
|
99
|
+
"right": _convert(node.comparators[0]),
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
# ------------------------------------------------------------
|
|
103
|
+
# Ternary: a if cond else b
|
|
104
|
+
# ------------------------------------------------------------
|
|
105
|
+
if isinstance(node, ast.IfExp):
|
|
106
|
+
return {
|
|
107
|
+
"type": "if_expr",
|
|
108
|
+
"cond": _convert(node.test),
|
|
109
|
+
"then": _convert(node.body),
|
|
110
|
+
"else": _convert(node.orelse),
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
# ------------------------------------------------------------
|
|
114
|
+
# Function calls: min, max, abs, log, exp
|
|
115
|
+
# ------------------------------------------------------------
|
|
116
|
+
if isinstance(node, ast.Call):
|
|
117
|
+
if not isinstance(node.func, ast.Name):
|
|
118
|
+
raise NotImplementedError("Only simple function calls supported")
|
|
119
|
+
|
|
120
|
+
name = node.func.id
|
|
121
|
+
args = [_convert(a) for a in node.args]
|
|
122
|
+
|
|
123
|
+
return {
|
|
124
|
+
"type": "call",
|
|
125
|
+
"name": name,
|
|
126
|
+
"args": args,
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
raise NotImplementedError(f"Unsupported AST node: {type(node)}")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _op_symbol(op):
|
|
133
|
+
"""Map Python AST operator → string symbol."""
|
|
134
|
+
if isinstance(op, ast.Add):
|
|
135
|
+
return "+"
|
|
136
|
+
if isinstance(op, ast.Sub):
|
|
137
|
+
return "-"
|
|
138
|
+
if isinstance(op, ast.Mult):
|
|
139
|
+
return "*"
|
|
140
|
+
if isinstance(op, ast.Div):
|
|
141
|
+
return "/"
|
|
142
|
+
if isinstance(op, ast.Pow):
|
|
143
|
+
return "**"
|
|
144
|
+
if isinstance(op, ast.Mod):
|
|
145
|
+
return "%"
|
|
146
|
+
if isinstance(op, ast.FloorDiv):
|
|
147
|
+
return "//"
|
|
148
|
+
raise NotImplementedError(f"Unsupported operator: {type(op)}")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _cmp_symbol(op):
|
|
152
|
+
"""Map Python AST comparison operator → string symbol."""
|
|
153
|
+
if isinstance(op, ast.Eq):
|
|
154
|
+
return "=="
|
|
155
|
+
if isinstance(op, ast.NotEq):
|
|
156
|
+
return "!="
|
|
157
|
+
if isinstance(op, ast.Gt):
|
|
158
|
+
return ">"
|
|
159
|
+
if isinstance(op, ast.Lt):
|
|
160
|
+
return "<"
|
|
161
|
+
if isinstance(op, ast.GtE):
|
|
162
|
+
return ">="
|
|
163
|
+
if isinstance(op, ast.LtE):
|
|
164
|
+
return "<="
|
|
165
|
+
raise NotImplementedError(f"Unsupported comparison operator: {type(op)}")
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# additory/core/backends/__init__.py
|
|
2
|
+
# Backend support system
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Backend Support Module
|
|
6
|
+
|
|
7
|
+
This module provides universal backend support for dataframes:
|
|
8
|
+
- Arrow bridge for cross-backend compatibility
|
|
9
|
+
- Enhanced cuDF support with GPU acceleration
|
|
10
|
+
- Memory management and cleanup
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
# Backend functionality
|
|
14
|
+
from .arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
|
|
15
|
+
from .cudf_bridge import get_cudf_bridge, EnhancedCuDFBridge, CuDFBridgeError
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
'EnhancedArrowBridge',
|
|
19
|
+
'ArrowBridgeError',
|
|
20
|
+
'get_cudf_bridge',
|
|
21
|
+
'EnhancedCuDFBridge',
|
|
22
|
+
'CuDFBridgeError'
|
|
23
|
+
]
|