PyPI - additory - Versions diffs - 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl - Mend

additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121) hide show

additory/__init__.py +58 -14
additory/common/__init__.py +31 -147
additory/common/column_selector.py +255 -0
additory/common/distributions.py +286 -613
additory/common/extractors.py +313 -0
additory/common/knn_imputation.py +332 -0
additory/common/result.py +380 -0
additory/common/strategy_parser.py +243 -0
additory/common/unit_conversions.py +338 -0
additory/common/validation.py +283 -103
additory/core/__init__.py +34 -22
additory/core/backend.py +258 -0
additory/core/config.py +177 -305
additory/core/logging.py +230 -24
additory/core/memory_manager.py +157 -495
additory/expressions/__init__.py +2 -23
additory/expressions/compiler.py +457 -0
additory/expressions/engine.py +264 -487
additory/expressions/integrity.py +179 -0
additory/expressions/loader.py +263 -0
additory/expressions/parser.py +363 -167
additory/expressions/resolver.py +274 -0
additory/functions/__init__.py +1 -0
additory/functions/analyze/__init__.py +144 -0
additory/functions/analyze/cardinality.py +58 -0
additory/functions/analyze/correlations.py +66 -0
additory/functions/analyze/distributions.py +53 -0
additory/functions/analyze/duplicates.py +49 -0
additory/functions/analyze/features.py +61 -0
additory/functions/analyze/imputation.py +66 -0
additory/functions/analyze/outliers.py +65 -0
additory/functions/analyze/patterns.py +65 -0
additory/functions/analyze/presets.py +72 -0
additory/functions/analyze/quality.py +59 -0
additory/functions/analyze/timeseries.py +53 -0
additory/functions/analyze/types.py +45 -0
additory/functions/expressions/__init__.py +161 -0
additory/functions/snapshot/__init__.py +82 -0
additory/functions/snapshot/filter.py +119 -0
additory/functions/synthetic/__init__.py +113 -0
additory/functions/synthetic/mode_detector.py +47 -0
additory/functions/synthetic/strategies/__init__.py +1 -0
additory/functions/synthetic/strategies/advanced.py +35 -0
additory/functions/synthetic/strategies/augmentative.py +160 -0
additory/functions/synthetic/strategies/generative.py +168 -0
additory/functions/synthetic/strategies/presets.py +116 -0
additory/functions/to/__init__.py +188 -0
additory/functions/to/lookup.py +351 -0
additory/functions/to/merge.py +189 -0
additory/functions/to/sort.py +91 -0
additory/functions/to/summarize.py +170 -0
additory/functions/transform/__init__.py +140 -0
additory/functions/transform/datetime.py +79 -0
additory/functions/transform/extract.py +85 -0
additory/functions/transform/harmonize.py +105 -0
additory/functions/transform/knn.py +62 -0
additory/functions/transform/onehotencoding.py +68 -0
additory/functions/transform/transpose.py +42 -0
additory-0.1.1a1.dist-info/METADATA +83 -0
additory-0.1.1a1.dist-info/RECORD +62 -0
additory/analysis/__init__.py +0 -48
additory/analysis/cardinality.py +0 -126
additory/analysis/correlations.py +0 -124
additory/analysis/distributions.py +0 -376
additory/analysis/quality.py +0 -158
additory/analysis/scan.py +0 -400
additory/common/backend.py +0 -371
additory/common/column_utils.py +0 -191
additory/common/exceptions.py +0 -62
additory/common/lists.py +0 -229
additory/common/patterns.py +0 -240
additory/common/resolver.py +0 -567
additory/common/sample_data.py +0 -182
additory/core/ast_builder.py +0 -165
additory/core/backends/__init__.py +0 -23
additory/core/backends/arrow_bridge.py +0 -483
additory/core/backends/cudf_bridge.py +0 -355
additory/core/column_positioning.py +0 -358
additory/core/compiler_polars.py +0 -166
additory/core/enhanced_cache_manager.py +0 -1119
additory/core/enhanced_matchers.py +0 -473
additory/core/enhanced_version_manager.py +0 -325
additory/core/executor.py +0 -59
additory/core/integrity_manager.py +0 -477
additory/core/loader.py +0 -190
additory/core/namespace_manager.py +0 -657
additory/core/parser.py +0 -176
additory/core/polars_expression_engine.py +0 -601
additory/core/registry.py +0 -177
additory/core/sample_data_manager.py +0 -492
additory/core/user_namespace.py +0 -751
additory/core/validator.py +0 -27
additory/dynamic_api.py +0 -352
additory/expressions/proxy.py +0 -549
additory/expressions/registry.py +0 -313
additory/expressions/samples.py +0 -492
additory/synthetic/__init__.py +0 -13
additory/synthetic/column_name_resolver.py +0 -149
additory/synthetic/deduce.py +0 -259
additory/synthetic/distributions.py +0 -22
additory/synthetic/forecast.py +0 -1132
additory/synthetic/linked_list_parser.py +0 -415
additory/synthetic/namespace_lookup.py +0 -129
additory/synthetic/smote.py +0 -320
additory/synthetic/strategies.py +0 -926
additory/synthetic/synthesizer.py +0 -713
additory/utilities/__init__.py +0 -53
additory/utilities/encoding.py +0 -600
additory/utilities/games.py +0 -300
additory/utilities/keys.py +0 -8
additory/utilities/lookup.py +0 -103
additory/utilities/matchers.py +0 -216
additory/utilities/resolvers.py +0 -286
additory/utilities/settings.py +0 -167
additory/utilities/units.py +0 -749
additory/utilities/validators.py +0 -153
additory-0.1.0a4.dist-info/METADATA +0 -311
additory-0.1.0a4.dist-info/RECORD +0 -72
additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
{additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
{additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0

additory/functions/synthetic/strategies/generative.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""
+Generate synthetic data from scratch.
+This module provides functionality to create synthetic data using various strategies.
+"""
+import polars as pl
+from typing import Dict, Any
+import numpy as np
+from additory.common.distributions import (
+    generate_normal, generate_uniform, generate_correlated
+)
+from additory.core.logging import Logger
def generate_data(n_rows: int, strategy: Dict[str, str]) -> pl.DataFrame:
    """
    Generate synthetic data from scratch.

    Every entry of ``strategy`` is turned into one column by
    ``generate_column`` and the columns are assembled into a DataFrame.

    Args:
        n_rows: Number of rows to generate; must be positive.
        strategy: Dictionary mapping column name to generation strategy string.

    Returns:
        DataFrame built from the generated columns.

    Raises:
        ValueError: If ``strategy`` is empty (or ``None``) or ``n_rows`` is
            not positive.

    Example:
        >>> result = generate_data(n_rows=1000, strategy={
        ...     'id': 'increment:start=1',
        ...     'age': 'range:18-65',
        ...     'status': 'choice:[Active,Inactive,Pending]'
        ... })
    """
    logger = Logger()

    # Validate before the first log call: that call evaluates len(strategy),
    # which would turn strategy=None into a TypeError instead of the
    # ValueError this function promises for a missing strategy.
    if not strategy:
        raise ValueError("strategy dictionary cannot be empty")
    if n_rows <= 0:
        raise ValueError(f"n_rows must be positive, got {n_rows}")

    logger.info(f"Generating {n_rows} rows with {len(strategy)} columns")

    # Generate each column independently from its own strategy string.
    columns = {}
    for col_name, strategy_value in strategy.items():
        logger.info(f"Generating column: {col_name}")
        columns[col_name] = generate_column(n_rows, strategy_value)

    # Assemble the generated columns into the final frame.
    result = pl.DataFrame(columns)
    logger.info(f"Generated {len(result)} rows × {len(result.columns)} columns")
    return result
def generate_column(n_rows: int, strategy_value: str) -> pl.Series:
    """
    Generate a single column based on strategy.

    Args:
        n_rows: Number of rows.
        strategy_value: Strategy string.

    Returns:
        Series with generated data.

    Raises:
        ValueError: If the strategy type is unsupported, a required
            parameter is missing from the strategy string, or an increment
            step of zero is requested.

    Supported strategies:
        - increment:start=1:step=1   (both parameters optional, default 1)
        - range:18-65                (inclusive integer bounds)
        - choice:[A,B,C]
        - normal:mean=50:std=10
        - uniform:low=0:high=100
    """
    strategy_type, params = parse_strategy_value(strategy_value)

    def required(name: str, expected_form: str) -> Any:
        # A parameter the parser did not produce means the strategy string is
        # incomplete; report that clearly instead of leaking a bare KeyError.
        if name not in params:
            raise ValueError(
                f"Strategy '{strategy_value}' is incomplete: "
                f"expected the form '{expected_form}'"
            )
        return params[name]

    if strategy_type == 'increment':
        start = params.get('start', 1)
        step = params.get('step', 1)
        if step == 0:
            # range() rejects a zero step with a message that never mentions
            # the strategy; fail early with one that does.
            raise ValueError(
                f"Strategy '{strategy_value}' is invalid: step must not be 0"
            )
        return pl.Series(range(start, start + n_rows * step, step))

    if strategy_type == 'range':
        low, high = required('range', 'range:low-high')
        # randint's upper bound is exclusive; +1 makes `high` reachable.
        return pl.Series(np.random.randint(low, high + 1, n_rows))

    if strategy_type == 'choice':
        choices = required('choices', 'choice:[A,B,C]')
        weights = params.get('weights')
        if weights:
            return pl.Series(np.random.choice(choices, n_rows, p=weights))
        return pl.Series(np.random.choice(choices, n_rows))

    if strategy_type == 'normal':
        mean = required('mean', 'normal:mean=50:std=10')
        std = required('std', 'normal:mean=50:std=10')
        return generate_normal(n_rows, mean, std)

    if strategy_type == 'uniform':
        low = required('low', 'uniform:low=0:high=100')
        high = required('high', 'uniform:low=0:high=100')
        return generate_uniform(n_rows, low, high)

    raise ValueError(f"Unsupported strategy type: {strategy_type}")
def parse_strategy_value(strategy_value: str) -> tuple[str, Dict[str, Any]]:
    """
    Parse strategy string into type and parameters.

    The text before the first ``:`` is the strategy type; what follows is
    interpreted per type:

    - ``increment``: ``key=value`` segments parsed as ``int``
    - ``normal`` / ``uniform``: ``key=value`` segments parsed as ``float``
    - ``range``: ``low-high`` integer bounds (either bound may be negative)
    - ``choice``: comma-separated values, optionally wrapped in ``[...]``;
      values inside the brackets may themselves contain ``:``

    Unknown strategy types are returned with an empty parameter dict.

    Args:
        strategy_value: Strategy string.

    Returns:
        Tuple of (strategy_type, parameters).

    Raises:
        ValueError: If a numeric parameter or a range bound cannot be parsed.

    Examples:
        >>> parse_strategy_value('increment:start=1:step=2')
        ('increment', {'start': 1, 'step': 2})
        >>> parse_strategy_value('range:18-65')
        ('range', {'range': (18, 65)})
        >>> parse_strategy_value('range:-5-10')
        ('range', {'range': (-5, 10)})
        >>> parse_strategy_value('choice:[A,B,C]')
        ('choice', {'choices': ['A', 'B', 'C']})
    """
    # NOTE: no 'weights' entry is ever produced for 'choice', so weighted
    # choices cannot currently be requested through a strategy string.
    strategy_type, has_args, remainder = strategy_value.partition(':')
    strategy_type = strategy_type.strip()
    params: Dict[str, Any] = {}

    def read_key_values(cast) -> None:
        # Shared parser for 'key=value' segments separated by ':'.
        for segment in remainder.split(':'):
            key, has_value, raw = segment.partition('=')
            if not has_value:
                continue  # segments without '=' are ignored
            try:
                # Strip the key so 'start = 5' is not stored under 'start '
                # and then silently ignored by the consumer.
                params[key.strip()] = cast(raw)
            except ValueError as err:
                raise ValueError(
                    f"Invalid parameter '{segment}' in strategy "
                    f"'{strategy_value}'"
                ) from err

    if strategy_type == 'increment':
        read_key_values(int)

    elif strategy_type == 'range':
        if has_args:
            bounds = remainder.split(':', 1)[0].strip()
            # Search from index 1 so a leading '-' is read as the sign of the
            # lower bound rather than as the low/high separator.
            cut = bounds.find('-', 1)
            try:
                if cut == -1:
                    raise ValueError("missing '-' separator")
                low, high = int(bounds[:cut]), int(bounds[cut + 1:])
            except ValueError as err:
                raise ValueError(
                    f"Invalid range '{bounds}' in strategy "
                    f"'{strategy_value}': expected 'low-high'"
                ) from err
            params['range'] = (low, high)

    elif strategy_type == 'choice':
        if has_args:
            text = remainder.strip()
            closing = text.rfind(']')
            if text.startswith('[') and closing > 0:
                # Bracketed list: keep everything between the brackets so
                # values such as '10:30' are not cut at the colon.
                choices_str = text[1:closing]
            else:
                # Unbracketed form: only the first ':'-separated segment.
                choices_str = remainder.split(':', 1)[0]
            choices_str = choices_str.strip('[]')
            params['choices'] = [c.strip() for c in choices_str.split(',')]

    elif strategy_type in ('normal', 'uniform'):
        read_key_values(float)

    return strategy_type, params

additory/functions/synthetic/strategies/presets.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""
+Preset configurations for common synthetic data types.
+This module provides preset configurations for generating common data types.
+"""
+import polars as pl
+from typing import Dict, List
+from additory.functions.synthetic.strategies.generative import generate_data
+from additory.core.logging import Logger
+# Define presets
# Built-in presets: preset name -> {column name -> strategy string}.
# Insertion order is the order reported by list_presets().
PRESETS: Dict[str, Dict[str, str]] = {
    # Basic user/account table.
    "users": {
        "id": "increment:start=1",
        "age": "range:18-75",
        "status": "choice:[Active,Inactive,Pending]",
        "country": "choice:[USA,UK,Canada,Australia,Germany]",
    },
    # Payment-style records.
    "transactions": {
        "transaction_id": "increment:start=1000",
        "amount": "uniform:low=10:high=1000",
        "currency": "choice:[USD,EUR,GBP]",
        "status": "choice:[completed,pending,failed]",
    },
    # Product catalogue.
    "products": {
        "product_id": "increment:start=1",
        "price": "uniform:low=5:high=500",
        "stock": "range:0-1000",
        "category": "choice:[Electronics,Clothing,Food,Books]",
    },
    # Single normally distributed value column.
    "timeseries": {
        "value": "normal:mean=100:std=20",
    },
    # Patient-style records.
    "medical": {
        "patient_id": "increment:start=1",
        "age": "range:18-90",
        "weight": "normal:mean=70:std=15",
        "height": "normal:mean=170:std=10",
    },
}
def apply_preset(preset_name: str, n_rows: int) -> pl.DataFrame:
    """
    Generate a DataFrame from one of the named presets.

    Args:
        preset_name: Key into ``PRESETS``.
        n_rows: Number of rows to generate.

    Returns:
        DataFrame produced by ``generate_data`` for the preset's strategy.

    Raises:
        ValueError: If ``preset_name`` is not a known preset; the message
            lists the available preset names.

    Example:
        >>> result = apply_preset('users', n_rows=1000)
    """
    log = Logger()

    # Reject unknown names up front so the error can list the valid ones.
    if preset_name not in PRESETS:
        known_names = ', '.join(list_presets())
        raise ValueError(
            f"Preset '{preset_name}' not found. "
            f"Available presets: {known_names}"
        )

    log.info(f"Applying preset: {preset_name}")

    # Resolve the preset to its strategy mapping and generate from it.
    frame = generate_data(n_rows, get_preset_strategy(preset_name))

    log.info(f"Preset '{preset_name}' applied: {len(frame)} rows")
    return frame
def get_preset_strategy(preset_name: str) -> Dict[str, str]:
    """
    Return the strategy mapping registered for a preset.

    The returned dictionary is a shallow copy, so callers may modify it
    without altering ``PRESETS``.

    Args:
        preset_name: Key into ``PRESETS``.

    Returns:
        Copy of the preset's column-to-strategy dictionary.

    Raises:
        ValueError: If ``preset_name`` is not a known preset.
    """
    if preset_name in PRESETS:
        return dict(PRESETS[preset_name])
    raise ValueError(f"Preset '{preset_name}' not found")
def list_presets() -> List[str]:
    """
    Return the names of all registered presets.

    Returns:
        Preset names in the order they are defined in ``PRESETS``.

    Example:
        >>> presets = list_presets()
        >>> print(presets)
        ['users', 'transactions', 'products', 'timeseries', 'medical']
    """
    return [preset_name for preset_name in PRESETS]

additory/functions/to/__init__.py ADDED Viewed

@@ -0,0 +1,188 @@
+"""
+Main to function - add columns to DataFrame.
+This module provides the main user-facing to() function with multiple modes:
+- lookup: Add columns from reference DataFrame
+- summarize: Group and aggregate data
+- merge: Merge multiple DataFrames
+- sort: Sort DataFrame
+"""
+import polars as pl
+from typing import Union, List, Optional, Dict, Any
+from additory.core.backend import detect_backend, to_polars, from_polars
+from additory.core.logging import Logger
+from additory.core.memory_manager import MemoryManager
+from additory.common.validation import validate_dataframe, validate_not_empty
+from additory.common.result import wrap_result
+from additory.functions.to.lookup import perform_lookup
+from additory.functions.to.summarize import perform_summarize
+from additory.functions.to.merge import perform_merge
+from additory.functions.to.sort import perform_sort
def to(
    df: Any,
    from_df: Optional[Any] = None,
    bring: Optional[Union[str, List[str]]] = None,
    against: Optional[Union[str, List[str]]] = None,
    on: Optional[Union[str, List[str]]] = None,  # Alias for 'against'
    to: Optional[str] = None,
    bring_at: Optional[str] = None,
    strategy: Optional[Dict] = None,
    **kwargs
) -> Any:
    """
    Add columns to DataFrame using various modes.

    The input is converted to Polars, handed to the handler for the detected
    mode, and the result is converted back to the backend of the input.

    Args:
        df: Input DataFrame (or list of DataFrames for merge mode)
        from_df: Reference DataFrame (for lookup mode)
        bring: Column(s) to bring (for lookup mode)
        against: Key column(s) for matching (for lookup mode)
        on: Alias for ``against``; the two may not be given together
        to: Special mode indicator ('@summarize', '@merge', '@sort')
        bring_at: Position to insert columns
        strategy: Strategy dictionary for advanced control
        **kwargs: Mode-specific parameters, forwarded to the summarize,
            merge and sort handlers

    Returns:
        DataFrame with added columns (wrapped in Result)

    Raises:
        ValueError: If both ``on`` and ``against`` are given, the mode cannot
            be determined, or merge mode receives an empty list.
        TypeError: If merge mode is requested and ``df`` is not a list.

    Modes:
        1. Lookup (default): Add columns from reference DataFrame
           - Triggered by: from_df is provided
           - Example: to(df, from_df=products, bring='price', against='product_id')
        2. Summarize: Group and aggregate data
           - Triggered by: to='@summarize'
           - Example: to(df, to='@summarize', group_by='category', aggregations={'sales': 'sum'})
        3. Merge: Merge multiple DataFrames
           - Triggered by: to='@merge'
           - Example: to([df1, df2, df3], to='@merge')
        4. Sort: Sort DataFrame
           - Triggered by: to='@sort'
           - Example: to(df, to='@sort', by='date', descending=True)
    """
    logger = Logger()
    memory_manager = MemoryManager()

    # Handle 'on' as alias for 'against'
    if on is not None and against is None:
        against = on
    elif on is not None and against is not None:
        raise ValueError("Cannot specify both 'on' and 'against' parameters")

    try:
        try:
            # Detect mode
            mode = detect_mode(df, from_df, to, **kwargs)
            logger.set_context('to', {'mode': mode})
            logger.info(f"Starting to() function in '{mode}' mode")

            # Handle merge mode specially (df is a list)
            if mode == 'merge':
                if not isinstance(df, list):
                    raise TypeError("For merge mode, df must be a list of DataFrames")
                if not df:
                    # df[0] below would otherwise fail with an IndexError
                    # that says nothing about the actual problem.
                    raise ValueError(
                        "For merge mode, df must contain at least one DataFrame"
                    )
                # The first DataFrame decides the backend of the result.
                backend = detect_backend(df[0])
                polars_dfs = [to_polars(d) for d in df]
                result = perform_merge(polars_dfs, **kwargs)
            else:
                # Single DataFrame modes
                validate_dataframe(df)
                validate_not_empty(df)

                # Detect backend and convert to Polars
                backend = detect_backend(df)
                polars_df = to_polars(df)

                # Dispatch to appropriate mode
                if mode == 'summarize':
                    result = perform_summarize(polars_df, **kwargs)
                elif mode == 'sort':
                    result = perform_sort(polars_df, **kwargs)
                elif mode == 'lookup':
                    polars_from_df = to_polars(from_df)
                    result = perform_lookup(
                        polars_df,
                        polars_from_df,
                        bring,
                        against,
                        bring_at,
                        strategy
                    )
                else:
                    raise ValueError(f"Unknown mode: {mode}")

            # Convert back to original backend
            result = from_polars(result, backend)
        finally:
            # Run cleanup on failure as well as success; it used to be
            # skipped whenever any step above raised.
            memory_manager.cleanup()

        # Wrap result
        logger.info(f"to() function complete: {len(result)} rows, {len(result.columns)} columns")
        return wrap_result(result, 'to', metadata={'mode': mode})
    except Exception as e:
        # Log at the API boundary, then let the caller see the original error.
        logger.error(f"Error in to() function: {str(e)}", error_location="to")
        raise
def detect_mode(
    df: Any,
    from_df: Optional[Any],
    to: Optional[str],
    **kwargs
) -> str:
    """
    Detect which mode to use based on parameters.

    An explicit ``to`` indicator wins; otherwise the presence of ``from_df``
    selects lookup mode. ``df`` and ``**kwargs`` are accepted for interface
    compatibility and are not inspected.

    Args:
        df: Input DataFrame
        from_df: Reference DataFrame
        to: Mode indicator
        **kwargs: Additional parameters

    Returns:
        Mode string ('lookup', 'summarize', 'merge', 'sort')

    Raises:
        ValueError: If mode cannot be determined. When ``to`` was given but
            is not a recognized indicator, the message names the value.
    """
    explicit_modes = {
        '@summarize': 'summarize',
        '@merge': 'merge',
        '@sort': 'sort',
    }

    # Check for explicit mode indicators (guarded so that an unhashable
    # value simply counts as "not an indicator").
    if isinstance(to, str) and to in explicit_modes:
        return explicit_modes[to]

    # Check for lookup mode (from_df provided).
    # NOTE(review): an unrecognized `to` is ignored here when from_df is
    # given; kept as-is so existing calls continue to work.
    if from_df is not None:
        return 'lookup'

    # Cannot determine mode. Name an unrecognized indicator so a typo such
    # as '@summarise' is visible in the error rather than silently dropped.
    prefix = "" if to is None else f"Unrecognized mode indicator to={to!r}. "
    raise ValueError(
        prefix
        + "Cannot determine mode. Please provide either:\n"
        "  - from_df (for lookup mode)\n"
        "  - to='@summarize' (for summarize mode)\n"
        "  - to='@merge' (for merge mode)\n"
        "  - to='@sort' (for sort mode)"
    )
# Public API of this module: only the to() entry point is exported.
__all__ = ["to"]

additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl