additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/synthetic/deduce.py
DELETED
|
@@ -1,259 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Text-based label deduction for additory.
|
|
4
|
-
|
|
5
|
-
Uses term-frequency (bag-of-words) vectors + cosine similarity to deduce labels from text.
|
|
6
|
-
Pure Python, no LLMs, offline-first.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import math
|
|
10
|
-
import re
|
|
11
|
-
from collections import Counter
|
|
12
|
-
from typing import Union, List, Optional
|
|
13
|
-
import pandas as pd
|
|
14
|
-
import polars as pl
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def tokenize(text: str) -> List[str]:
    """
    Tokenize text into lowercase alphanumeric words.

    Any non-string input (including None) yields an empty list, so callers
    can pass raw cell values without pre-checking their type.

    Args:
        text: Input text

    Returns:
        List of lowercase tokens
    """
    # isinstance(None, str) is False, so the former explicit None check
    # was redundant.
    if not isinstance(text, str):
        return []

    # Equivalent to the original lowercase -> replace non-[a-z0-9] with
    # space -> split pipeline: extract maximal alphanumeric runs directly.
    return re.findall(r"[a-z0-9]+", text.lower())
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def vectorize(tokens: List[str]) -> Counter:
    """
    Build a term-frequency (TF) vector from a token list.

    Args:
        tokens: List of tokens

    Returns:
        Counter mapping each distinct term to its occurrence count
    """
    tf: Counter = Counter()
    tf.update(tokens)
    return tf
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def cosine_similarity(v1: Counter, v2: Counter) -> float:
    """
    Compute the cosine similarity between two term-frequency vectors.

    Args:
        v1: First vector (Counter)
        v2: Second vector (Counter)

    Returns:
        Similarity score in [0, 1]; 0.0 when either vector is empty
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(v1) & set(v2)
    dot = sum(v1[term] * v2[term] for term in shared_terms)

    # Euclidean magnitudes of each vector.
    norm1 = math.sqrt(sum(count * count for count in v1.values()))
    norm2 = math.sqrt(sum(count * count for count in v2.values()))

    # A zero-magnitude vector has no direction; define similarity as 0.
    if norm1 == 0 or norm2 == 0:
        return 0.0

    return dot / (norm1 * norm2)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def _deduce_polars(
    df: pl.DataFrame,
    from_column: Union[str, List[str]],
    to_column: str,
    min_examples: int = 3
) -> pl.DataFrame:
    """
    Deduce missing labels using text similarity (Polars-native).

    Implements 1-nearest-neighbor classification: each unlabeled row is
    assigned the label of the labeled row whose term-frequency vector has
    the highest cosine similarity. Plain TF vectors are used (no IDF
    weighting, despite the module-level mention of TF-IDF).

    NOTE(review): row order is NOT preserved — the result is labeled rows
    followed by (newly labeled) unlabeled rows, because the two partitions
    are filtered apart and re-concatenated.

    Args:
        df: Polars DataFrame
        from_column: Text column(s) to analyze; multiple columns are
            concatenated with spaces before vectorizing
        to_column: Label column to fill (nulls are treated as unlabeled)
        min_examples: Minimum labeled examples recommended; fewer than
            this only triggers a warning print, processing still proceeds

    Returns:
        DataFrame with deduced labels

    Raises:
        ValueError: If a referenced column is missing, or if there are
            zero labeled examples to learn from
    """
    # Normalize from_column to list
    if isinstance(from_column, str):
        source_cols = [from_column]
    else:
        source_cols = from_column

    # Validate columns exist
    for col in source_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")

    if to_column not in df.columns:
        raise ValueError(f"Column '{to_column}' not found in DataFrame")

    # Create combined text column if multiple sources. fill_null("") keeps
    # concat_str from nulling out the whole combined string.
    if len(source_cols) == 1:
        text_col = source_cols[0]
        df_work = df.clone()
    else:
        # Concatenate multiple columns with spaces
        df_work = df.with_columns([
            pl.concat_str(
                [pl.col(c).fill_null("") for c in source_cols],
                separator=" "
            ).alias("__deduce_text__")
        ])
        text_col = "__deduce_text__"

    # Split into labeled (non-null target) and unlabeled (null target) rows
    labeled_df = df_work.filter(pl.col(to_column).is_not_null())
    unlabeled_df = df_work.filter(pl.col(to_column).is_null())

    # Check if we have enough labeled examples
    n_labeled = len(labeled_df)
    if n_labeled == 0:
        raise ValueError(
            f"⚠️ Cannot deduce labels: No labeled examples found in '{to_column}' column.\n"
            f"Please manually label at least {min_examples} examples per category, then run again.\n\n"
            f"Note: additory uses pure Python text similarity (no LLMs, no external calls).\n"
            f"Your data never leaves your machine."
        )

    # Below the recommended threshold: warn but continue (total count, not
    # per-category as the error message above suggests).
    if n_labeled < min_examples:
        print(
            f"⚠️ Only {n_labeled} labeled examples found. "
            f"For better accuracy, label at least {min_examples} examples.\n"
            f"Proceeding with available data..."
        )

    # If no unlabeled rows, return original (minus any temp column)
    if len(unlabeled_df) == 0:
        if len(source_cols) > 1:
            # Remove temporary column
            return df_work.drop("__deduce_text__")
        return df_work

    # Precompute TF vectors for labeled rows once; reused for every
    # unlabeled row below.
    labeled_vectors = []
    for row in labeled_df.iter_rows(named=True):
        text = row[text_col]
        label = row[to_column]
        tokens = tokenize(text)
        vec = vectorize(tokens)
        labeled_vectors.append((vec, label))

    # Deduce labels for unlabeled rows (1-NN over cosine similarity)
    deduced_labels = []
    for row in unlabeled_df.iter_rows(named=True):
        text = row[text_col]
        tokens = tokenize(text)
        vec = vectorize(tokens)

        # Find most similar labeled example. best_score starts at -1.0 so
        # even an all-zero similarity still picks the first label rather
        # than leaving None.
        best_label = None
        best_score = -1.0

        for labeled_vec, label in labeled_vectors:
            score = cosine_similarity(vec, labeled_vec)
            if score > best_score:
                best_score = score
                best_label = label

        deduced_labels.append(best_label)

    # Create deduced labels series (replaces the all-null target column
    # of the unlabeled partition by name).
    deduced_series = pl.Series(to_column, deduced_labels)

    # Update unlabeled rows with deduced labels
    unlabeled_df = unlabeled_df.with_columns([deduced_series])

    # Combine labeled and unlabeled back together (original row order is
    # not restored — see docstring note).
    result_df = pl.concat([labeled_df, unlabeled_df])

    # Remove temporary column if created
    if len(source_cols) > 1:
        result_df = result_df.drop("__deduce_text__")

    # Print success message
    n_deduced = len(deduced_labels)
    print(f"✓ Deduced {n_deduced} label{'s' if n_deduced != 1 else ''} from {n_labeled} examples (offline, no LLMs)")

    return result_df
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
def deduce(
    df: Union[pd.DataFrame, pl.DataFrame],
    from_column: Union[str, List[str]],
    to_column: str
) -> Union[pd.DataFrame, pl.DataFrame]:
    """
    Deduce missing labels based on text similarity to labeled examples.

    Uses cosine similarity on term-frequency vectors (no IDF weighting).
    Pure Python, no LLMs, offline-first. At least one labeled example is
    required; fewer than 3 triggers an accuracy warning but still runs.

    When multiple source columns are provided, they are concatenated with
    spaces before computing similarity.

    Args:
        df: DataFrame with some labeled and some unlabeled rows
        from_column: Text column(s) to analyze
            - str: Single column (e.g., "comment")
            - List[str]: Multiple columns (e.g., ["comment", "notes"])
        to_column: Label column to fill (e.g., "status")

    Returns:
        DataFrame with deduced labels filled in, in the same backend as
        the input (pandas in → pandas out, Polars in → Polars out,
        Arrow-compatible in → Arrow table out)

    Raises:
        TypeError: If the input is not pandas, Polars, or Arrow-convertible

    Examples:
        # Single column
        >>> result = add.deduce(df, from_column="comment", to_column="status")

        # Multiple columns (better accuracy)
        >>> result = add.deduce(
        ...     df,
        ...     from_column=["comment", "notes", "description"],
        ...     to_column="status"
        ... )

    Privacy: Your data never leaves your machine. No external connections.
    """
    # Detect input backend and normalize to Polars for processing.
    if isinstance(df, pd.DataFrame):
        backend = "pandas"
        df_polars = pl.from_pandas(df)
    elif isinstance(df, pl.DataFrame):
        backend = "polars"
        df_polars = df
    else:
        # Fall back to the Arrow bridge (covers cudf and other
        # Arrow-compatible frames).
        try:
            df_polars = pl.from_arrow(df)
            backend = "arrow"
        except Exception as exc:
            # Chain the conversion failure so callers can see WHY the
            # Arrow bridge rejected the object, not just the type name.
            raise TypeError(f"Unsupported DataFrame type: {type(df)}") from exc

    # Process in Polars
    result_polars = _deduce_polars(df_polars, from_column, to_column)

    # Convert back to the caller's original format
    if backend == "pandas":
        return result_polars.to_pandas()
    if backend == "polars":
        return result_polars
    return result_polars.to_arrow()  # arrow
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Distribution Strategies for Synthetic Data Generation
|
|
3
|
-
|
|
4
|
-
DEPRECATED: This module has been moved to additory.common.distributions
|
|
5
|
-
Please update your imports to use additory.common.distributions instead.
|
|
6
|
-
|
|
7
|
-
This file is kept for backward compatibility and will be removed in a future version.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
import warnings
|
|
11
|
-
|
|
12
|
-
# Issue deprecation warning
|
|
13
|
-
warnings.warn(
|
|
14
|
-
"additory.synthetic.distributions is deprecated. "
|
|
15
|
-
"Please use additory.common.distributions instead. "
|
|
16
|
-
"This module will be removed in a future version.",
|
|
17
|
-
DeprecationWarning,
|
|
18
|
-
stacklevel=2
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
# Import everything from common.distributions for backward compatibility
|
|
22
|
-
from additory.common.distributions import * # noqa: F401, F403
|