dataforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0.dist-info/METADATA +34 -0
- dataforge_ml-0.1.0.dist-info/RECORD +54 -0
- dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
- dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
- models/__init__.py +0 -0
- models/_data_structure.py +7 -0
- models/_data_types.py +12 -0
- profiling/__init__.py +35 -0
- profiling/_base.py +101 -0
- profiling/_boolean_config.py +37 -0
- profiling/_boolean_profiler.py +191 -0
- profiling/_categorical.py +315 -0
- profiling/_categorical_config.py +87 -0
- profiling/_correlation_config.py +225 -0
- profiling/_correlation_profiler.py +544 -0
- profiling/_datetime_config.py +98 -0
- profiling/_datetime_profiler.py +406 -0
- profiling/_missingness_config.py +137 -0
- profiling/_missingness_profiler.py +252 -0
- profiling/_numeric_config.py +116 -0
- profiling/_numeric_profiler.py +403 -0
- profiling/_tabular.py +249 -0
- profiling/_target_config.py +74 -0
- profiling/_target_profiler.py +156 -0
- profiling/_text_config.py +40 -0
- profiling/_text_profiler.py +194 -0
- profiling/_type_detector.py +463 -0
- profiling/config.py +236 -0
- profiling/structural.py +280 -0
- splitting/__init__.py +4 -0
- splitting/_config.py +56 -0
- splitting/_splitter.py +202 -0
- tests/__init__.py +0 -0
- tests/conftest.py +7 -0
- tests/integration/__init__.py +0 -0
- tests/integration/conftest.py +82 -0
- tests/integration/test_structural_end_to_end.py +219 -0
- tests/unit/__init__.py +0 -0
- tests/unit/profiling/__init__.py +0 -0
- tests/unit/profiling/conftest.py +81 -0
- tests/unit/profiling/test_boolean_profiler.py +91 -0
- tests/unit/profiling/test_categorical_profiler.py +182 -0
- tests/unit/profiling/test_correlation_profiler.py +124 -0
- tests/unit/profiling/test_datetime_profiler.py +133 -0
- tests/unit/profiling/test_missingness_profiler.py +51 -0
- tests/unit/profiling/test_numeric_profiler.py +212 -0
- tests/unit/profiling/test_target_profiler.py +44 -0
- tests/unit/profiling/test_text_profiler.py +61 -0
- tests/unit/profiling/test_type_detector.py +32 -0
- tests/unit/splitting/__init__.py +0 -0
- tests/unit/splitting/test_data_splitter.py +417 -0
- utils/__init__.py +0 -0
- utils/data_loader.py +110 -0
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CategoricalProfiler – Phase 1 extension: Categorical Column Profiling.
|
|
3
|
+
|
|
4
|
+
Per-column metrics (opt-in via ProfileConfig.categorical_columns):
|
|
5
|
+
1. Cardinality & unique ratio
|
|
6
|
+
2. Ordinal vs nominal detection
|
|
7
|
+
3. Top value counts (top 10) with percentages
|
|
8
|
+
4. Rare category analysis (<1 % frequency threshold)
|
|
9
|
+
5. Whitespace-only value count
|
|
10
|
+
6. Mixed-type flag (some values numeric, some not)
|
|
11
|
+
7. Free-text / natural-language flag
|
|
12
|
+
(avg word count >5 OR avg char length >50 OR avg token count >10)
|
|
13
|
+
8. Imbalance metrics
|
|
14
|
+
– class ratio (max_freq / min_freq)
|
|
15
|
+
– Shannon entropy
|
|
16
|
+
– Gini impurity
|
|
17
|
+
|
|
18
|
+
Integration
|
|
19
|
+
-----------
|
|
20
|
+
Add `categorical_columns: list[str] | None` to ProfileConfig, then call::
|
|
21
|
+
|
|
22
|
+
from profiling._categorical import CategoricalProfiler
|
|
23
|
+
|
|
24
|
+
cat_profiler = CategoricalProfiler(config=cfg)
cat_result = cat_profiler.profile(
    df,
    columns=["status", "country", "product_type"],
)
|
|
29
|
+
|
|
30
|
+
The result is a CategoricalProfileResult; attach it to TabularProfileResult
|
|
31
|
+
however suits your downstream pipeline.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import math
|
|
37
|
+
|
|
38
|
+
import polars as pl
|
|
39
|
+
from ._base import ColumnBatchProfiler
|
|
40
|
+
from ._categorical_config import (
|
|
41
|
+
CategoricalProfileResult,
|
|
42
|
+
CategoricalStats,
|
|
43
|
+
TopValueEntry,
|
|
44
|
+
CategoricalFlag,
|
|
45
|
+
RareCategoryStats,
|
|
46
|
+
ImbalanceMetrics,
|
|
47
|
+
)
|
|
48
|
+
from .config import (
|
|
49
|
+
ProfileConfig,
|
|
50
|
+
SemanticType,
|
|
51
|
+
)
|
|
52
|
+
from ..models._data_types import _CAT_DTYPES
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
# Module-level thresholds (documented so callers can see what drives flags)
# ---------------------------------------------------------------------------

# A category is "rare" when it covers less than this share of rows.
_RARE_THRESHOLD_PCT: float = 0.01

# Mixed-type flag: minimum minority share (numeric vs non-numeric values)
# required to raise the flag, and the z-score (~95 % confidence) used for
# the Wilson lower bound of that share in _check_mixed_type.
_MIXED_TYPE_MIN_MINOR_PCT: float = 0.05
_MIXED_TYPE_Z_SCORE: float = 1.96

# A column is "near constant" when its mode covers more than this share.
_NEAR_CONSTANT_THRESHOLD: float = 0.90
class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
    """
    Categorical profiler for Polars DataFrames.

    Columns to analyse are supplied per call to :meth:`profile`; the
    profiler intersects that list with the DataFrame's actual columns at
    runtime and keeps only columns that look categorical (dtype in
    ``_CAT_DTYPES``, or an explicit ``SemanticType.Categorical`` override
    in the config).

    Parameters
    ----------
    config : ProfileConfig | None
        Shared profiling configuration (column overrides, chunk_size, etc.).

    Usage
    -----
    >>> profiler = CategoricalProfiler()
    >>> result = profiler.profile(
    ...     df, columns=["status", "country", "product_type"]
    ... )
    >>> print(result)
    """

    def __init__(
        self,
        config: ProfileConfig | None = None,
    ) -> None:
        super().__init__(config)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str],
    ) -> CategoricalProfileResult:
        """Profile the requested *columns* of *data* and return the result."""
        return self._run(data, columns)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _eligible(
        self,
        series: pl.Series,
    ) -> bool:
        """Return True when *series* should be treated as categorical."""
        # An explicit per-column override wins in both directions.
        override = self.config.column_overrides.get(series.name)
        if override == SemanticType.Categorical:
            return True

        if override is not None:
            return False

        return series.dtype in _CAT_DTYPES

    def _run(
        self,
        df: pl.DataFrame,
        columns: list[str],
    ) -> CategoricalProfileResult:
        """Resolve eligible columns and profile each one."""
        result = CategoricalProfileResult()

        # Resolve requested columns against the actual schema, keeping
        # only those eligible for categorical profiling.
        available = [
            c
            for c in self._resolve_columns(df.columns, columns)
            if self._eligible(df[c])
        ]
        result.analysed_columns = available

        n_rows = df.height

        for col_name in available:
            series = df[col_name]
            result.columns[col_name] = self._profile_column(
                series, col_name, n_rows
            )

        return result

    # ------------------------------------------------------------------
    # Per-column driver
    # ------------------------------------------------------------------

    def _profile_column(
        self,
        series: pl.Series,
        col_name: str,
        n_rows: int,
    ) -> CategoricalStats:
        """Compute the full CategoricalStats for one column."""
        profile = CategoricalStats()

        # Cast to String for uniform downstream treatment.
        str_series = series.cast(pl.Utf8, strict=False)

        # Step 1: cardinality & unique ratio.
        self._compute_cardinality(str_series, profile, n_rows)

        # Step 2: value distribution (top values, rare categories,
        # imbalance metrics).
        self._compute_value_distribution(str_series, profile, n_rows)

        # Step 3: mixed-type flag. TypeDetector already knows whether the
        # column was numeric-coerced; here we detect columns that are
        # *partly* numeric and partly not — a different (and more
        # expensive) check.
        self._check_mixed_type(str_series, profile)

        return profile

    # ------------------------------------------------------------------
    # Step 1: Cardinality
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_cardinality(
        series: pl.Series,
        profile: CategoricalStats,
        n_rows: int,
    ) -> None:
        """Populate cardinality and unique_ratio (nulls excluded)."""
        cardinality = series.drop_nulls().n_unique()
        profile.cardinality = cardinality
        profile.unique_ratio = cardinality / n_rows if n_rows > 0 else 0.0

    # ------------------------------------------------------------------
    # Step 2: Value distribution
    # ------------------------------------------------------------------

    def _compute_value_distribution(
        self,
        series: pl.Series,
        profile: CategoricalStats,
        n_rows: int,
    ) -> pl.DataFrame:
        """
        Build the value-count frame and populate top values, rare-category
        stats, and imbalance metrics.

        Returns the full value-count DataFrame for possible reuse.
        """
        # Exclude nulls, whitespace-only values, and common textual null
        # sentinels from distribution stats.
        clean = series.filter(
            ~series.is_null()
            & (series.str.strip_chars() != "")
            & ~series.str.to_uppercase().is_in(["NA", "NAN", "NULL", "NONE", "?"])
        )

        if clean.len() == 0:
            return pl.DataFrame({"value": [], "count": []})

        # Sorted descending by count; polars names the count column
        # "count" and the value column after the series itself.
        vc = clean.value_counts(sort=True)
        value_col = series.name

        # --- Top-10 values ---
        top_rows = min(10, vc.height)
        profile.top_values = [
            TopValueEntry(
                value=vc[value_col][i],
                count=int(vc["count"][i]),
                percentage=int(vc["count"][i]) / n_rows if n_rows > 0 else 0.0,
            )
            for i in range(top_rows)
        ]

        # vc is non-empty here, so top_values[0] always exists.
        profile.mode_frequency = profile.top_values[0].percentage
        if profile.mode_frequency > _NEAR_CONSTANT_THRESHOLD:
            profile.flags.append(CategoricalFlag.NearConstant)

        # --- Rare category analysis (< _RARE_THRESHOLD_PCT of rows) ---
        rare_threshold_abs = max(1, math.floor(_RARE_THRESHOLD_PCT * n_rows))
        rare_mask = vc["count"] < rare_threshold_abs
        rare_rows = vc.filter(rare_mask)

        profile.rare_categories = RareCategoryStats(
            threshold_pct=_RARE_THRESHOLD_PCT,
            rare_category_count=rare_rows.height,
            total_rare_rows=(
                int(rare_rows["count"].sum()) if rare_rows.height > 0 else 0
            ),
        )
        profile.rare_categories.rare_row_percentage = (
            profile.rare_categories.total_rare_rows / n_rows if n_rows > 0 else 0.0
        )

        # --- Imbalance metrics ---
        # Class Ratio -> raw distribution skew (max_freq / min_freq)
        # Entropy     -> randomness / information content
        # Gini        -> impurity / misclassification risk
        counts = vc["count"].cast(pl.Float64)
        total = float(counts.sum())
        if total > 0:
            probs = counts / total
            max_freq = float(probs.max())  # type: ignore[arg-type]
            min_freq = float(probs.min())  # type: ignore[arg-type]

            class_ratio = max_freq / min_freq if min_freq > 0 else float("inf")
            entropy = float(-(probs * probs.log(base=2)).fill_nan(0.0).sum())
            gini = float(1.0 - (probs**2).sum())

            profile.imbalance = ImbalanceMetrics(
                class_ratio=class_ratio,
                shannon_entropy=entropy,
                gini_impurity=gini,
            )

        return vc

    # ------------------------------------------------------------------
    # Step 3: Mixed-type flag
    # ------------------------------------------------------------------

    @staticmethod
    def _check_mixed_type(
        series: pl.Series,
        profile: CategoricalStats,
    ) -> None:
        """
        Flag columns that contain both numeric-looking and non-numeric
        values.

        A non-strict cast to Float64 counts how many values parse as
        numbers. The MixedType flag is raised only when the Wilson score
        lower bound (z = _MIXED_TYPE_Z_SCORE, ~95 % confidence) of the
        minority share is at least _MIXED_TYPE_MIN_MINOR_PCT, so a
        handful of stray values in a large column does not trigger it.
        """
        non_null = series.drop_nulls()
        n_total = non_null.len()

        if n_total == 0:
            return

        # Values that fail to parse as a float become null here.
        numeric_cast = non_null.cast(pl.Float64, strict=False)

        n_numeric = n_total - numeric_cast.null_count()
        n_non_numeric = n_total - n_numeric

        # Homogeneous column (all numeric or none): nothing to flag.
        if n_numeric == 0 or n_non_numeric == 0:
            return

        n_minority = min(n_numeric, n_non_numeric)
        p_minority = n_minority / n_total

        # Wilson score interval lower bound for the minority proportion.
        z = _MIXED_TYPE_Z_SCORE
        denominator = 1 + (z**2) / n_total
        center = p_minority + (z**2) / (2 * n_total)
        spread = z * math.sqrt(
            (p_minority * (1 - p_minority) + (z**2) / (4 * n_total)) / n_total
        )

        lower_bound = (center - spread) / denominator

        if lower_bound >= _MIXED_TYPE_MIN_MINOR_PCT:
            profile.flags.append(CategoricalFlag.MixedType)
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Result dataclasses for categorical column profiling.
|
|
3
|
+
|
|
4
|
+
These complement TabularProfileResult and are populated by
|
|
5
|
+
CategoricalProfiler, which is opt-in via ProfileConfig.categorical_columns.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from enum import StrEnum
|
|
12
|
+
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
# Categorical stats dataclasses (canonical home — config.py re-exports these)
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CategoricalFlag(StrEnum):
|
|
19
|
+
MixedType = "mixed_type"
|
|
20
|
+
FreeText = "free_text"
|
|
21
|
+
NearConstant = "near_constant"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class TopValueEntry:
    """One row of a column's top-value table."""

    value: object      # the raw category value
    count: int         # absolute occurrence count
    percentage: float  # count divided by the total number of rows
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class RareCategoryStats:
    """Summary of categories occurring below the rarity threshold."""

    threshold_pct: float              # frequency cut-off, e.g. 0.01 for 1 %
    rare_category_count: int = 0      # number of distinct rare categories
    total_rare_rows: int = 0          # rows covered by rare categories
    rare_row_percentage: float = 0.0  # total_rare_rows / n_rows
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class ImbalanceMetrics:
    """Class-balance measures over a column's value distribution."""

    class_ratio: float = 0.0      # max_freq / min_freq
    shannon_entropy: float = 0.0  # Shannon entropy of value frequencies
    gini_impurity: float = 0.0    # 1 - sum(p_i ** 2)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
|
|
47
|
+
class CategoricalStats:
|
|
48
|
+
cardinality: int = 0
|
|
49
|
+
unique_ratio: float = 0.0
|
|
50
|
+
mode_frequency: float = 0.0
|
|
51
|
+
top_values: list[TopValueEntry] = field(default_factory=list)
|
|
52
|
+
rare_categories: RareCategoryStats = field(
|
|
53
|
+
default_factory=lambda: RareCategoryStats(threshold_pct=0.01),
|
|
54
|
+
)
|
|
55
|
+
imbalance: ImbalanceMetrics = field(default_factory=ImbalanceMetrics)
|
|
56
|
+
flags: list[CategoricalFlag] = field(default_factory=list)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
CategoricalColumnProfile = CategoricalStats
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
# Top-level result
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class CategoricalProfileResult:
    """
    Categorical profile covering every opted-in column.

    Attributes
    ----------
    columns : dict[str, CategoricalStats]
        Per-column profiles, keyed by column name.
    analysed_columns : list[str]
        Columns that were actually profiled (after schema intersection).
    """

    columns: dict[str, CategoricalStats] = field(default_factory=dict)
    analysed_columns: list[str] = field(default_factory=list)

    def __str__(self) -> str:  # pragma: no cover
        parts = ["=== Categorical Profile ==="]
        parts.extend(str(col_profile) for col_profile in self.columns.values())
        return "\n".join(parts)
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Result dataclasses for correlation and information-structure profiling.
|
|
3
|
+
|
|
4
|
+
Populated by CorrelationProfiler, which is opt-in via
|
|
5
|
+
ProfileConfig.correlation_target_column (and implicitly by passing
|
|
6
|
+
numeric/categorical column lists that are already resolved upstream).
|
|
7
|
+
|
|
8
|
+
Design notes
|
|
9
|
+
------------
|
|
10
|
+
- Pearson matrix : linear relationships between numeric columns.
|
|
11
|
+
- Spearman matrix : monotonic (rank-based) relationships; robust to
|
|
12
|
+
outliers and non-linearity.
|
|
13
|
+
- Near-redundancy : any pair with |r| > 0.95 flagged — identical signal,
|
|
14
|
+
one should be dropped before modelling.
|
|
15
|
+
- Feature–target : Pearson for numeric target, ANOVA F / eta² for
|
|
16
|
+
categorical target. Top-10 reported.
|
|
17
|
+
- Mutual information: MI for all features vs target (classif or regression).
|
|
18
|
+
Captures non-linear dependencies correlation misses.
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from enum import StrEnum
|
|
24
|
+
from typing import Optional
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Enums
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
class CorrelationMethod(StrEnum):
|
|
32
|
+
Pearson = "pearson"
|
|
33
|
+
Spearman = "spearman"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class TargetType(StrEnum):
|
|
37
|
+
Numeric = "numeric" # numeric target → Pearson + MI regression
|
|
38
|
+
Categorical = "categorical" # categorical target → ANOVA/eta² + MI classif
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Pairwise correlation result
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
@dataclass
class CorrelationPair:
    """
    One entry of the pairwise correlation results.

    ``col_a`` sorts before ``col_b`` lexicographically, so each unordered
    pair appears exactly once. Both correlation coefficients are ``None``
    when fewer than 3 non-null paired observations were available.
    ``near_redundant`` is True when max(|pearson_r|, |spearman_r|)
    exceeds the redundancy threshold (default 0.95).
    """

    col_a: str
    col_b: str
    pearson_r: float | None = None   # linear correlation, if computable
    spearman_r: float | None = None  # rank correlation, if computable
    near_redundant: bool = False     # flagged above the |r| threshold
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# Feature–target entries
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
@dataclass
class NumericTargetCorrelation:
    """
    Pearson r between a single numeric feature and a numeric target.

    ``pearson_r`` is ``None`` when it could not be computed.
    """

    feature: str
    pearson_r: float | None = None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class CategoricalTargetCorrelation:
    """
    ANOVA-based association between one categorical feature and a numeric
    target (or a numeric feature vs a categorical target when the roles
    are reversed — see CorrelationProfiler docs).

    ``f_statistic`` is the one-way ANOVA F-statistic (higher F means
    stronger group separation) and ``p_value`` its significance.
    ``eta_squared`` is the effect size SS_between / SS_total in [0, 1];
    rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
    """

    feature: str
    f_statistic: float | None = None
    p_value: float | None = None
    eta_squared: float | None = None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ---------------------------------------------------------------------------
|
|
113
|
+
# Mutual information
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
|
|
116
|
+
@dataclass
class MutualInformationEntry:
    """
    Mutual-information score of a single feature against the target.

    ``mi_score`` is the raw MI value (nats, sklearn default); it is only
    meaningful for rank ordering *within* one dataset, not for comparing
    across datasets. ``rank`` is 1 for the most informative feature.
    """

    feature: str
    mi_score: float = 0.0  # raw MI; compare only within this dataset
    rank: int = 0          # 1 = highest MI
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ---------------------------------------------------------------------------
|
|
136
|
+
# Near-redundancy summary
|
|
137
|
+
# ---------------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
@dataclass
class NearRedundancyGroup:
    """
    A cluster of columns that are pairwise near-redundant.

    Every pair inside the group exceeds the |r| > 0.95 threshold.
    ``suggested_drop`` lists all members except the alphabetically first
    one — a simple, deterministic keep-one heuristic.
    """

    columns: list[str] = field(default_factory=list)
    suggested_drop: list[str] = field(default_factory=list)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# Top-level result
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
@dataclass
class CorrelationProfileResult:
    """
    Full correlation and information-structure profile.

    Attributes
    ----------
    analysed_numeric_columns : list[str]
        Numeric columns actually included in the pairwise matrices.
    pearson_matrix, spearman_matrix : dict[str, dict[str, float]]
        Full symmetric correlation matrices (numeric columns only).
    pairwise : list[CorrelationPair]
        Every (col_a, col_b) pair with its Pearson and Spearman r.
    near_redundant_pairs : list[CorrelationPair]
        Subset of ``pairwise`` whose near_redundant flag is set.
    near_redundancy_groups : list[NearRedundancyGroup]
        Union-find clusters of near-redundant columns.
    target_column : str | None
        Caller-supplied target; when None only pairwise matrices exist.
    target_type : TargetType | None
        Numeric or categorical treatment of the target.
    feature_target_numeric : list[NumericTargetCorrelation]
        Populated for a numeric target; top-10 by |Pearson r|.
    feature_target_categorical : list[CategoricalTargetCorrelation]
        Populated for a categorical target; top-10 by eta².
    mutual_information : list[MutualInformationEntry]
        All features ranked by MI vs target; empty when no target.
    """

    # Column scope
    analysed_numeric_columns: list[str] = field(default_factory=list)

    # Pairwise matrices
    pearson_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
    spearman_matrix: dict[str, dict[str, float]] = field(default_factory=dict)

    # Pairwise summaries
    pairwise: list[CorrelationPair] = field(default_factory=list)
    near_redundant_pairs: list[CorrelationPair] = field(default_factory=list)
    near_redundancy_groups: list[NearRedundancyGroup] = field(default_factory=list)

    # Target info
    target_column: str | None = None
    target_type: TargetType | None = None

    # Feature–target correlations (top-10 each)
    feature_target_numeric: list[NumericTargetCorrelation] = field(default_factory=list)
    feature_target_categorical: list[CategoricalTargetCorrelation] = field(default_factory=list)

    # Mutual information (all features, ranked)
    mutual_information: list[MutualInformationEntry] = field(default_factory=list)

    # ------------------------------------------------------------------
    # Convenience helpers
    # ------------------------------------------------------------------

    def top_mi(self, n: int = 10) -> list[MutualInformationEntry]:
        """Return the top-n features by mutual information score."""
        return self.mutual_information[:n]

    def get_pearson(self, col_a: str, col_b: str) -> float | None:
        """Look up Pearson r for a column pair; None when absent."""
        row = self.pearson_matrix.get(col_a)
        return None if row is None else row.get(col_b)

    def get_spearman(self, col_a: str, col_b: str) -> float | None:
        """Look up Spearman r for a column pair; None when absent."""
        row = self.spearman_matrix.get(col_a)
        return None if row is None else row.get(col_b)
|