PyPI - dataforge-ml - Versions diffs - 0.5.0__tar.gz → 0.7.0__tar.gz - Mend

dataforge-ml 0.5.0.tar.gz → 0.7.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.5.0
+Version: 0.7.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "0.5.0"
+version = "0.7.0"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">=3.10"

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_profiler.py RENAMED Viewed

@@ -84,19 +84,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
         if override is not None:
             return False
-        # Native boolean dtype
-        if series.dtype == pl.Boolean:
-            return True
-        # Integer {0, 1} column — check after dropping nulls
-        if series.dtype in _INT_DTYPES:
-            clean = series.drop_nulls()
-            if clean.len() == 0:
-                return False
-            unique_vals = set(clean.unique().to_list())
-            return unique_vals <= {0, 1}
-        return False
+        return True
     # ------------------------------------------------------------------
     # Orchestration

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical.py RENAMED Viewed

@@ -49,7 +49,6 @@ from .config import (
     ProfileConfig,
     SemanticType,
 )
-from ..models._data_types import _CAT_DTYPES
 # ---------------------------------------------------------------------------
 # Module-level thresholds (documented so callers can see what drives flags)
@@ -115,7 +114,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
         if override is not None:
             return False
-        return series.dtype in _CAT_DTYPES
+        return True
     def _run(
         self,

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_config.py RENAMED Viewed

@@ -7,41 +7,41 @@ numeric/categorical column lists that are already resolved upstream).
 Design notes
 ------------
-- Pearson matrix   : linear relationships between numeric columns.
-- Spearman matrix  : monotonic (rank-based) relationships; robust to
-                     outliers and non-linearity.
-- Near-redundancy  : any pair with |r| > 0.95 flagged — identical signal,
-                     one should be dropped before modelling.
-- Feature–target   : Pearson for numeric target, ANOVA F / eta² for
-                     categorical target.  Top-10 reported.
-- Mutual information: MI for all features vs target (classif or regression).
-                     Captures non-linear dependencies correlation misses.
+- Pearson / Spearman : linear / monotonic relationships between numeric columns.
+- Cramér's V         : association between categorical column pairs [0, 1].
+- Eta-squared        : numeric-categorical association via ANOVA [0, 1].
+- Near-redundancy    : Pearson/Spearman |r| > 0.95, Cramér's V > 0.80,
+                       or eta² > 0.50 flagged — near-identical signal.
+- Feature–target     : Pearson (numeric target), ANOVA/eta² (categorical target).
+- Mutual information : MI for all features vs target (classif or regression).
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
 from enum import StrEnum
 from typing import Optional
 # ---------------------------------------------------------------------------
 # Enums
 # ---------------------------------------------------------------------------
 class CorrelationMethod(StrEnum):
-    Pearson  = "pearson"
+    Pearson = "pearson"
     Spearman = "spearman"
 class TargetType(StrEnum):
-    Numeric      = "numeric"      # numeric target  → Pearson + MI regression
-    Categorical  = "categorical"  # categorical target → ANOVA/eta² + MI classif
+    Numeric = "numeric"  # numeric target  → Pearson + MI regression
+    Categorical = "categorical"  # categorical target → ANOVA/eta² + MI classif
 # ---------------------------------------------------------------------------
 # Pairwise correlation result
 # ---------------------------------------------------------------------------
 @dataclass
 class CorrelationPair:
     """
@@ -62,14 +62,74 @@ class CorrelationPair:
     col_a: str
     col_b: str
-    pearson_r:  Optional[float] = None
+    pearson_r: Optional[float] = None
     spearman_r: Optional[float] = None
     near_redundant: bool = False
     def to_dict(self) -> dict:
         return {
-            "col_a": self.col_a, "col_b": self.col_b,
-            "pearson_r": self.pearson_r, "spearman_r": self.spearman_r,
+            "col_a": self.col_a,
+            "col_b": self.col_b,
+            "pearson_r": self.pearson_r,
+            "spearman_r": self.spearman_r,
+            "near_redundant": self.near_redundant,
+        }
+@dataclass
+class CramerVPair:
+    """
+    Cramér's V association between two categorical columns.
+    Attributes
+    ----------
+    col_a, col_b : str
+    cramer_v : float | None
+        Cramér's V in [0, 1]. None when computation fails or sample too small.
+    near_redundant : bool
+        True when cramer_v exceeds the near-redundancy threshold (default 0.80).
+    """
+    col_a: str = ""
+    col_b: str = ""
+    cramer_v: Optional[float] = None
+    near_redundant: bool = False
+    def to_dict(self) -> dict:
+        return {
+            "col_a": self.col_a,
+            "col_b": self.col_b,
+            "cramer_v": self.cramer_v,
+            "near_redundant": self.near_redundant,
+        }
+@dataclass
+class EtaSquaredPair:
+    """
+    Eta-squared (η²) association between a numeric and a categorical column.
+    Attributes
+    ----------
+    numeric_col : str
+    categorical_col : str
+    eta_squared : float | None
+        Effect size in [0, 1]. None when computation fails.
+        Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
+    near_redundant : bool
+        True when eta_squared exceeds the near-redundancy threshold (default 0.50).
+    """
+    numeric_col: str = ""
+    categorical_col: str = ""
+    eta_squared: Optional[float] = None
+    near_redundant: bool = False
+    def to_dict(self) -> dict:
+        return {
+            "numeric_col": self.numeric_col,
+            "categorical_col": self.categorical_col,
+            "eta_squared": self.eta_squared,
             "near_redundant": self.near_redundant,
         }
@@ -78,6 +138,7 @@ class CorrelationPair:
 # Feature–target entries
 # ---------------------------------------------------------------------------
 @dataclass
 class NumericTargetCorrelation:
     """
@@ -88,7 +149,8 @@ class NumericTargetCorrelation:
     feature : str
     pearson_r : float | None
     """
-    feature:   str
+    feature: str
     pearson_r: Optional[float] = None
     def to_dict(self) -> dict:
@@ -113,15 +175,18 @@ class CategoricalTargetCorrelation:
         Effect size: SS_between / SS_total.  Ranges [0, 1].
         Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
     """
-    feature:     str
+    feature: str
     f_statistic: Optional[float] = None
-    p_value:     Optional[float] = None
+    p_value: Optional[float] = None
     eta_squared: Optional[float] = None
     def to_dict(self) -> dict:
         return {
-            "feature": self.feature, "f_statistic": self.f_statistic,
-            "p_value": self.p_value, "eta_squared": self.eta_squared,
+            "feature": self.feature,
+            "f_statistic": self.f_statistic,
+            "p_value": self.p_value,
+            "eta_squared": self.eta_squared,
         }
@@ -129,6 +194,7 @@ class CategoricalTargetCorrelation:
 # Mutual information
 # ---------------------------------------------------------------------------
 @dataclass
 class MutualInformationEntry:
     """
@@ -143,9 +209,10 @@ class MutualInformationEntry:
     rank : int
         1 = highest MI (most informative).
     """
-    feature:  str
+    feature: str
     mi_score: float = 0.0
-    rank:     int   = 0
+    rank: int = 0
     def to_dict(self) -> dict:
         return {"feature": self.feature, "mi_score": self.mi_score, "rank": self.rank}
@@ -155,6 +222,7 @@ class MutualInformationEntry:
 # Near-redundancy summary
 # ---------------------------------------------------------------------------
 @dataclass
 class NearRedundancyGroup:
     """
@@ -164,17 +232,22 @@ class NearRedundancyGroup:
     The suggested_drop list contains every column except the first
     alphabetically — a simple, deterministic heuristic.
     """
-    columns:       list[str] = field(default_factory=list)
+    columns: list[str] = field(default_factory=list)
     suggested_drop: list[str] = field(default_factory=list)
     def to_dict(self) -> dict:
-        return {"columns": list(self.columns), "suggested_drop": list(self.suggested_drop)}
+        return {
+            "columns": list(self.columns),
+            "suggested_drop": list(self.suggested_drop),
+        }
 # ---------------------------------------------------------------------------
 # Top-level result
 # ---------------------------------------------------------------------------
 @dataclass
 class CorrelationProfileResult:
     """
@@ -211,23 +284,34 @@ class CorrelationProfileResult:
     # Column scope
     analysed_numeric_columns: list[str] = field(default_factory=list)
+    analysed_categorical_columns: list[str] = field(default_factory=list)
     # Pairwise matrices
-    pearson_matrix:  dict[str, dict[str, float]] = field(default_factory=dict)
+    pearson_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
     spearman_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
-    # Pairwise summaries
-    pairwise:             list[CorrelationPair] = field(default_factory=list)
+    # Pairwise summaries — numeric ↔ numeric
+    pairwise: list[CorrelationPair] = field(default_factory=list)
     near_redundant_pairs: list[CorrelationPair] = field(default_factory=list)
     near_redundancy_groups: list[NearRedundancyGroup] = field(default_factory=list)
+    # Pairwise summaries — categorical ↔ categorical (Cramér's V)
+    cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
+    near_redundant_cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
+    # Pairwise summaries — numeric ↔ categorical (eta-squared)
+    eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
+    near_redundant_eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
     # Target info
-    target_column: Optional[str]       = None
-    target_type:   Optional[TargetType] = None
+    target_column: Optional[str] = None
+    target_type: Optional[TargetType] = None
     # Feature–target correlations (top-10 each)
-    feature_target_numeric:      list[NumericTargetCorrelation]      = field(default_factory=list)
-    feature_target_categorical:  list[CategoricalTargetCorrelation]  = field(default_factory=list)
+    feature_target_numeric: list[NumericTargetCorrelation] = field(default_factory=list)
+    feature_target_categorical: list[CategoricalTargetCorrelation] = field(
+        default_factory=list
+    )
     # Mutual information (all features, ranked)
     mutual_information: list[MutualInformationEntry] = field(default_factory=list)
@@ -249,14 +333,29 @@ class CorrelationProfileResult:
     def to_dict(self) -> dict:
         return {
             "analysed_numeric_columns": list(self.analysed_numeric_columns),
+            "analysed_categorical_columns": list(self.analysed_categorical_columns),
             "pearson_matrix": {k: dict(v) for k, v in self.pearson_matrix.items()},
             "spearman_matrix": {k: dict(v) for k, v in self.spearman_matrix.items()},
             "pairwise": [p.to_dict() for p in self.pairwise],
             "near_redundant_pairs": [p.to_dict() for p in self.near_redundant_pairs],
-            "near_redundancy_groups": [g.to_dict() for g in self.near_redundancy_groups],
+            "near_redundancy_groups": [
+                g.to_dict() for g in self.near_redundancy_groups
+            ],
+            "cramer_v_pairs": [p.to_dict() for p in self.cramer_v_pairs],
+            "near_redundant_cramer_v_pairs": [
+                p.to_dict() for p in self.near_redundant_cramer_v_pairs
+            ],
+            "eta_squared_pairs": [p.to_dict() for p in self.eta_squared_pairs],
+            "near_redundant_eta_squared_pairs": [
+                p.to_dict() for p in self.near_redundant_eta_squared_pairs
+            ],
             "target_column": self.target_column,
             "target_type": str(self.target_type) if self.target_type else None,
-            "feature_target_numeric": [f.to_dict() for f in self.feature_target_numeric],
-            "feature_target_categorical": [f.to_dict() for f in self.feature_target_categorical],
+            "feature_target_numeric": [
+                f.to_dict() for f in self.feature_target_numeric
+            ],
+            "feature_target_categorical": [
+                f.to_dict() for f in self.feature_target_categorical
+            ],
             "mutual_information": [m.to_dict() for m in self.mutual_information],
         }

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_profiler.py RENAMED Viewed

@@ -47,6 +47,8 @@ from ._correlation_config import (
     CategoricalTargetCorrelation,
     CorrelationPair,
     CorrelationProfileResult,
+    CramerVPair,
+    EtaSquaredPair,
     MutualInformationEntry,
     NearRedundancyGroup,
     NumericTargetCorrelation,
@@ -55,6 +57,8 @@ from ._correlation_config import (
 from ..models._data_types import _NUMERIC_DTYPES, _INT_DTYPES
 _NEAR_REDUNDANT_THRESHOLD: float = 0.95
+_NEAR_REDUNDANT_CRAMER_V_THRESHOLD: float = 0.80
+_NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD: float = 0.50
 _TOP_N_FEATURE_TARGET: int = 10
 _MI_N_NEIGHBORS: int = 3
 _MI_MIN_ROWS: int = 10  # min complete-case rows for a meaningful k-NN MI estimate
@@ -142,13 +146,14 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
         self,
         df: pl.DataFrame,
         numeric_cols: list[str],
+        categorical_cols: Optional[list[str]] = None,
     ) -> CorrelationProfileResult:
         """
         Compute pairwise feature-feature correlation matrices.
-        Pearson + Spearman matrices and near-redundancy groups are filled.
-        All target-specific fields are left at their defaults (empty lists /
-        None).  Call profile_target() separately for each target column.
+        Pearson + Spearman for numeric pairs, Cramér's V for categorical pairs,
+        eta-squared for numeric-categorical pairs.  All target-specific fields
+        are left at their defaults.  Call profile_target() for target analysis.
         """
         result = CorrelationProfileResult()
@@ -159,6 +164,9 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
         ]
         result.analysed_numeric_columns = resolved_numeric
+        resolved_categorical = [c for c in (categorical_cols or []) if c in df.columns]
+        result.analysed_categorical_columns = resolved_categorical
         if len(resolved_numeric) >= 2:
             pearson_mat, spearman_mat = self._compute_matrices(df, resolved_numeric)
             result.pearson_matrix = pearson_mat
@@ -171,6 +179,22 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
                 result.near_redundant_pairs
             )
+        if len(resolved_categorical) >= 2:
+            result.cramer_v_pairs = self._compute_cramer_v_pairs(
+                df, resolved_categorical, _NEAR_REDUNDANT_CRAMER_V_THRESHOLD
+            )
+            result.near_redundant_cramer_v_pairs = [
+                p for p in result.cramer_v_pairs if p.near_redundant
+            ]
+        if resolved_numeric and resolved_categorical:
+            result.eta_squared_pairs = self._compute_eta_squared_pairs(
+                df, resolved_numeric, resolved_categorical, _NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD
+            )
+            result.near_redundant_eta_squared_pairs = [
+                p for p in result.eta_squared_pairs if p.near_redundant
+            ]
         return result
     def profile_target(
@@ -316,6 +340,153 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
             for members in uf.groups()
         ]
+    # ------------------------------------------------------------------
+    # Step 3b: Cramér's V — categorical ↔ categorical
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _compute_cramer_v_pairs(
+        df: pl.DataFrame,
+        cat_cols: list[str],
+        threshold: float,
+    ) -> list[CramerVPair]:
+        try:
+            from scipy.stats import chi2_contingency
+        except ImportError:
+            warnings.warn(
+                "scipy is required for Cramér's V. Install: pip install scipy",
+                stacklevel=3,
+            )
+            return []
+        import numpy as np
+        pairs: list[CramerVPair] = []
+        for col_a, col_b in itertools.combinations(cat_cols, 2):
+            pair_df = (
+                df.select([
+                    pl.col(col_a).cast(pl.Utf8, strict=False),
+                    pl.col(col_b).cast(pl.Utf8, strict=False),
+                ])
+                .drop_nulls()
+            )
+            n = pair_df.height
+            if n < 5:
+                pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
+                continue
+            counts = pair_df.group_by([col_a, col_b]).agg(pl.len().alias("count"))
+            a_unique = sorted(counts[col_a].unique().to_list())
+            b_unique = sorted(counts[col_b].unique().to_list())
+            if len(a_unique) < 2 or len(b_unique) < 2:
+                pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
+                continue
+            a_idx = {v: i for i, v in enumerate(a_unique)}
+            b_idx = {v: i for i, v in enumerate(b_unique)}
+            ct = np.zeros((len(a_unique), len(b_unique)), dtype=int)
+            for a_val, b_val, cnt in zip(
+                counts[col_a].to_list(),
+                counts[col_b].to_list(),
+                counts["count"].to_list(),
+            ):
+                ct[a_idx[a_val], b_idx[b_val]] = cnt
+            try:
+                chi2, _, _, _ = chi2_contingency(ct)
+                r, c = ct.shape
+                phi2 = chi2 / n
+                # Bergsma & Wicher (2013) bias correction
+                phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
+                r_corr = r - (r - 1) ** 2 / (n - 1)
+                c_corr = c - (c - 1) ** 2 / (n - 1)
+                denom = min(r_corr - 1, c_corr - 1)
+                if denom <= 0:
+                    # Near-saturated contingency table (n_unique ≈ n_rows):
+                    # bias correction collapses denominator; skip the pair.
+                    pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
+                    continue
+                v = float(np.sqrt(phi2_corr / denom))
+                v = max(0.0, min(1.0, v))
+            except Exception as exc:
+                warnings.warn(
+                    f"Cramér's V failed for ({col_a}, {col_b}): {exc}", stacklevel=3
+                )
+                pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
+                continue
+            pairs.append(CramerVPair(
+                col_a=col_a, col_b=col_b,
+                cramer_v=v,
+                near_redundant=v > threshold,
+            ))
+        return pairs
+    # ------------------------------------------------------------------
+    # Step 3c: Eta-squared — numeric ↔ categorical
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _compute_eta_squared_pairs(
+        df: pl.DataFrame,
+        numeric_cols: list[str],
+        cat_cols: list[str],
+        threshold: float,
+    ) -> list[EtaSquaredPair]:
+        try:
+            from scipy.stats import f_oneway
+        except ImportError:
+            warnings.warn(
+                "scipy is required for eta-squared. Install: pip install scipy",
+                stacklevel=3,
+            )
+            return []
+        pairs: list[EtaSquaredPair] = []
+        for num_col in numeric_cols:
+            feat = df[num_col].cast(pl.Float64)
+            valid_feat = feat.drop_nulls()
+            if valid_feat.len() == 0:
+                continue
+            grand_mean = float(valid_feat.mean())  # type: ignore[arg-type]
+            ss_total = float(((valid_feat - grand_mean) ** 2).sum() or 0.0)
+            for cat_col in cat_cols:
+                target = df[cat_col]
+                categories = target.drop_nulls().unique().to_list()
+                groups = [
+                    feat.filter(target == cat).drop_nulls().to_numpy()
+                    for cat in categories
+                ]
+                non_empty = [g for g in groups if len(g) > 0]
+                if len(non_empty) < 2:
+                    pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
+                    continue
+                try:
+                    f_oneway(*non_empty)
+                    ss_between = sum(
+                        len(g) * (float(g.mean()) - grand_mean) ** 2
+                        for g in non_empty
+                    )
+                    eta_sq = ss_between / ss_total if ss_total > 0 else 0.0
+                    eta_sq = max(0.0, min(1.0, eta_sq))
+                except Exception as exc:
+                    warnings.warn(
+                        f"Eta-squared failed for ({num_col}, {cat_col}): {exc}",
+                        stacklevel=3,
+                    )
+                    pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
+                    continue
+                pairs.append(EtaSquaredPair(
+                    numeric_col=num_col, categorical_col=cat_col,
+                    eta_squared=eta_sq,
+                    near_redundant=eta_sq > threshold,
+                ))
+        return pairs
     # ------------------------------------------------------------------
     # Step 5a: Feature–target Pearson (unchanged)
     # ------------------------------------------------------------------

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_profiler.py RENAMED Viewed

@@ -207,7 +207,8 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
         profile.effective_null_ratio = eff_count / n_rows if n_rows else 0.0
         r = profile.effective_null_ratio
-        if r < _SEVERITY_MINOR:
+        if r < _SEVERITY_MINOR and r != 0:
             profile.severity = MissingSeverity.Minor
         elif r < _SEVERITY_MODERATE:
             profile.severity = MissingSeverity.Moderate

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_profiler.py RENAMED Viewed

@@ -50,7 +50,6 @@ from ._numeric_config import (
     NumericTopValueEntry,
     HistogramBin,
 )
-from ..models._data_types import _NUMERIC_DTYPES
 # ---------------------------------------------------------------------------
 # Thresholds (documented so callers can see what drives labels / flags)
@@ -119,7 +118,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
         if override is not None:
             return False
-        return series.dtype in _NUMERIC_DTYPES
+        return True
     def _run(
         self,
@@ -127,9 +126,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
         columns: list[str],
     ) -> NumericProfileResult:
         result = NumericProfileResult()
         n_rows = df.height
-        # Intersect requested columns with the actual schema
         available = [
             c
             for c in self._resolve_columns(df.columns, columns)
@@ -137,15 +135,78 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
         ]
         result.analysed_columns = available
-        for col_name in available:
-            series = df[col_name]
-            profile = self._profile_column(series, n_rows)
-            result.columns[col_name] = profile
+        if not available:
+            return result
+        # One df.select([...]) for all scalar stats across all columns so
+        # Polars can parallelise expression evaluation rather than running
+        # independent query plans per column.
+        exprs: list[pl.Expr] = []
+        for col in available:
+            c = pl.col(col).cast(pl.Float64, strict=False)
+            exprs.append(c.mean().alias(f"{col}__mean"))
+            exprs.append(c.median().alias(f"{col}__median"))
+            exprs.append(c.min().alias(f"{col}__min"))
+            exprs.append(c.max().alias(f"{col}__max"))
+            exprs.append(c.std(ddof=1).alias(f"{col}__std"))
+            for q in _QUANTILE_LEVELS:
+                exprs.append(
+                    c.quantile(q, interpolation="linear").alias(f"{col}__q{q}")
+                )
+        batch = df.select(exprs).row(0, named=True)
+        for col in available:
+            series = df[col]
+            f64 = series.cast(pl.Float64, strict=False)
+            clean = f64.drop_nulls()
+            profile = NumericStats()
+            if clean.len() == 0:
+                result.columns[col] = profile
+                continue
+            # Central tendency
+            mean = float(batch[f"{col}__mean"])
+            median = float(batch[f"{col}__median"])
+            profile.mean = mean
+            profile.median = median
+            if median == 0.0:
+                profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
+            else:
+                profile.mean_median_ratio = mean / median
+            # Range
+            profile.min = float(batch[f"{col}__min"])
+            profile.max = float(batch[f"{col}__max"])
+            # Spread — Polars returns null for std with ddof=1 on a single row
+            std_val = batch[f"{col}__std"]
+            profile.std = float(std_val) if std_val is not None else 0.0
+            profile.variance = profile.std ** 2
+            # Percentiles
+            q_vals = [batch[f"{col}__q{q}"] for q in _QUANTILE_LEVELS]
+            profile.percentiles = PercentileSnapshot(
+                p1=q_vals[0], p5=q_vals[1], p25=q_vals[2], p50=q_vals[3],
+                p75=q_vals[4], p95=q_vals[5], p99=q_vals[6],
+            )
+            # Frequency / distribution stays per-column (returns a frame, not a scalar)
+            self._compute_frequency_and_distribution(series, clean, profile, n_rows)
+            # Shape stays per-column (delegates to scipy on a numpy array)
+            self._compute_shape(clean, profile)
+            self._check_scale_anomaly(profile)
+            result.columns[col] = profile
         return result
     # ------------------------------------------------------------------
-    # Per-column driver
+    # Per-column helpers (frequency/distribution and shape only —
+    # scalar stats are now batched in _run above)
     # ------------------------------------------------------------------
     @staticmethod
@@ -196,7 +257,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
             # --- 20-Bin Histogram Distribution (Continuous) ---
             import numpy as np
-            counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins=20)
+            counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
             profile.histogram = [
                 HistogramBin(
                     lower_bound=float(bin_edges[i]),
@@ -207,73 +268,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
                 for i in range(len(counts))
             ]
-    def _profile_column(
-        self,
-        series: pl.Series,
-        n_rows: int,
-    ) -> NumericStats:
-        profile = NumericStats()
-        f64 = series.cast(pl.Float64)
-        clean = f64.drop_nulls()
-        if clean.len() == 0:
-            return profile
-        self._compute_central_tendency(clean, profile)
-        self._compute_range(clean, profile)
-        self._compute_frequency_and_distribution(series, clean, profile, n_rows)
-        self._compute_percentiles(clean, profile)
-        self._compute_spread(clean, profile)
-        self._compute_shape(clean, profile)
-        self._check_scale_anomaly(profile)
-        return profile
     # ------------------------------------------------------------------
-    # Step 1: Central tendency
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_central_tendency(
-        clean: pl.Series,
-        profile: NumericStats,
-    ) -> None:
-        mean = float(clean.mean())  # type: ignore[arg-type]
-        median = float(clean.median())  # type: ignore[arg-type]
-        profile.mean = mean
-        profile.median = median
-        # Mean/median ratio: primary skew indicator at a glance.
-        # Guard against division by zero (e.g. a column of all zeros).
-        if median == 0.0:
-            profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
-        else:
-            profile.mean_median_ratio = mean / median
-    # ------------------------------------------------------------------
-    # Step 2: Spread
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_spread(
-        clean: pl.Series,
-        profile: NumericStats,
-    ) -> None:
-        n = clean.len()
-        if n < 2:
-            # Std / variance undefined for a single observation
-            profile.std = 0.0
-            profile.variance = 0.0
-            return
-        std = float(clean.std(ddof=1))  # type: ignore[arg-type]
-        profile.std = std
-        profile.variance = std**2
-    # ------------------------------------------------------------------
-    # Step 3: Shape — skewness and kurtosis
+    # Step 2: Shape — skewness and kurtosis
     # ------------------------------------------------------------------
     @staticmethod
@@ -315,48 +311,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
             profile.kurtosis_tag = KurtosisTag.Mesokurtic
     # ------------------------------------------------------------------
-    # Step 4: Range
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_range(
-        clean: pl.Series,
-        profile: NumericStats,
-    ) -> None:
-        profile.min = float(clean.min())  # type: ignore[arg-type]
-        profile.max = float(clean.max())  # type: ignore[arg-type]
-    # ------------------------------------------------------------------
-    # Step 5: Percentiles
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_percentiles(
-        clean: pl.Series,
-        profile: NumericStats,
-    ) -> None:
-        # Polars quantile() is O(n log n) once; compute all at once via select
-        # to avoid repeated passes.
-        quantile_frame = pl.DataFrame({"v": clean}).select(
-            [
-                pl.col("v").quantile(q, interpolation="linear").alias(f"q{i}")
-                for i, q in enumerate(_QUANTILE_LEVELS)
-            ]
-        )
-        row = quantile_frame.row(0)
-        # row order: p1, p5, p25, p50, p75, p95, p99
-        profile.percentiles = PercentileSnapshot(
-            p1=row[0],
-            p5=row[1],
-            p25=row[2],
-            p50=row[3],
-            p75=row[4],
-            p95=row[5],
-            p99=row[6],
-        )
-    # ------------------------------------------------------------------
-    # Step 6: Scale-anomaly flag
+    # Step 3: Scale-anomaly flag
     # ------------------------------------------------------------------
     @staticmethod

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_profiler.py RENAMED Viewed

@@ -148,9 +148,11 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
         """Generates numeric metrics and checks for target skewness."""
         num_profiler = NumericProfiler(config=self.config)
-        num_profile = num_profiler._profile_column(series, n_rows)
+        col_name = series.name
+        num_result = num_profiler.profile(series.to_frame(), [col_name])
+        num_profile = num_result.columns.get(col_name)
         result.numeric_profile = num_profile
         # Flag Skewness (Highly skewed targets often require Log/Yeo-Johnson transforms)
-        if num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
+        if num_profile and num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
             result.flags.append(TargetFlag.HighlySkewed)

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_type_detector.py RENAMED Viewed

@@ -35,10 +35,11 @@ _IDENTIFIER_UNIQUE_RATIO = 0.99  # >99 % unique → identifier
 _IDENTIFIER_MAX_MEDIAN_LENGTH = 40
 _DISCRETE_NUNIQUE_THRESHOLD = 20  # numeric with <20 unique values → discrete
-_FREE_TEXT_AVG_WORDS: int = 5  # avg word count above which → Text
-_FREE_TEXT_MEDIAN_CHARS: int = 35
-_FREE_TEXT_P90_CHARS: int = 60
+_FREE_TEXT_AVG_WORDS: int = 3
+_FREE_TEXT_MEDIAN_CHARS: int = 20
+_FREE_TEXT_P90_CHARS: int = 35
 _FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40
+_FREE_TEXT_HIGH_UNIQUE_WITH_SPACES: float = 0.70  # unique ratio above which multi-token strings → Text
 # Common boolean string values (lowercased)
@@ -77,115 +78,87 @@ class TypeDetector:
                 original_dtype=original_dtype,
                 inferred_dtype=original_dtype,
             )
-            # Work with a copy that may be re-assigned after coercion
             working = series
             # 1 & 2: Coercion for string columns
-            if series.dtype == pl.Utf8 or series.dtype == pl.String:
+            if series.dtype in (pl.Utf8, pl.String):
                 coerced, flag = self._try_numeric_coerce(series, n_rows)
                 if coerced is not None:
                     info.inferred_dtype = str(coerced.dtype)
                     info.flags.append(flag)  # type: ignore[arg-type]
                     working = coerced
-                    self._check_coerced_encoded_category(working, info, n_rows)
+                    self._check_coerced_encoded_category(working, info)
                 else:
                     coerced_dt, flag_dt = self._try_datetime_coerce(
-                        series, col_name, n_rows
+                        series, n_rows
                     )
                     if coerced_dt is not None:
                         info.inferred_dtype = str(coerced_dt.dtype)
                         info.flags.append(flag_dt)  # type: ignore[arg-type]
-                        working = coerced_dt
                         info.semantic_type = SemanticType.Datetime
                         results[col_name] = info
                         continue
             # 3: Boolean candidate
             self._check_boolean_candidate(working, info)
+            if TypeFlag.BooleanCandidate in info.flags:
+                info.semantic_type = SemanticType.Boolean
+                results[col_name] = info
+                continue
+            # Native datetime types
+            if working.dtype in (pl.Date, pl.Datetime, pl.Duration, pl.Time) or isinstance(
+                working.dtype, pl.Datetime
+            ):
+                info.semantic_type = SemanticType.Datetime
+                results[col_name] = info
+                continue
-            # Work only on numeric-ish columns for the remaining checks
+            # 4–7: Numeric path
             if working.dtype in _NUMERIC_DTYPES:
-                # 4 & 5: Encoded category and identifier checks — integers only.
-                # Continuous floats have high cardinality by nature and are never
-                # identifiers; restricting these checks prevents false Identifier
-                # classification of genuine numeric features.
                 if working.dtype in _INT_DTYPES:
-                    self._check_encoded_category(working, info, n_rows)
-                    self._check_identifier(working, info, n_rows)
-                # 6: Sequential index (integers only)
-                if working.dtype in _INT_DTYPES or working.dtype in (
-                    pl.Float32,
-                    pl.Float64,
-                ):
-                    self._check_sequential_index(working, info, n_rows)
-                # 7: Numeric kind (skip for identifiers / sequential indices)
-                if not any(
-                    info.has_flag(f)
-                    for f in (
-                        TypeFlag.IdentifierColumn,
-                        TypeFlag.SequentialIndex,
-                        TypeFlag.FloatSequentialIndex,
-                    )
-                ):
+                    # EncodedCategory and IdentifierColumn are mutually exclusive:
+                    # low-cardinality and near-unique cannot both be true.
+                    # Check encoded category first; skip identifier if it matches.
+                    self._check_encoded_category(working, info)
+                    if TypeFlag.EncodedCategory not in info.flags:
+                        self._check_identifier(working, info, n_rows)
+                        if TypeFlag.IdentifierColumn in info.flags:
+                            self._check_sequential_index(working, info, n_rows)
+                if TypeFlag.EncodedCategory in info.flags:
+                    info.semantic_type = SemanticType.Categorical
+                elif TypeFlag.IdentifierColumn in info.flags:
+                    info.semantic_type = SemanticType.Identifier
+                else:
                     self._classify_numeric_kind(working, info)
+                    info.semantic_type = SemanticType.Numeric
-            elif working.dtype == pl.Utf8 or working.dtype == pl.String:
-                # String identifier check
-                self._check_identifier(working, info, n_rows)
+                results[col_name] = info
+                continue
+            # String path
+            if working.dtype in (pl.Utf8, pl.String):
                 self._check_free_text(working, info, n_rows)
-            info.semantic_type = self._derive_semantic_type(
-                info,
-                working,
-                n_rows,
-            )
+                if TypeFlag.FreeTextCandidate in info.flags:
+                    info.semantic_type = SemanticType.Text
+                    results[col_name] = info
+                    continue
+                self._check_identifier(working, info, n_rows)
+                info.semantic_type = (
+                    SemanticType.Identifier
+                    if TypeFlag.IdentifierColumn in info.flags
+                    else SemanticType.Categorical
+                )
+                results[col_name] = info
+                continue
+            # Fallback
+            info.semantic_type = SemanticType.Text
             results[col_name] = info
         return results
-    @staticmethod
-    def _derive_semantic_type(
-        info: ColumnTypeInfo,
-        working: pl.Series,
-        n_rows: int,
-    ) -> SemanticType:
-        if TypeFlag.IdentifierColumn in info.flags:
-            return SemanticType.Identifier
-        if TypeFlag.BooleanCandidate in info.flags:
-            return SemanticType.Boolean
-        is_native_datetime = working.dtype in (
-            pl.Date,
-            pl.Datetime,
-            pl.Duration,
-            pl.Time,
-        ) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))
-        if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
-            return SemanticType.Datetime
-        if TypeFlag.EncodedCategory in info.flags:
-            return SemanticType.Categorical
-        if working.dtype in (pl.Utf8, pl.String):
-            if TypeFlag.FreeTextCandidate in info.flags:
-                return SemanticType.Text
-            return SemanticType.Categorical
-        if working.dtype in _NUMERIC_DTYPES:
-            return SemanticType.Numeric
-        return SemanticType.Categorical
     # ------------------------------------------------------------------
     # Step 1: Numeric coercion
     # ------------------------------------------------------------------
@@ -221,7 +194,7 @@ class TypeDetector:
     @staticmethod
     def _try_datetime_coerce(
-        series: pl.Series, col_name: str, n_rows: int
+        series: pl.Series, n_rows: int
     ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
         """
         Attempt datetime coercion of the column's values (the column name is not an input).
@@ -269,7 +242,7 @@ class TypeDetector:
     @staticmethod
     def _check_coerced_encoded_category(
-        series: pl.Series, info: ColumnTypeInfo, n_rows: int
+        series: pl.Series, info: ColumnTypeInfo
     ) -> None:
         """
         Post-coercion low-cardinality check for Float64 series that originated
@@ -312,9 +285,8 @@ class TypeDetector:
     @staticmethod
     def _check_encoded_category(
-        series: pl.Series, info: ColumnTypeInfo, n_rows: int
+        series: pl.Series, info: ColumnTypeInfo
     ) -> None:
-        # Skip if already flagged as boolean candidate (subset of {0,1})
         if TypeFlag.BooleanCandidate in info.flags:
             return
@@ -357,16 +329,17 @@ class TypeDetector:
             return
         if series.dtype in (pl.Utf8, pl.String):
-            lengths = series.drop_nulls().str.len_chars()
-            if lengths.len() == 0:
+            non_null = series.drop_nulls()
+            if non_null.len() == 0:
                 return
-            median_length = lengths.median()
+            median_length = non_null.str.len_chars().median()
+            if median_length is not None and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH:
+                return
-            if (
-                median_length is not None
-                and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH
-            ):
+            # Real identifiers are single tokens — no spaces.
+            # Sentences and descriptions have median_spaces > 0.
+            if float(non_null.str.count_matches(r"\s+").median() or 0.0) > 0:
                 return
         info.flags.append(TypeFlag.IdentifierColumn)
@@ -440,24 +413,27 @@ class TypeDetector:
         char_lengths = non_null.str.len_chars()
         median_chars = float(char_lengths.median() or 0.0)
+        space_counts = non_null.str.count_matches(r"\s+")
+        median_spaces = float(space_counts.median() or 0.0)
+        median_words = median_spaces + 1.0
+        unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
-        if median_chars > _FREE_TEXT_MEDIAN_CHARS:
+        # Multi-word strings of medium length: names, addresses, short descriptions
+        if median_chars > _FREE_TEXT_MEDIAN_CHARS and median_spaces >= 1.0:
             info.flags.append(TypeFlag.FreeTextCandidate)
             return
-        space_counts = non_null.str.count_matches(r"\s+")
-        median_words = float(space_counts.median() or 0.0) + 1.0
+        # High median word count (median whitespace runs + 1): sentences, paragraphs
         if median_words > _FREE_TEXT_AVG_WORDS:
             info.flags.append(TypeFlag.FreeTextCandidate)
             return
         p90_chars = float(char_lengths.quantile(0.9) or 0.0)
+        if p90_chars > _FREE_TEXT_P90_CHARS and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO:
+            info.flags.append(TypeFlag.FreeTextCandidate)
+            return
-        unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
-        if (
-            p90_chars > _FREE_TEXT_P90_CHARS
-            and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
-        ):
+        # High-cardinality multi-token strings that don't meet char thresholds:
+        # e.g. short full names like "John Smith", compound tokens
+        if unique_ratio >= _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES and median_spaces >= 1.0:
             info.flags.append(TypeFlag.FreeTextCandidate)

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/structural.py RENAMED Viewed

@@ -199,7 +199,7 @@ class StructuralProfiler:
             # 8a. Feature-feature matrices — computed ONCE, target-independent.
             feature_corr = corr_profiler.profile_features(
-                data, numeric_cols
+                data, numeric_cols, categorical_cols
             )
             result.dataset.feature_correlation = feature_corr

{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.5.0
+Version: 0.7.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License