PyPI - dataforge-ml - Versions diffs - 0.4.0__tar.gz → 0.6.0__tar.gz - Mend

dataforge-ml 0.4.0 (tar.gz) → 0.6.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.4.0
+Version: 0.6.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License
@@ -15,6 +15,8 @@ Requires-Dist: polars>=1.0.0
 Requires-Dist: scikit-learn>=1.0.0
 Requires-Dist: scipy>=1.10.0
 Requires-Dist: numpy>=2.0.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: chardet>=5.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Dynamic: license-file

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "0.4.0"
+version = "0.6.0"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -21,6 +21,8 @@ dependencies = [
     "scikit-learn>=1.0.0",
     "scipy>=1.10.0",
     "numpy>=2.0.0",
+    "pandas>=2.0.0",
+    "chardet>=5.0.0",
 ]
 [project.optional-dependencies]

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_boolean_profiler.py RENAMED Viewed

@@ -84,19 +84,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
         if override is not None:
             return False
-        # Native boolean dtype
-        if series.dtype == pl.Boolean:
-            return True
-        # Integer {0, 1} column — check after dropping nulls
-        if series.dtype in _INT_DTYPES:
-            clean = series.drop_nulls()
-            if clean.len() == 0:
-                return False
-            unique_vals = set(clean.unique().to_list())
-            return unique_vals <= {0, 1}
-        return False
+        return True
     # ------------------------------------------------------------------
     # Orchestration

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_categorical.py RENAMED Viewed

@@ -49,7 +49,6 @@ from .config import (
     ProfileConfig,
     SemanticType,
 )
-from ..models._data_types import _CAT_DTYPES
 # ---------------------------------------------------------------------------
 # Module-level thresholds (documented so callers can see what drives flags)
@@ -115,7 +114,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
         if override is not None:
             return False
-        return series.dtype in _CAT_DTYPES
+        return True
     def _run(
         self,

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_correlation_config.py RENAMED Viewed

@@ -7,41 +7,41 @@ numeric/categorical column lists that are already resolved upstream).
 Design notes
 ------------
-- Pearson matrix   : linear relationships between numeric columns.
-- Spearman matrix  : monotonic (rank-based) relationships; robust to
-                     outliers and non-linearity.
-- Near-redundancy  : any pair with |r| > 0.95 flagged — identical signal,
-                     one should be dropped before modelling.
-- Feature–target   : Pearson for numeric target, ANOVA F / eta² for
-                     categorical target.  Top-10 reported.
-- Mutual information: MI for all features vs target (classif or regression).
-                     Captures non-linear dependencies correlation misses.
+- Pearson / Spearman : linear / monotonic relationships between numeric columns.
+- Cramér's V         : association between categorical column pairs [0, 1].
+- Eta-squared        : numeric-categorical association via ANOVA [0, 1].
+- Near-redundancy    : Pearson/Spearman |r| > 0.95, Cramér's V > 0.80,
+                       or eta² > 0.50 flagged — near-identical signal.
+- Feature–target     : Pearson (numeric target), ANOVA/eta² (categorical target).
+- Mutual information : MI for all features vs target (classif or regression).
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
 from enum import StrEnum
 from typing import Optional
 # ---------------------------------------------------------------------------
 # Enums
 # ---------------------------------------------------------------------------
 class CorrelationMethod(StrEnum):
-    Pearson  = "pearson"
+    Pearson = "pearson"
     Spearman = "spearman"
 class TargetType(StrEnum):
-    Numeric      = "numeric"      # numeric target  → Pearson + MI regression
-    Categorical  = "categorical"  # categorical target → ANOVA/eta² + MI classif
+    Numeric = "numeric"  # numeric target  → Pearson + MI regression
+    Categorical = "categorical"  # categorical target → ANOVA/eta² + MI classif
 # ---------------------------------------------------------------------------
 # Pairwise correlation result
 # ---------------------------------------------------------------------------
 @dataclass
 class CorrelationPair:
     """
@@ -62,14 +62,74 @@ class CorrelationPair:
     col_a: str
     col_b: str
-    pearson_r:  Optional[float] = None
+    pearson_r: Optional[float] = None
     spearman_r: Optional[float] = None
     near_redundant: bool = False
     def to_dict(self) -> dict:
         return {
-            "col_a": self.col_a, "col_b": self.col_b,
-            "pearson_r": self.pearson_r, "spearman_r": self.spearman_r,
+            "col_a": self.col_a,
+            "col_b": self.col_b,
+            "pearson_r": self.pearson_r,
+            "spearman_r": self.spearman_r,
+            "near_redundant": self.near_redundant,
+        }
+@dataclass
+class CramerVPair:
+    """
+    Cramér's V association between two categorical columns.
+    Attributes
+    ----------
+    col_a, col_b : str
+    cramer_v : float | None
+        Cramér's V in [0, 1]. None when computation fails or sample too small.
+    near_redundant : bool
+        True when cramer_v exceeds the near-redundancy threshold (default 0.80).
+    """
+    col_a: str = ""
+    col_b: str = ""
+    cramer_v: Optional[float] = None
+    near_redundant: bool = False
+    def to_dict(self) -> dict:
+        return {
+            "col_a": self.col_a,
+            "col_b": self.col_b,
+            "cramer_v": self.cramer_v,
+            "near_redundant": self.near_redundant,
+        }
+@dataclass
+class EtaSquaredPair:
+    """
+    Eta-squared (η²) association between a numeric and a categorical column.
+    Attributes
+    ----------
+    numeric_col : str
+    categorical_col : str
+    eta_squared : float | None
+        Effect size in [0, 1]. None when computation fails.
+        Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
+    near_redundant : bool
+        True when eta_squared exceeds the near-redundancy threshold (default 0.50).
+    """
+    numeric_col: str = ""
+    categorical_col: str = ""
+    eta_squared: Optional[float] = None
+    near_redundant: bool = False
+    def to_dict(self) -> dict:
+        return {
+            "numeric_col": self.numeric_col,
+            "categorical_col": self.categorical_col,
+            "eta_squared": self.eta_squared,
             "near_redundant": self.near_redundant,
         }
@@ -78,6 +138,7 @@ class CorrelationPair:
 # Feature–target entries
 # ---------------------------------------------------------------------------
 @dataclass
 class NumericTargetCorrelation:
     """
@@ -88,7 +149,8 @@ class NumericTargetCorrelation:
     feature : str
     pearson_r : float | None
     """
-    feature:   str
+    feature: str
     pearson_r: Optional[float] = None
     def to_dict(self) -> dict:
@@ -113,15 +175,18 @@ class CategoricalTargetCorrelation:
         Effect size: SS_between / SS_total.  Ranges [0, 1].
         Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
     """
-    feature:     str
+    feature: str
     f_statistic: Optional[float] = None
-    p_value:     Optional[float] = None
+    p_value: Optional[float] = None
     eta_squared: Optional[float] = None
     def to_dict(self) -> dict:
         return {
-            "feature": self.feature, "f_statistic": self.f_statistic,
-            "p_value": self.p_value, "eta_squared": self.eta_squared,
+            "feature": self.feature,
+            "f_statistic": self.f_statistic,
+            "p_value": self.p_value,
+            "eta_squared": self.eta_squared,
         }
@@ -129,6 +194,7 @@ class CategoricalTargetCorrelation:
 # Mutual information
 # ---------------------------------------------------------------------------
 @dataclass
 class MutualInformationEntry:
     """
@@ -143,9 +209,10 @@ class MutualInformationEntry:
     rank : int
         1 = highest MI (most informative).
     """
-    feature:  str
+    feature: str
     mi_score: float = 0.0
-    rank:     int   = 0
+    rank: int = 0
     def to_dict(self) -> dict:
         return {"feature": self.feature, "mi_score": self.mi_score, "rank": self.rank}
@@ -155,6 +222,7 @@ class MutualInformationEntry:
 # Near-redundancy summary
 # ---------------------------------------------------------------------------
 @dataclass
 class NearRedundancyGroup:
     """
@@ -164,17 +232,22 @@ class NearRedundancyGroup:
     The suggested_drop list contains every column except the first
     alphabetically — a simple, deterministic heuristic.
     """
-    columns:       list[str] = field(default_factory=list)
+    columns: list[str] = field(default_factory=list)
     suggested_drop: list[str] = field(default_factory=list)
     def to_dict(self) -> dict:
-        return {"columns": list(self.columns), "suggested_drop": list(self.suggested_drop)}
+        return {
+            "columns": list(self.columns),
+            "suggested_drop": list(self.suggested_drop),
+        }
 # ---------------------------------------------------------------------------
 # Top-level result
 # ---------------------------------------------------------------------------
 @dataclass
 class CorrelationProfileResult:
     """
@@ -211,23 +284,34 @@ class CorrelationProfileResult:
     # Column scope
     analysed_numeric_columns: list[str] = field(default_factory=list)
+    analysed_categorical_columns: list[str] = field(default_factory=list)
     # Pairwise matrices
-    pearson_matrix:  dict[str, dict[str, float]] = field(default_factory=dict)
+    pearson_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
     spearman_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
-    # Pairwise summaries
-    pairwise:             list[CorrelationPair] = field(default_factory=list)
+    # Pairwise summaries — numeric ↔ numeric
+    pairwise: list[CorrelationPair] = field(default_factory=list)
     near_redundant_pairs: list[CorrelationPair] = field(default_factory=list)
     near_redundancy_groups: list[NearRedundancyGroup] = field(default_factory=list)
+    # Pairwise summaries — categorical ↔ categorical (Cramér's V)
+    cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
+    near_redundant_cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
+    # Pairwise summaries — numeric ↔ categorical (eta-squared)
+    eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
+    near_redundant_eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
     # Target info
-    target_column: Optional[str]       = None
-    target_type:   Optional[TargetType] = None
+    target_column: Optional[str] = None
+    target_type: Optional[TargetType] = None
     # Feature–target correlations (top-10 each)
-    feature_target_numeric:      list[NumericTargetCorrelation]      = field(default_factory=list)
-    feature_target_categorical:  list[CategoricalTargetCorrelation]  = field(default_factory=list)
+    feature_target_numeric: list[NumericTargetCorrelation] = field(default_factory=list)
+    feature_target_categorical: list[CategoricalTargetCorrelation] = field(
+        default_factory=list
+    )
     # Mutual information (all features, ranked)
     mutual_information: list[MutualInformationEntry] = field(default_factory=list)
@@ -249,14 +333,29 @@ class CorrelationProfileResult:
     def to_dict(self) -> dict:
         return {
             "analysed_numeric_columns": list(self.analysed_numeric_columns),
+            "analysed_categorical_columns": list(self.analysed_categorical_columns),
             "pearson_matrix": {k: dict(v) for k, v in self.pearson_matrix.items()},
             "spearman_matrix": {k: dict(v) for k, v in self.spearman_matrix.items()},
             "pairwise": [p.to_dict() for p in self.pairwise],
             "near_redundant_pairs": [p.to_dict() for p in self.near_redundant_pairs],
-            "near_redundancy_groups": [g.to_dict() for g in self.near_redundancy_groups],
+            "near_redundancy_groups": [
+                g.to_dict() for g in self.near_redundancy_groups
+            ],
+            "cramer_v_pairs": [p.to_dict() for p in self.cramer_v_pairs],
+            "near_redundant_cramer_v_pairs": [
+                p.to_dict() for p in self.near_redundant_cramer_v_pairs
+            ],
+            "eta_squared_pairs": [p.to_dict() for p in self.eta_squared_pairs],
+            "near_redundant_eta_squared_pairs": [
+                p.to_dict() for p in self.near_redundant_eta_squared_pairs
+            ],
             "target_column": self.target_column,
             "target_type": str(self.target_type) if self.target_type else None,
-            "feature_target_numeric": [f.to_dict() for f in self.feature_target_numeric],
-            "feature_target_categorical": [f.to_dict() for f in self.feature_target_categorical],
+            "feature_target_numeric": [
+                f.to_dict() for f in self.feature_target_numeric
+            ],
+            "feature_target_categorical": [
+                f.to_dict() for f in self.feature_target_categorical
+            ],
             "mutual_information": [m.to_dict() for m in self.mutual_information],
         }

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_correlation_profiler.py RENAMED Viewed

@@ -47,6 +47,8 @@ from ._correlation_config import (
     CategoricalTargetCorrelation,
     CorrelationPair,
     CorrelationProfileResult,
+    CramerVPair,
+    EtaSquaredPair,
     MutualInformationEntry,
     NearRedundancyGroup,
     NumericTargetCorrelation,
@@ -55,6 +57,8 @@ from ._correlation_config import (
 from ..models._data_types import _NUMERIC_DTYPES, _INT_DTYPES
 _NEAR_REDUNDANT_THRESHOLD: float = 0.95
+_NEAR_REDUNDANT_CRAMER_V_THRESHOLD: float = 0.80
+_NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD: float = 0.50
 _TOP_N_FEATURE_TARGET: int = 10
 _MI_N_NEIGHBORS: int = 3
 _MI_MIN_ROWS: int = 10  # min complete-case rows for a meaningful k-NN MI estimate
@@ -142,13 +146,14 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
         self,
         df: pl.DataFrame,
         numeric_cols: list[str],
+        categorical_cols: Optional[list[str]] = None,
     ) -> CorrelationProfileResult:
         """
         Compute pairwise feature-feature correlation matrices.
-        Pearson + Spearman matrices and near-redundancy groups are filled.
-        All target-specific fields are left at their defaults (empty lists /
-        None).  Call profile_target() separately for each target column.
+        Pearson + Spearman for numeric pairs, Cramér's V for categorical pairs,
+        eta-squared for numeric-categorical pairs.  All target-specific fields
+        are left at their defaults.  Call profile_target() for target analysis.
         """
         result = CorrelationProfileResult()
@@ -159,6 +164,9 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
         ]
         result.analysed_numeric_columns = resolved_numeric
+        resolved_categorical = [c for c in (categorical_cols or []) if c in df.columns]
+        result.analysed_categorical_columns = resolved_categorical
         if len(resolved_numeric) >= 2:
             pearson_mat, spearman_mat = self._compute_matrices(df, resolved_numeric)
             result.pearson_matrix = pearson_mat
@@ -171,6 +179,22 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
                 result.near_redundant_pairs
             )
+        if len(resolved_categorical) >= 2:
+            result.cramer_v_pairs = self._compute_cramer_v_pairs(
+                df, resolved_categorical, _NEAR_REDUNDANT_CRAMER_V_THRESHOLD
+            )
+            result.near_redundant_cramer_v_pairs = [
+                p for p in result.cramer_v_pairs if p.near_redundant
+            ]
+        if resolved_numeric and resolved_categorical:
+            result.eta_squared_pairs = self._compute_eta_squared_pairs(
+                df, resolved_numeric, resolved_categorical, _NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD
+            )
+            result.near_redundant_eta_squared_pairs = [
+                p for p in result.eta_squared_pairs if p.near_redundant
+            ]
         return result
     def profile_target(
@@ -316,6 +340,147 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
             for members in uf.groups()
         ]
+    # ------------------------------------------------------------------
+    # Step 3b: Cramér's V — categorical ↔ categorical
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _compute_cramer_v_pairs(
+        df: pl.DataFrame,
+        cat_cols: list[str],
+        threshold: float,
+    ) -> list[CramerVPair]:
+        try:
+            from scipy.stats import chi2_contingency
+        except ImportError:
+            warnings.warn(
+                "scipy is required for Cramér's V. Install: pip install scipy",
+                stacklevel=3,
+            )
+            return []
+        import numpy as np
+        pairs: list[CramerVPair] = []
+        for col_a, col_b in itertools.combinations(cat_cols, 2):
+            pair_df = (
+                df.select([
+                    pl.col(col_a).cast(pl.Utf8, strict=False),
+                    pl.col(col_b).cast(pl.Utf8, strict=False),
+                ])
+                .drop_nulls()
+            )
+            n = pair_df.height
+            if n < 5:
+                pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
+                continue
+            counts = pair_df.group_by([col_a, col_b]).agg(pl.len().alias("count"))
+            a_unique = sorted(counts[col_a].unique().to_list())
+            b_unique = sorted(counts[col_b].unique().to_list())
+            if len(a_unique) < 2 or len(b_unique) < 2:
+                pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
+                continue
+            a_idx = {v: i for i, v in enumerate(a_unique)}
+            b_idx = {v: i for i, v in enumerate(b_unique)}
+            ct = np.zeros((len(a_unique), len(b_unique)), dtype=int)
+            for a_val, b_val, cnt in zip(
+                counts[col_a].to_list(),
+                counts[col_b].to_list(),
+                counts["count"].to_list(),
+            ):
+                ct[a_idx[a_val], b_idx[b_val]] = cnt
+            try:
+                chi2, _, _, _ = chi2_contingency(ct)
+                r, c = ct.shape
+                phi2 = chi2 / n
+                # Bergsma & Wicher (2013) bias correction
+                phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
+                r_corr = r - (r - 1) ** 2 / (n - 1)
+                c_corr = c - (c - 1) ** 2 / (n - 1)
+                v = float(np.sqrt(phi2_corr / min(r_corr - 1, c_corr - 1)))
+                v = max(0.0, min(1.0, v))
+            except Exception as exc:
+                warnings.warn(
+                    f"Cramér's V failed for ({col_a}, {col_b}): {exc}", stacklevel=3
+                )
+                pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
+                continue
+            pairs.append(CramerVPair(
+                col_a=col_a, col_b=col_b,
+                cramer_v=v,
+                near_redundant=v > threshold,
+            ))
+        return pairs
+    # ------------------------------------------------------------------
+    # Step 3c: Eta-squared — numeric ↔ categorical
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _compute_eta_squared_pairs(
+        df: pl.DataFrame,
+        numeric_cols: list[str],
+        cat_cols: list[str],
+        threshold: float,
+    ) -> list[EtaSquaredPair]:
+        try:
+            from scipy.stats import f_oneway
+        except ImportError:
+            warnings.warn(
+                "scipy is required for eta-squared. Install: pip install scipy",
+                stacklevel=3,
+            )
+            return []
+        pairs: list[EtaSquaredPair] = []
+        for num_col in numeric_cols:
+            feat = df[num_col].cast(pl.Float64)
+            valid_feat = feat.drop_nulls()
+            if valid_feat.len() == 0:
+                continue
+            grand_mean = float(valid_feat.mean())  # type: ignore[arg-type]
+            ss_total = float(((valid_feat - grand_mean) ** 2).sum() or 0.0)
+            for cat_col in cat_cols:
+                target = df[cat_col]
+                categories = target.drop_nulls().unique().to_list()
+                groups = [
+                    feat.filter(target == cat).drop_nulls().to_numpy()
+                    for cat in categories
+                ]
+                non_empty = [g for g in groups if len(g) > 0]
+                if len(non_empty) < 2:
+                    pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
+                    continue
+                try:
+                    f_oneway(*non_empty)
+                    ss_between = sum(
+                        len(g) * (float(g.mean()) - grand_mean) ** 2
+                        for g in non_empty
+                    )
+                    eta_sq = ss_between / ss_total if ss_total > 0 else 0.0
+                    eta_sq = max(0.0, min(1.0, eta_sq))
+                except Exception as exc:
+                    warnings.warn(
+                        f"Eta-squared failed for ({num_col}, {cat_col}): {exc}",
+                        stacklevel=3,
+                    )
+                    pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
+                    continue
+                pairs.append(EtaSquaredPair(
+                    numeric_col=num_col, categorical_col=cat_col,
+                    eta_squared=eta_sq,
+                    near_redundant=eta_sq > threshold,
+                ))
+        return pairs
     # ------------------------------------------------------------------
     # Step 5a: Feature–target Pearson (unchanged)
     # ------------------------------------------------------------------

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_missingness_profiler.py RENAMED Viewed

@@ -207,7 +207,8 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
         profile.effective_null_ratio = eff_count / n_rows if n_rows else 0.0
         r = profile.effective_null_ratio
-        if r < _SEVERITY_MINOR:
+        if r < _SEVERITY_MINOR and r != 0:
             profile.severity = MissingSeverity.Minor
         elif r < _SEVERITY_MODERATE:
             profile.severity = MissingSeverity.Moderate

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_numeric_profiler.py RENAMED Viewed

@@ -50,7 +50,6 @@ from ._numeric_config import (
     NumericTopValueEntry,
     HistogramBin,
 )
-from ..models._data_types import _NUMERIC_DTYPES
 # ---------------------------------------------------------------------------
 # Thresholds (documented so callers can see what drives labels / flags)
@@ -119,7 +118,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
         if override is not None:
             return False
-        return series.dtype in _NUMERIC_DTYPES
+        return True
     def _run(
         self,
@@ -127,9 +126,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
         columns: list[str],
     ) -> NumericProfileResult:
         result = NumericProfileResult()
         n_rows = df.height
-        # Intersect requested columns with the actual schema
         available = [
             c
             for c in self._resolve_columns(df.columns, columns)
@@ -137,15 +135,78 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
         ]
         result.analysed_columns = available
-        for col_name in available:
-            series = df[col_name]
-            profile = self._profile_column(series, n_rows)
-            result.columns[col_name] = profile
+        if not available:
+            return result
+        # One df.select([...]) for all scalar stats across all columns so
+        # Polars can parallelise expression evaluation rather than running
+        # independent query plans per column.
+        exprs: list[pl.Expr] = []
+        for col in available:
+            c = pl.col(col).cast(pl.Float64, strict=False)
+            exprs.append(c.mean().alias(f"{col}__mean"))
+            exprs.append(c.median().alias(f"{col}__median"))
+            exprs.append(c.min().alias(f"{col}__min"))
+            exprs.append(c.max().alias(f"{col}__max"))
+            exprs.append(c.std(ddof=1).alias(f"{col}__std"))
+            for q in _QUANTILE_LEVELS:
+                exprs.append(
+                    c.quantile(q, interpolation="linear").alias(f"{col}__q{q}")
+                )
+        batch = df.select(exprs).row(0, named=True)
+        for col in available:
+            series = df[col]
+            f64 = series.cast(pl.Float64, strict=False)
+            clean = f64.drop_nulls()
+            profile = NumericStats()
+            if clean.len() == 0:
+                result.columns[col] = profile
+                continue
+            # Central tendency
+            mean = float(batch[f"{col}__mean"])
+            median = float(batch[f"{col}__median"])
+            profile.mean = mean
+            profile.median = median
+            if median == 0.0:
+                profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
+            else:
+                profile.mean_median_ratio = mean / median
+            # Range
+            profile.min = float(batch[f"{col}__min"])
+            profile.max = float(batch[f"{col}__max"])
+            # Spread — Polars returns null for std with ddof=1 on a single row
+            std_val = batch[f"{col}__std"]
+            profile.std = float(std_val) if std_val is not None else 0.0
+            profile.variance = profile.std ** 2
+            # Percentiles
+            q_vals = [batch[f"{col}__q{q}"] for q in _QUANTILE_LEVELS]
+            profile.percentiles = PercentileSnapshot(
+                p1=q_vals[0], p5=q_vals[1], p25=q_vals[2], p50=q_vals[3],
+                p75=q_vals[4], p95=q_vals[5], p99=q_vals[6],
+            )
+            # Frequency / distribution stays per-column (returns a frame, not a scalar)
+            self._compute_frequency_and_distribution(series, clean, profile, n_rows)
+            # Shape stays per-column (delegates to scipy on a numpy array)
+            self._compute_shape(clean, profile)
+            self._check_scale_anomaly(profile)
+            result.columns[col] = profile
         return result
     # ------------------------------------------------------------------
-    # Per-column driver
+    # Per-column helpers (frequency/distribution and shape only —
+    # scalar stats are now batched in _run above)
     # ------------------------------------------------------------------
     @staticmethod
@@ -196,7 +257,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
             # --- 20-Bin Histogram Distribution (Continuous) ---
             import numpy as np
-            counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins=20)
+            counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
             profile.histogram = [
                 HistogramBin(
                     lower_bound=float(bin_edges[i]),
@@ -207,73 +268,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
                 for i in range(len(counts))
             ]
-    def _profile_column(
-        self,
-        series: pl.Series,
-        n_rows: int,
-    ) -> NumericStats:
-        profile = NumericStats()
-        f64 = series.cast(pl.Float64)
-        clean = f64.drop_nulls()
-        if clean.len() == 0:
-            return profile
-        self._compute_central_tendency(clean, profile)
-        self._compute_range(clean, profile)
-        self._compute_frequency_and_distribution(series, clean, profile, n_rows)
-        self._compute_percentiles(clean, profile)
-        self._compute_spread(clean, profile)
-        self._compute_shape(clean, profile)
-        self._check_scale_anomaly(profile)
-        return profile
     # ------------------------------------------------------------------
-    # Step 1: Central tendency
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_central_tendency(
-        clean: pl.Series,
-        profile: NumericStats,
-    ) -> None:
-        mean = float(clean.mean())  # type: ignore[arg-type]
-        median = float(clean.median())  # type: ignore[arg-type]
-        profile.mean = mean
-        profile.median = median
-        # Mean/median ratio: primary skew indicator at a glance.
-        # Guard against division by zero (e.g. a column of all zeros).
-        if median == 0.0:
-            profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
-        else:
-            profile.mean_median_ratio = mean / median
-    # ------------------------------------------------------------------
-    # Step 2: Spread
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_spread(
-        clean: pl.Series,
-        profile: NumericStats,
-    ) -> None:
-        n = clean.len()
-        if n < 2:
-            # Std / variance undefined for a single observation
-            profile.std = 0.0
-            profile.variance = 0.0
-            return
-        std = float(clean.std(ddof=1))  # type: ignore[arg-type]
-        profile.std = std
-        profile.variance = std**2
-    # ------------------------------------------------------------------
-    # Step 3: Shape — skewness and kurtosis
+    # Step 2: Shape — skewness and kurtosis
     # ------------------------------------------------------------------
     @staticmethod
@@ -315,48 +311,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
             profile.kurtosis_tag = KurtosisTag.Mesokurtic
     # ------------------------------------------------------------------
-    # Step 4: Range
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_range(
-        clean: pl.Series,
-        profile: NumericStats,
-    ) -> None:
-        profile.min = float(clean.min())  # type: ignore[arg-type]
-        profile.max = float(clean.max())  # type: ignore[arg-type]
-    # ------------------------------------------------------------------
-    # Step 5: Percentiles
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_percentiles(
-        clean: pl.Series,
-        profile: NumericStats,
-    ) -> None:
-        # Polars quantile() is O(n log n) once; compute all at once via select
-        # to avoid repeated passes.
-        quantile_frame = pl.DataFrame({"v": clean}).select(
-            [
-                pl.col("v").quantile(q, interpolation="linear").alias(f"q{i}")
-                for i, q in enumerate(_QUANTILE_LEVELS)
-            ]
-        )
-        row = quantile_frame.row(0)
-        # row order: p1, p5, p25, p50, p75, p95, p99
-        profile.percentiles = PercentileSnapshot(
-            p1=row[0],
-            p5=row[1],
-            p25=row[2],
-            p50=row[3],
-            p75=row[4],
-            p95=row[5],
-            p99=row[6],
-        )
-    # ------------------------------------------------------------------
-    # Step 6: Scale-anomaly flag
+    # Step 3: Scale-anomaly flag
     # ------------------------------------------------------------------
     @staticmethod

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_target_profiler.py RENAMED Viewed

@@ -148,9 +148,11 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
         """Generates numeric metrics and checks for target skewness."""
         num_profiler = NumericProfiler(config=self.config)
-        num_profile = num_profiler._profile_column(series, n_rows)
+        col_name = series.name
+        num_result = num_profiler.profile(series.to_frame(), [col_name])
+        num_profile = num_result.columns.get(col_name)
         result.numeric_profile = num_profile
         # Flag Skewness (Highly skewed targets often require Log/Yeo-Johnson transforms)
-        if num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
+        if num_profile and num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
             result.flags.append(TargetFlag.HighlySkewed)

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_type_detector.py RENAMED Viewed

@@ -357,16 +357,17 @@ class TypeDetector:
             return
         if series.dtype in (pl.Utf8, pl.String):
-            lengths = series.drop_nulls().str.len_chars()
-            if lengths.len() == 0:
+            non_null = series.drop_nulls()
+            if non_null.len() == 0:
                 return
-            median_length = lengths.median()
+            median_length = non_null.str.len_chars().median()
+            if median_length is not None and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH:
+                return
-            if (
-                median_length is not None
-                and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH
-            ):
+            # Real identifiers are single tokens — no spaces.
+            # Sentences and descriptions have median_spaces > 0.
+            if float(non_null.str.count_matches(r"\s+").median() or 0.0) > 0:
                 return
         info.flags.append(TypeFlag.IdentifierColumn)

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/structural.py RENAMED Viewed

@@ -199,7 +199,7 @@ class StructuralProfiler:
             # 8a. Feature-feature matrices — computed ONCE, target-independent.
             feature_corr = corr_profiler.profile_features(
-                data, numeric_cols
+                data, numeric_cols, categorical_cols
             )
             result.dataset.feature_correlation = feature_corr

{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.4.0
+Version: 0.6.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License
@@ -15,6 +15,8 @@ Requires-Dist: polars>=1.0.0
 Requires-Dist: scikit-learn>=1.0.0
 Requires-Dist: scipy>=1.10.0
 Requires-Dist: numpy>=2.0.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: chardet>=5.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Dynamic: license-file