dataforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0.dist-info/METADATA +34 -0
- dataforge_ml-0.1.0.dist-info/RECORD +54 -0
- dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
- dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
- models/__init__.py +0 -0
- models/_data_structure.py +7 -0
- models/_data_types.py +12 -0
- profiling/__init__.py +35 -0
- profiling/_base.py +101 -0
- profiling/_boolean_config.py +37 -0
- profiling/_boolean_profiler.py +191 -0
- profiling/_categorical.py +315 -0
- profiling/_categorical_config.py +87 -0
- profiling/_correlation_config.py +225 -0
- profiling/_correlation_profiler.py +544 -0
- profiling/_datetime_config.py +98 -0
- profiling/_datetime_profiler.py +406 -0
- profiling/_missingness_config.py +137 -0
- profiling/_missingness_profiler.py +252 -0
- profiling/_numeric_config.py +116 -0
- profiling/_numeric_profiler.py +403 -0
- profiling/_tabular.py +249 -0
- profiling/_target_config.py +74 -0
- profiling/_target_profiler.py +156 -0
- profiling/_text_config.py +40 -0
- profiling/_text_profiler.py +194 -0
- profiling/_type_detector.py +463 -0
- profiling/config.py +236 -0
- profiling/structural.py +280 -0
- splitting/__init__.py +4 -0
- splitting/_config.py +56 -0
- splitting/_splitter.py +202 -0
- tests/__init__.py +0 -0
- tests/conftest.py +7 -0
- tests/integration/__init__.py +0 -0
- tests/integration/conftest.py +82 -0
- tests/integration/test_structural_end_to_end.py +219 -0
- tests/unit/__init__.py +0 -0
- tests/unit/profiling/__init__.py +0 -0
- tests/unit/profiling/conftest.py +81 -0
- tests/unit/profiling/test_boolean_profiler.py +91 -0
- tests/unit/profiling/test_categorical_profiler.py +182 -0
- tests/unit/profiling/test_correlation_profiler.py +124 -0
- tests/unit/profiling/test_datetime_profiler.py +133 -0
- tests/unit/profiling/test_missingness_profiler.py +51 -0
- tests/unit/profiling/test_numeric_profiler.py +212 -0
- tests/unit/profiling/test_target_profiler.py +44 -0
- tests/unit/profiling/test_text_profiler.py +61 -0
- tests/unit/profiling/test_type_detector.py +32 -0
- tests/unit/splitting/__init__.py +0 -0
- tests/unit/splitting/test_data_splitter.py +417 -0
- utils/__init__.py +0 -0
- utils/data_loader.py +110 -0
profiling/_numeric_profiler.py
ADDED
@@ -0,0 +1,403 @@
"""
NumericProfiler – Phase 1 extension: Numeric Distribution Profiling.

Per-column metrics (opt-in via ProfileConfig.numeric_columns):
1. Central tendency – mean, median, mean/median ratio
2. Spread – std, variance, IQR (Q3 – Q1)
3. Skewness & kurtosis – with severity/tag labels
4. Range – min, max
5. Percentile profile – p1, p5, p25, p50, p75, p95, p99
6. Scale-anomaly flag – values spanning 3+ orders of magnitude

Only numeric Polars dtypes are profiled; non-numeric columns in the
requested list are skipped with a warning rather than raising.

Integration
-----------
Add ``numeric_columns: list[str] | None`` to ProfileConfig, then call::

    from profiling._numeric_profiler import NumericProfiler

    num_profiler = NumericProfiler(config=cfg)
    num_result = num_profiler.profile(
        df, columns=["age", "income", "temperature"]
    )

Attach ``num_result`` to ``TabularProfileResult`` as
``result.numeric_profile``.
"""

from __future__ import annotations


import polars as pl

from ._base import ColumnBatchProfiler
from .config import (
    ProfileConfig,
    SemanticType,
)
from ._correlation_profiler import _INT_DTYPES
from ._numeric_config import (
    NumericProfileResult,
    NumericStats,
    PercentileSnapshot,
    KurtosisTag,
    NumericFlag,
    SkewSeverity,
    NumericTopValueEntry,
    HistogramBin,
)
from ..models._data_types import _NUMERIC_DTYPES

# ---------------------------------------------------------------------------
# Thresholds (documented so callers can see what drives labels / flags)
# ---------------------------------------------------------------------------

# Skewness severity bands (applied to |skewness|)
_SKEW_NORMAL = 0.5    # |skew| ≤ this → normal
_SKEW_MODERATE = 1.0  # |skew| ≤ this → moderate
_SKEW_HIGH = 2.0      # |skew| ≤ this → high
# |skew| > 2.0 → severe

# Excess kurtosis bands
_KURT_PLATY_UPPER = -1.0  # excess < this → platykurtic
_KURT_LEPTO_LOWER = 3.0   # excess > this → leptokurtic
# else → mesokurtic

# Scale-anomaly: flag when max/min ratio spans ≥ 3 orders of magnitude
_SCALE_ORDERS_OF_MAGNITUDE = 3  # i.e. ratio ≥ 10^3


# Percentile quantile levels (in order)
_QUANTILE_LEVELS = (0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99)
_NEAR_CONSTANT_THRESHOLD = 0.90
_DISCRETE_MAX_UNIQUE = 20


class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
    """
    Numeric distribution profiler for Polars DataFrames.

    Parameters
    ----------
    columns : list[str]
        Columns to profile. Non-numeric or absent columns are skipped
        with a warning; they do not raise.
    config : ProfileConfig | None
        Shared profiling configuration.
    """

    def __init__(
        self,
        config: ProfileConfig | None = None,
    ) -> None:
        super().__init__(config)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str],
    ) -> NumericProfileResult:
        return self._run(data, columns)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _eligible(self, series: pl.Series) -> bool:
        override = self.config.column_overrides.get(series.name)
        if override == SemanticType.Numeric:
            return True

        if override is not None:
            return False

        return series.dtype in _NUMERIC_DTYPES

    def _run(
        self,
        df: pl.DataFrame,
        columns: list[str],
    ) -> NumericProfileResult:
        result = NumericProfileResult()

        n_rows = df.height
        # Intersect requested columns with the actual schema
        available = [
            c
            for c in self._resolve_columns(df.columns, columns)
            if self._eligible(df[c])
        ]
        result.analysed_columns = available

        for col_name in available:
            series = df[col_name]
            profile = self._profile_column(series, n_rows)
            result.columns[col_name] = profile

        return result

    # ------------------------------------------------------------------
    # Per-column driver
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_frequency_and_distribution(
        original_series: pl.Series,
        clean_f64: pl.Series,
        profile: NumericStats,
        n_rows: int,
    ) -> None:
        """
        Compute the mode and, depending on whether the feature is continuous
        or discrete, calculate a 20-bin histogram OR top-10 value counts.
        """
        if clean_f64.len() == 0:
            return

        vc = clean_f64.value_counts(sort=True)
        col_name = clean_f64.name

        # --- Absolute Mode Frequency ---
        mode_val = float(vc[col_name][0])
        mode_count = int(vc["count"][0])
        mode_freq = mode_count / n_rows if n_rows > 0 else 0.0

        profile.mode = mode_val
        profile.mode_frequency = mode_freq

        if mode_freq > _NEAR_CONSTANT_THRESHOLD:
            profile.flags.append(NumericFlag.NearConstant)

        n_unique = vc.height
        is_discrete = (
            original_series.dtype in _INT_DTYPES or n_unique <= _DISCRETE_MAX_UNIQUE
        )

        if is_discrete:
            # --- Top-10 Distribution (Discrete) ---
            top_rows = min(10, n_unique)
            profile.top_values = [
                NumericTopValueEntry(
                    value=float(vc[col_name][i]),
                    count=int(vc["count"][i]),
                    percentage=int(vc["count"][i]) / n_rows if n_rows > 0 else 0.0,
                )
                for i in range(top_rows)
            ]
        else:
            # --- 20-Bin Histogram Distribution (Continuous) ---
            import numpy as np

            counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins=20)
            profile.histogram = [
                HistogramBin(
                    lower_bound=float(bin_edges[i]),
                    upper_bound=float(bin_edges[i + 1]),
                    count=int(counts[i]),
                    percentage=int(counts[i]) / n_rows if n_rows > 0 else 0.0,
                )
                for i in range(len(counts))
            ]

    def _profile_column(
        self,
        series: pl.Series,
        n_rows: int,
    ) -> NumericStats:
        profile = NumericStats()

        f64 = series.cast(pl.Float64)
        clean = f64.drop_nulls()

        if clean.len() == 0:
            return profile

        self._compute_central_tendency(clean, profile)
        self._compute_range(clean, profile)
        self._compute_frequency_and_distribution(series, clean, profile, n_rows)
        self._compute_percentiles(clean, profile)
        self._compute_spread(clean, profile)
        self._compute_shape(clean, profile)
        self._check_scale_anomaly(profile)

        return profile

    # ------------------------------------------------------------------
    # Step 1: Central tendency
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_central_tendency(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        mean = float(clean.mean())  # type: ignore[arg-type]
        median = float(clean.median())  # type: ignore[arg-type]

        profile.mean = mean
        profile.median = median

        # Mean/median ratio: primary skew indicator at a glance.
        # Guard against division by zero (e.g. a column of all zeros).
        if median == 0.0:
            profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
        else:
            profile.mean_median_ratio = mean / median

    # ------------------------------------------------------------------
    # Step 2: Spread
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_spread(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        n = clean.len()
        if n < 2:
            # Std / variance undefined for a single observation
            profile.std = 0.0
            profile.variance = 0.0
            return

        std = float(clean.std(ddof=1))  # type: ignore[arg-type]
        profile.std = std
        profile.variance = std**2

    # ------------------------------------------------------------------
    # Step 3: Shape — skewness and kurtosis
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_shape(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        from scipy.stats import skew, kurtosis as scipy_kurtosis

        if clean.len() < 3:
            return

        if profile.std is None or profile.std == 0.0:
            profile.skewness = 0.0
            profile.kurtosis = 0.0
            profile.skewness_severity = SkewSeverity.Normal
            profile.kurtosis_tag = KurtosisTag.Mesokurtic
            return

        arr = clean.to_numpy()
        profile.skewness = float(skew(arr, bias=False))
        profile.kurtosis = float(scipy_kurtosis(arr, bias=False))

        abs_skew = abs(profile.skewness)
        if abs_skew <= _SKEW_NORMAL:
            profile.skewness_severity = SkewSeverity.Normal
        elif abs_skew <= _SKEW_MODERATE:
            profile.skewness_severity = SkewSeverity.Moderate
        elif abs_skew <= _SKEW_HIGH:
            profile.skewness_severity = SkewSeverity.High
        else:
            profile.skewness_severity = SkewSeverity.Severe

        if profile.kurtosis < _KURT_PLATY_UPPER:
            profile.kurtosis_tag = KurtosisTag.Platykurtic
        elif profile.kurtosis > _KURT_LEPTO_LOWER:
            profile.kurtosis_tag = KurtosisTag.Leptokurtic
        else:
            profile.kurtosis_tag = KurtosisTag.Mesokurtic

    # ------------------------------------------------------------------
    # Step 4: Range
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_range(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        profile.min = float(clean.min())  # type: ignore[arg-type]
        profile.max = float(clean.max())  # type: ignore[arg-type]

    # ------------------------------------------------------------------
    # Step 5: Percentiles
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_percentiles(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        # Polars quantile() is O(n log n) once; compute all at once via select
        # to avoid repeated passes.
        quantile_frame = pl.DataFrame({"v": clean}).select(
            [
                pl.col("v").quantile(q, interpolation="linear").alias(f"q{i}")
                for i, q in enumerate(_QUANTILE_LEVELS)
            ]
        )
        row = quantile_frame.row(0)
        # row order: p1, p5, p25, p50, p75, p95, p99
        profile.percentiles = PercentileSnapshot(
            p1=row[0],
            p5=row[1],
            p25=row[2],
            p50=row[3],
            p75=row[4],
            p95=row[5],
            p99=row[6],
        )

    # ------------------------------------------------------------------
    # Step 6: Scale-anomaly flag
    # ------------------------------------------------------------------

    @staticmethod
    def _check_scale_anomaly(
        profile: NumericStats,
    ) -> None:
        """
        Flag when values span ≥ 3 orders of magnitude *on the positive side*.

        Rationale: a column with values like [0.002, 15000] almost certainly
        mixes units or scales, which will mislead distance-based models.

        We use the absolute-value range to handle columns that cross zero
        (e.g. log-returns that go from -0.05 to 500). Columns whose
        entire range is within [-1, 1] are exempt (percentages, probabilities).
        """
        col_min = profile.min
        col_max = profile.max

        if col_min is None or col_max is None:
            return

        abs_min = abs(col_min)
        abs_max = abs(col_max)

        # Skip all-zero or all-same-sign tiny ranges
        if abs_max == 0.0:
            return

        # Exempt probability / ratio columns
        if abs_max <= 1.0 and abs_min <= 1.0:
            return

        # Compute orders of magnitude
        if abs_min == 0.0:
            # Any non-zero max with a zero minimum → infinite ratio →
            # conservatively flag if max is large enough to be suspicious.
            if abs_max >= 10**_SCALE_ORDERS_OF_MAGNITUDE:
                profile.flags.append(NumericFlag.ScaleAnomaly)
            return

        ratio = abs_max / abs_min
        if ratio >= 10**_SCALE_ORDERS_OF_MAGNITUDE:
            profile.flags.append(NumericFlag.ScaleAnomaly)
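For orientation, the sketch below is not part of the wheel contents; it shows one plausible way to drive NumericProfiler end to end. The DataFrame values are made up, and it assumes that passing config=None makes the base ColumnBatchProfiler fall back to a default ProfileConfig, as the signature suggests.

# Illustrative usage sketch only (assumption: config=None yields default settings).
import polars as pl

from profiling._numeric_profiler import NumericProfiler

df = pl.DataFrame(
    {
        "age": [23, 31, 44, 52, 29, 61],
        "income": [0.5, 28_000.0, 54_000.0, 61_500.0, 39_000.0, 1_200_000.0],
    }
)

profiler = NumericProfiler()
result = profiler.profile(df, columns=["age", "income"])

income = result.columns["income"]
# "income" ranges from 0.5 to 1_200_000, so |max| / |min| far exceeds 10**3 and
# _check_scale_anomaly should append NumericFlag.ScaleAnomaly to income.flags.
print(income.min, income.max, income.flags)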
profiling/_tabular.py
ADDED
@@ -0,0 +1,249 @@
"""
TabularProfiler – Phase 1: Structural Profiling for tabular datasets.

All DataFrame operations use Polars (no pandas dependency).

Computes:
  • row / column count (always full dataset)
  • memory usage + per-column breakdown when threshold exceeded
  • duplicate row count & ratio (scoped to config.duplicate_columns)
  • overall sparsity (scoped to config.sparsity_columns)
  • data-type detection (scoped to config.type_detection_columns;
    skipped entirely when None)

Chunked processing is activated automatically when the DataFrame's
estimated memory exceeds config.memory_threshold_mb.
"""

from __future__ import annotations

import math

import polars as pl

from ._base import ModalityProfiler
from .config import (
    MemoryBreakdown,
    ProfileConfig,
    DatasetStats,
)


class TabularProfiler(ModalityProfiler):
    """
    Structural profiler for Polars DataFrames.

    Usage
    -----
    >>> cfg = ProfileConfig(
    ...     duplicate_columns=["user_id", "event_time"],
    ...     sparsity_columns=["age", "income", "postcode"],
    ...     type_detection_columns=["age", "income", "postcode", "created_at"],
    ...     memory_threshold_mb=200,
    ... )
    >>> profiler = TabularProfiler(config=cfg)
    >>> result = profiler.profile(df)
    >>> print(result)
    """

    def __init__(self, config: ProfileConfig | None = None):
        super().__init__(config)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(self, data: pl.DataFrame, **kwargs) -> DatasetStats:
        return self._run(data)

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _run(self, df: pl.DataFrame) -> DatasetStats:
        result = DatasetStats()

        # 1. Shape — always computed on the full frame
        result.row_count = df.height
        result.column_count = df.width

        # 2. Memory
        self._analyse_memory(df, result)

        # Decide processing mode AFTER memory analysis
        use_chunks = (result.memory_breakdown is not None) and result.row_count > 0
        result.was_chunked = use_chunks

        if result.row_count == 0:
            return result

        # 3. Resolve column scopes
        all_cols: list[str] = df.columns
        analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]

        dup_cols = analysed_cols
        missingness_cols = analysed_cols

        if use_chunks:
            self._chunked_metrics(df, dup_cols, missingness_cols, result)
        else:
            self._full_metrics(df, dup_cols, missingness_cols, result)

        return result

    @staticmethod
    def _build_missingness_exprs(df: pl.DataFrame, cols: list[str]) -> list[pl.Expr]:
        exprs = []
        for col_name in cols:
            dtype = df[col_name].dtype
            std_expr = pl.col(col_name).is_null()

            if dtype in (pl.Utf8, pl.String):
                eff_expr = (
                    std_expr
                    | (pl.col(col_name).str.strip_chars() == "")
                    | pl.col(col_name)
                    .str.to_uppercase()
                    .is_in(["NA", "NAN", "NULL", "NONE", "?"])
                )
            elif dtype in (pl.Float32, pl.Float64):
                eff_expr = (
                    std_expr
                    | pl.col(col_name).is_nan()
                    | pl.col(col_name).is_infinite()
                )
            else:
                eff_expr = std_expr

            exprs.append(std_expr.sum().alias(f"{col_name}_std"))
            exprs.append(eff_expr.sum().alias(f"{col_name}_eff"))

        return exprs

    # ------------------------------------------------------------------
    # Memory analysis
    # ------------------------------------------------------------------

    def _analyse_memory(self, df: pl.DataFrame, result: DatasetStats) -> None:
        """
        Populate memory fields on *result*.

        Polars exposes estimated_size() per Series for heap allocation.
        """
        col_bytes: dict[str, int] = {
            col: df[col].estimated_size() for col in df.columns
        }
        total_bytes = sum(col_bytes.values())

        result.memory_bytes = total_bytes
        threshold_bytes = self.config.memory_threshold_mb * 1024 * 1024

        if total_bytes > threshold_bytes:
            result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)

    # ------------------------------------------------------------------
    # Full-frame metrics
    # ------------------------------------------------------------------

    def _full_metrics(
        self,
        df: pl.DataFrame,
        dup_cols: list[str],
        missing_cols: list[str],
        result: DatasetStats,
    ) -> None:
        result.duplicate_count = self._count_duplicates(df, dup_cols)
        result.duplicate_ratio = (
            result.duplicate_count / result.row_count if result.row_count else 0.0
        )

        if missing_cols:
            exprs = self._build_missingness_exprs(df, missing_cols)
            row = df.select(exprs).row(0)

            total_eff_cells = 0
            for i, _ in enumerate(missing_cols):
                eff_nulls = row[i * 2 + 1]
                total_eff_cells += eff_nulls

            total_cells = result.row_count * len(missing_cols)
            result.overall_sparsity = (
                total_eff_cells / total_cells if total_cells else 0.0
            )

    # ------------------------------------------------------------------
    # Chunked metrics
    # ------------------------------------------------------------------

    def _chunked_metrics(
        self,
        df: pl.DataFrame,
        dup_cols: list[str],
        sparsity_cols: list[str],
        result: DatasetStats,
    ) -> None:
        """
        Stream through the DataFrame in row-chunks to keep peak memory low.

        Duplicate detection: hash the dup_cols subset row-by-row and track
        seen hashes — semantics match keep='first'.
        Sparsity is accumulated as (missing_cells, total_cells).
        """
        chunk_size = self.config.chunk_size
        n_chunks = math.ceil(result.row_count / chunk_size)

        seen_hashes: set[int] = set()
        dup_count = 0
        missing_cells = 0
        total_cells = 0

        for i in range(n_chunks):
            start = i * chunk_size
            end = min(start + chunk_size, result.row_count)
            chunk: pl.DataFrame = df.slice(start, end - start)

            if dup_cols:
                # --- duplicates ---
                sub = chunk.select(dup_cols) if dup_cols else chunk
                for row_tuple in sub.iter_rows():
                    h = hash(row_tuple)
                    if h in seen_hashes:
                        dup_count += 1
                    else:
                        seen_hashes.add(h)

            if sparsity_cols:
                # --- sparsity ---
                exprs = self._build_missingness_exprs(chunk, sparsity_cols)
                row = chunk.select(exprs).row(0)
                for j in range(len(sparsity_cols)):
                    missing_cells += row[j * 2 + 1]
                total_cells += chunk.height * len(sparsity_cols)

        result.duplicate_count = dup_count
        result.duplicate_ratio = (
            dup_count / result.row_count if result.row_count else 0.0
        )
        result.overall_sparsity = missing_cells / total_cells if total_cells else 0.0

    # ------------------------------------------------------------------
    # Type detection
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # Stateless helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _count_duplicates(df: pl.DataFrame, cols: list[str]) -> int:
        """
        Count rows that are duplicates (keeping first occurrence).

        Equivalent to pandas duplicated(subset=cols, keep='first').sum().
        """
        sub = df.select(cols) if cols else df
        # is_duplicated() marks ALL occurrences of a duplicate group.
        # We want only the non-first occurrences, so we subtract the
        # number of unique rows.
        n_unique = sub.unique().height
        return df.height - n_unique
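As a quick sanity check of the keep='first' semantics described in _count_duplicates, the sketch below (again, not part of the package) shows that the count is simply total rows minus unique rows over the chosen subset; the toy data is invented.

# Sketch only: verifies the duplicate-count identity used by _count_duplicates.
import polars as pl

df = pl.DataFrame(
    {
        "user_id": [1, 1, 2, 2, 2, 3],
        "event_time": ["t1", "t1", "t2", "t2", "t3", "t4"],
    }
)

dup_cols = ["user_id", "event_time"]
sub = df.select(dup_cols)

# Six rows collapse to four unique (user_id, event_time) pairs, so two rows are
# non-first duplicates; pandas duplicated(subset=dup_cols, keep="first").sum()
# would report the same number.
duplicate_count = df.height - sub.unique().height
print(duplicate_count)  # 2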