dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,406 @@
+ """
+ DatetimeProfiler – Phase 1 extension: Datetime Column Profiling.
+
+ Per-column metrics (opt-in via ProfileConfig.datetime_columns):
+   1. Range – min date, max date, total range in days
+   2. Null analysis – count, ratio, MNAR flag when null_ratio > 5%
+   3. Future dates – count of values > now, with context note
+   4. Granularity – inferred periodicity from the median consecutive gap;
+      a high gap-CV is flagged as irregular
+   5. Temporal signals – audit which of {year, month, day, day-of-week,
+      hour, is-weekend, is-month-end} vary in the data, to guide
+      downstream feature engineering
+
+ Granularity inference bands (median gap in seconds):
+     < 90 s        → secondly
+     < 3 600 s     → minutely
+     < 7 200 s     → hourly
+     < 172 800 s   → daily   (< 2 days)
+     < 1 209 600 s → weekly  (< 14 days)
+     < 5 184 000 s → monthly (< 60 days)
+     else          → yearly
+
+ Integration
+ -----------
+ Add ``datetime_columns: list[str] | None`` to ProfileConfig, then call::
+
+     from profiling._datetime_profiler import DatetimeProfiler
+
+     dt_profiler = DatetimeProfiler(config=cfg)
+     dt_result = dt_profiler.profile(
+         df,
+         columns=["created_at", "event_time"],
+     )
+
+ Attach ``dt_result`` to ``StructuralProfileResult`` as
+ ``result.datetime``.
+ """
+
+ from __future__ import annotations
+
+ from datetime import datetime, timezone
+
+ import polars as pl
+
+ from ._base import ColumnBatchProfiler
+ from .config import (
+     ProfileConfig,
+     SemanticType,
+ )
+ from ._datetime_config import (
+     DatetimeProfileResult,
+     DatetimeStats,
+     InferredGranularity,
+     DatetimeFlag,
+     TemporalSignals,
+ )
+
+ # ---------------------------------------------------------------------------
+ # Thresholds
+ # ---------------------------------------------------------------------------
+
+ # MNAR suspicion: missing rate above this fraction → flag
+ _MNAR_NULL_RATIO_THRESHOLD: float = 0.05
+
+ # Gap coefficient of variation above this → flag as irregular
+ _HIGH_GAP_CV_THRESHOLD: float = 1.0
+
+ # Granularity bands — upper bound (exclusive) in seconds for each label.
+ # Ordered from finest to coarsest.
+ _GRANULARITY_BANDS: list[tuple[float, InferredGranularity]] = [
+     (90.0, InferredGranularity.Secondly),       # < 1.5 min
+     (3_600.0, InferredGranularity.Minutely),    # < 1 h
+     (7_200.0, InferredGranularity.Hourly),      # < 2 h
+     (172_800.0, InferredGranularity.Daily),     # < 2 days
+     (1_209_600.0, InferredGranularity.Weekly),  # < 14 days
+     (5_184_000.0, InferredGranularity.Monthly), # < 60 days
+ ]
+ # Anything ≥ 5_184_000 s → Yearly
+
+ # Recent-data sparsity: consider the last this-fraction of the total range
+ _RECENT_WINDOW_FRACTION: float = 0.10
+
+
+ def _is_datetime_dtype(dtype: pl.DataType) -> bool:
+     """Return True for Date and Datetime (any time-unit / time zone)."""
+     return isinstance(dtype, (pl.Date, pl.Datetime))
+
+
+ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
+     """
+     Datetime distribution profiler for Polars DataFrames.
+
+     Parameters
+     ----------
+     config : ProfileConfig | None
+         Shared profiling configuration. The columns to profile are
+         passed to :meth:`profile` rather than to the constructor;
+         non-datetime columns are skipped.
+     """
+
+     def __init__(
+         self,
+         config: ProfileConfig | None = None,
+     ) -> None:
+         super().__init__(config)
+
+     # ------------------------------------------------------------------
+     # Public API
+     # ------------------------------------------------------------------
+
+     def profile(
+         self,
+         data: pl.DataFrame,
+         columns: list[str],
+     ) -> DatetimeProfileResult:
+         return self._run(data, columns)
+
+     # ------------------------------------------------------------------
+     # Orchestration
+     # ------------------------------------------------------------------
+
+     def _eligible(self, series: pl.Series) -> bool:
+         override = self.config.column_overrides.get(series.name)
+
+         if override == SemanticType.Datetime:
+             return True
+         if override is not None:
+             return False
+
+         return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
+
+     def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
+         if series.dtype in (pl.Utf8, pl.String):
+             coerced = series.str.to_datetime(strict=False)
+             return coerced if coerced.drop_nulls().len() > 0 else None
+         return series
+
+     def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
+         result = DatetimeProfileResult()
+         now = datetime.now(tz=timezone.utc)
+
+         candidates = [
+             c
+             for c in self._resolve_columns(df.columns, columns)
+             if self._eligible(df[c])
+         ]
+
+         available: list[str] = []
+         coerced_cache: dict[str, pl.Series] = {}
+         for col_name in candidates:
+             series = self._coerce_to_datetime(df[col_name])
+             if series is not None:
+                 available.append(col_name)
+                 coerced_cache[col_name] = series
+
+         result.analysed_columns = available
+
+         for col_name in available:
+             profile = self._profile_column(coerced_cache[col_name], df.height, now)
+             result.columns[col_name] = profile
+
+         return result
+
+     # ------------------------------------------------------------------
+     # Per-column driver
+     # ------------------------------------------------------------------
+
+     def _profile_column(
+         self,
+         series: pl.Series,
+         n_rows: int,
+         now: datetime,
+     ) -> DatetimeStats:
+         profile = DatetimeStats()
+
+         # Normalise to microsecond Datetime (UTC) for uniform arithmetic;
+         # Date columns are cast to Datetime at midnight UTC.
+         if isinstance(series.dtype, pl.Date):
+             series = series.cast(pl.Datetime("us", "UTC"))
+         elif isinstance(series.dtype, pl.Datetime):
+             if series.dtype.time_zone is None:
+                 series = series.dt.replace_time_zone("UTC")
+             else:
+                 series = series.dt.convert_time_zone("UTC")
+
+         # Drop nulls for all remaining computations
+         clean = series.drop_nulls()
+
+         if clean.len() == 0:
+             return profile
+
+         # Step 2: range
+         self._compute_range(clean, profile)
+
+         # Step 3: future dates
+         self._check_future_dates(clean, profile, now)
+
+         # Step 3b: recent-data sparsity (needs the range, so after _compute_range)
+         self._check_recent_date_missing(series, profile)
+
+         # Step 4: granularity
+         self._infer_granularity(clean, profile)
+
+         # Step 5: temporal signals
+         self._audit_temporal_signals(clean, profile)
+
+         return profile
+
+     # ------------------------------------------------------------------
+     # Step 2: Range
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _compute_range(
+         clean: pl.Series,
+         profile: DatetimeStats,
+     ) -> None:
+         min_ts = clean.min()
+         max_ts = clean.max()
+
+         if min_ts is not None:
+             profile.min_date = (
+                 min_ts.replace(tzinfo=timezone.utc)
+                 if isinstance(min_ts, datetime)
+                 else min_ts
+             )
+         if max_ts is not None:
+             profile.max_date = (
+                 max_ts.replace(tzinfo=timezone.utc)
+                 if isinstance(max_ts, datetime)
+                 else max_ts
+             )
+
+         if profile.min_date is not None and profile.max_date is not None:
+             delta = profile.max_date - profile.min_date
+             profile.date_range_days = delta.total_seconds() / 86_400.0
+
+     # ------------------------------------------------------------------
+     # Step 3: Future dates
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _check_future_dates(
+         clean: pl.Series,
+         profile: DatetimeStats,
+         now: datetime,
+     ) -> None:
+         # Cast to Int64 (epoch microseconds) and compare against the `now` scalar
+         now_us = int(now.timestamp() * 1_000_000)
+         ts_int = clean.cast(pl.Int64)
+         future_mask = ts_int > now_us
+         future_count = int(future_mask.sum())
+
+         profile.future_date_count = future_count
+         if future_count > 0:
+             profile.flags.append(DatetimeFlag.FutureDates)
+
+     # ------------------------------------------------------------------
+     # Step 3b: Recent data sparsity
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _check_recent_date_missing(
+         series: pl.Series,
+         profile: DatetimeStats,
+     ) -> None:
+         """
+         Flag when the last _RECENT_WINDOW_FRACTION of the observed date
+         range contains far fewer observations than a uniform spread
+         over the range would predict.
+
+         We compare the density in the recent window with the overall
+         density; if the window holds < 20 % of its expected share → flag.
+         """
+         if profile.min_date is None or profile.max_date is None:
+             return
+         if profile.date_range_days is None or profile.date_range_days == 0:
+             return
+
+         range_seconds = profile.date_range_days * 86_400.0
+         window_seconds = range_seconds * _RECENT_WINDOW_FRACTION
+
+         # Compute the cutoff as epoch microseconds
+         max_ts_us = int(profile.max_date.timestamp() * 1_000_000)
+         window_us = int(window_seconds * 1_000_000)
+         cutoff_us = max_ts_us - window_us
+
+         # Cast the series to Int64 (epoch microseconds) for comparison
+         ts_int = series.cast(pl.Int64)
+         recent_mask = ts_int >= cutoff_us
+         recent_count = int(recent_mask.sum())
+
+         # Expected count under a uniform distribution
+         total_non_null = series.drop_nulls().len()
+         if total_non_null == 0:
+             return
+         expected_recent = total_non_null * _RECENT_WINDOW_FRACTION
+         density_ratio = recent_count / expected_recent if expected_recent > 0 else 1.0
+
+         if density_ratio < 0.20:
+             profile.flags.append(DatetimeFlag.RecentDateMissing)
+
+     # ------------------------------------------------------------------
+     # Step 4: Granularity inference
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _infer_granularity(
+         clean: pl.Series,
+         profile: DatetimeStats,
+     ) -> None:
+         """
+         Sort values, compute consecutive gaps in seconds, derive the median gap.
+
+         Uses the Int64 epoch-microsecond representation for a vectorised diff.
+         """
+         n = clean.len()
+         if n < 2:
+             profile.inferred_granularity = InferredGranularity.Irregular
+             return
+
+         ts_us = clean.sort().cast(pl.Int64)  # microseconds since epoch
+         gaps_us = ts_us.diff().drop_nulls()  # consecutive differences
+
+         # Discard zero gaps (duplicate timestamps); after sorting none are negative
+         gaps_us = gaps_us.filter(gaps_us > 0)
+
+         if gaps_us.len() == 0:
+             profile.inferred_granularity = InferredGranularity.Irregular
+             return
+
+         gaps_s = gaps_us.cast(pl.Float64) / 1_000_000.0  # → seconds
+
+         median_gap_s = float(gaps_s.median())  # type: ignore[arg-type]
+         mean_gap_s = float(gaps_s.mean())  # type: ignore[arg-type]
+         std_gap_s = float(gaps_s.std(ddof=1)) if gaps_s.len() > 1 else 0.0
+
+         profile.median_gap_seconds = median_gap_s
+
+         # Coefficient of variation of the gaps; a high CV indicates irregular sampling
+         if mean_gap_s > 0:
+             profile.gap_cv = std_gap_s / mean_gap_s
+             if profile.gap_cv > _HIGH_GAP_CV_THRESHOLD:
+                 profile.flags.append(DatetimeFlag.HighGapVariance)
+         else:
+             profile.gap_cv = 0.0
+
+         # Map the median gap to a granularity label
+         granularity = InferredGranularity.Yearly  # default (coarsest)
+         for upper_bound, label in _GRANULARITY_BANDS:
+             if median_gap_s < upper_bound:
+                 granularity = label
+                 break
+
+         profile.inferred_granularity = granularity
+
+     # ------------------------------------------------------------------
+     # Step 5: Temporal signal audit
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _audit_temporal_signals(
+         clean: pl.Series,
+         profile: DatetimeStats,
+     ) -> None:
+         """
+         Check which temporal features vary across rows.
+
+         All checks are done via Polars expressions on the full clean series,
+         so no Python-level loops are required.
+         """
+         signals = TemporalSignals()
+
+         years = clean.dt.year()
+         months = clean.dt.month()
+         days = clean.dt.day()
+         dow = clean.dt.weekday()  # ISO numbering: 1=Monday … 7=Sunday
+         hours = clean.dt.hour()
+
+         signals.has_year = years.n_unique() > 1
+         signals.has_month = months.n_unique() > 1
+         signals.has_day = days.n_unique() > 1
+         signals.has_day_of_week = dow.n_unique() > 1
+         signals.has_hour = int(hours.max()) > 0  # type: ignore[arg-type]
+
+         # The weekend signal is only meaningful when day-of-week varies
+         if signals.has_day_of_week:
+             weekend_mask = dow >= 6  # Saturday=6, Sunday=7 under ISO numbering
+             signals.has_is_weekend = bool(weekend_mask.any())
+
+         # Month-end: the timestamp falls on the last day of its month.
+         # dt.month_end() shifts each timestamp to its month's last day,
+         # so comparing year/month/day components gives a date-level test.
+         try:
+             month_end_ts = clean.dt.month_end()
+             # Compare date components only, ignoring the time of day
+             is_month_end_mask = (
+                 (clean.dt.year() == month_end_ts.dt.year())
+                 & (clean.dt.month() == month_end_ts.dt.month())
+                 & (clean.dt.day() == month_end_ts.dt.day())
+             )
+             signals.has_is_month_end = bool(is_month_end_mask.any())
+         except Exception:
+             # dt.month_end() unavailable; leave the signal conservatively unset
+             signals.has_is_month_end = False
+
+         profile.signals = signals
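
Note on the band table above: `_infer_granularity` applies it with exclusive upper bounds (`median_gap_s < upper_bound`), which matters exactly at the boundaries. Perfectly hourly data has a median gap of exactly 3 600 s, so it falls through the minutely band and lands in the hourly one. The following is a minimal standalone sketch of that lookup on synthetic timestamps; the band values are copied from `_GRANULARITY_BANDS`, but the plain-string labels merely stand in for the package's `InferredGranularity` enum:

    from datetime import datetime, timedelta

    import polars as pl

    # Copied band bounds; string labels stand in for InferredGranularity
    BANDS = [
        (90.0, "secondly"),
        (3_600.0, "minutely"),
        (7_200.0, "hourly"),
        (172_800.0, "daily"),
        (1_209_600.0, "weekly"),
        (5_184_000.0, "monthly"),
    ]

    # 48 perfectly hourly timestamps
    ts = pl.Series([datetime(2024, 1, 1) + timedelta(hours=i) for i in range(48)])

    # Same arithmetic as _infer_granularity: sort, diff in epoch
    # microseconds, convert to seconds, take the median gap
    gaps_s = ts.sort().cast(pl.Int64).diff().drop_nulls().cast(pl.Float64) / 1_000_000.0
    median_gap_s = float(gaps_s.median())  # 3600.0

    label = next((name for bound, name in BANDS if median_gap_s < bound), "yearly")
    print(median_gap_s, label)  # prints: 3600.0 hourly (not minutely; bounds are exclusive)

The same exclusivity means a strict 2-hour cadence (a 7 200 s median gap) is labelled daily rather than hourly.
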
@@ -0,0 +1,137 @@
+ """
+ Result dataclasses for missingness profiling.
+
+ Populated by MissingnessProfiler, which is always run as part of
+ StructuralProfiler (a non-optional Phase 1 component).
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from enum import StrEnum
+ from typing import Optional
+
+
+ # ---------------------------------------------------------------------------
+ # Enums
+ # ---------------------------------------------------------------------------
+
+
+ class MissingSeverity(StrEnum):
+     Minor = "minor"        # < 1% missing
+     Moderate = "moderate"  # 1–5% missing
+     High = "high"          # 5–20% missing
+     Severe = "severe"      # > 20% missing
+
+
+ class MissingnessFlag(StrEnum):
+     FullyNull = "fully_null"          # missing ratio == 1.0 → must drop
+     MARSuspect = "mar_suspect"        # missingness correlated with ≥ 1 other column
+     DropCandidate = "drop_candidate"  # more than 50% of the column's rows missing
+
+
+ # ---------------------------------------------------------------------------
+ # Per-column result
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class ColumnMissingnessProfile:
+     """
+     Full missingness profile for a single column.
+
+     Attributes
+     ----------
+     column : str
+         Column name.
+     total_rows : int
+         Total rows in the DataFrame.
+     standard_null_count : int
+         Polars-level nulls (None / NaN for floats).
+     effective_null_count : int
+         Standard nulls + whitespace-only strings + sentinel strings
+         ("NA", "NAN", "NULL", "NONE", "?") — i.e. the count used for
+         imputation decisions.
+     standard_null_ratio : float
+         standard_null_count / total_rows.
+     effective_null_ratio : float
+         effective_null_count / total_rows.
+     severity : MissingSeverity
+         Derived from effective_null_ratio.
+     flags : list[MissingnessFlag]
+         Zero or more non-exclusive behavioural flags.
+     correlated_with : list[str]
+         Columns whose missingness indicator correlates > 0.6 with this
+         column's indicator (populated after the correlation-matrix pass).
+     """
+
+     column: str
+     total_rows: int
+
+     standard_null_count: int = 0
+     effective_null_count: int = 0
+     standard_null_ratio: float = 0.0
+     effective_null_ratio: float = 0.0
+
+     severity: Optional[MissingSeverity] = None
+
+     flags: list[MissingnessFlag] = field(default_factory=list)
+     correlated_with: list[str] = field(default_factory=list)
+
+     def has_flag(self, flag: MissingnessFlag) -> bool:
+         return flag in self.flags
+
+     def __str__(self) -> str:  # pragma: no cover
+         lines = [
+             f"  Column          : {self.column}",
+             f"  Standard nulls  : {self.standard_null_count:,}"
+             f" ({self.standard_null_ratio:.2%})",
+             f"  Effective nulls : {self.effective_null_count:,}"
+             f" ({self.effective_null_ratio:.2%})",
+             f"  Severity        : {self.severity or 'N/A'}",
+         ]
+         if self.correlated_with:
+             lines.append(f"  MAR correlates with: {', '.join(self.correlated_with)}")
+         if self.flags:
+             lines.append(f"  Flags           : {', '.join(self.flags)}")
+         return "\n".join(lines)
+
+
+ @dataclass
+ class MissingnessProfileResult:
+     """
+     Missingness profile for all analysed columns.
+
+     Attributes
+     ----------
+     columns : dict[str, ColumnMissingnessProfile]
+         Per-column profiles, keyed by column name.
+     analysed_columns : list[str]
+         Columns that were actually profiled.
+     fully_null_columns : list[str]
+         Columns where effective_null_ratio == 1.0. Must be dropped.
+     correlation_matrix : dict[str, dict[str, float]]
+         Pairwise Pearson correlations between binary missingness indicators.
+         Only populated when ≥ 2 columns have at least one missing value.
+         Stored as a nested dict: matrix[col_a][col_b] = correlation.
+     """
+
+     columns: dict[str, ColumnMissingnessProfile] = field(default_factory=dict)
+     analysed_columns: list[str] = field(default_factory=list)
+     fully_null_columns: list[str] = field(default_factory=list)
+     correlation_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
+
+     def __str__(self) -> str:  # pragma: no cover
+         lines = ["=== Missingness Profile ==="]
+         for profile in self.columns.values():
+             lines.append(str(profile))
+         if self.fully_null_columns:
+             lines.append(
+                 f"\n  Fully-null columns (must drop): "
+                 f"{', '.join(self.fully_null_columns)}"
+             )
+
+         return "\n".join(lines)
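
The severity bands documented on `MissingSeverity` imply a straightforward mapping from `effective_null_ratio`. The actual rule lives in `profiling/_missingness_profiler.py`, which is not shown in this excerpt, so the sketch below is only one plausible reading of the documented thresholds; the `derive_severity` helper is hypothetical, with boundary handling chosen to match the "< 1%" and "> 20%" endpoints:

    # Hypothetical helper; the real derivation lives in _missingness_profiler.py
    from profiling._missingness_config import (
        ColumnMissingnessProfile,
        MissingSeverity,
    )


    def derive_severity(effective_null_ratio: float) -> MissingSeverity:
        """Map an effective null ratio onto the documented severity bands."""
        if effective_null_ratio > 0.20:
            return MissingSeverity.Severe
        if effective_null_ratio > 0.05:
            return MissingSeverity.High
        if effective_null_ratio > 0.01:
            return MissingSeverity.Moderate
        return MissingSeverity.Minor


    profile = ColumnMissingnessProfile(column="age", total_rows=1_000)
    profile.standard_null_count = 60    # plain Polars nulls
    profile.effective_null_count = 73   # plus "NA" / "?" style sentinels
    profile.standard_null_ratio = 0.060
    profile.effective_null_ratio = 0.073
    profile.severity = derive_severity(profile.effective_null_ratio)

    print(profile)  # the Severity line reads "high" (the 5–20% band)

Because `MissingSeverity` is a `StrEnum`, the severity renders as its plain string value ("high") in the report output, which is what the `__str__` methods above rely on.
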