PyPI - dataforge-ml - Versions diffs - 2.0.6__tar.gz → 2.0.8__tar.gz - Mend

dataforge-ml 2.0.6 (tar.gz) → 2.0.8 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

{dataforge_ml-2.0.6/src/dataforge_ml.egg-info → dataforge_ml-2.0.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 2.0.6
+Version: 2.0.8
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "2.0.6"
+version = "2.0.8"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">3.10"

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_boolean_config.py RENAMED Viewed

@@ -7,9 +7,14 @@ Populated by BooleanProfiler.
 from __future__ import annotations
 from dataclasses import dataclass, field
+from enum import StrEnum
 from typing import Optional
+class BooleanFlag(StrEnum):
+    FormatMismatch = "format_mismatch"
 @dataclass
 class BooleanStats:
     """Value distribution statistics for a single Boolean column.
@@ -24,6 +29,23 @@ class BooleanStats:
     true_ratio: float = 0.0
     false_ratio: float = 0.0
     mode: Optional[bool] = None
+    flags: list[BooleanFlag] = field(default_factory=list)
+    def has_flag(self, flag: BooleanFlag) -> bool:
+        """Check whether a specific ``BooleanFlag`` is set on this column.
+        Parameters
+        ----------
+        flag : BooleanFlag
+            The flag to test.
+        Returns
+        -------
+        bool
+            ``True`` if ``flag`` is present in :attr:`flags`, ``False``
+            otherwise.
+        """
+        return flag in self.flags
     def to_dict(self) -> dict:
         """Serialise the boolean statistics to a plain dictionary.
@@ -31,7 +53,8 @@ class BooleanStats:
         Returns
         -------
         dict
-            All fields keyed by field name.
+            All fields keyed by field name.  ``flags`` are serialised as their
+            string values.
         """
         return {
             "true_count": self.true_count,
@@ -39,6 +62,7 @@ class BooleanStats:
             "true_ratio": self.true_ratio,
             "false_ratio": self.false_ratio,
             "mode": self.mode,
+            "flags": [str(f) for f in self.flags],
         }

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_boolean_profiler.py RENAMED Viewed

@@ -23,7 +23,7 @@ import polars as pl
 from ._base import ColumnBatchProfiler
 from ._config import BooleanStats
-from ._boolean_config import BooleanProfileResult
+from ._boolean_config import BooleanFlag, BooleanProfileResult
 from ..models._data_types import _INT_DTYPES
 # ---------------------------------------------------------------------------
@@ -114,6 +114,13 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
         bool_series = self._to_bool_series(series)
         non_null_count = bool_series.len()
+        # FormatMismatch: a value that is present (non-null after the
+        # orchestrator's Effective-Null normalization) but falls outside the
+        # recognized true/false vocabulary is dropped by coercion.  A shortfall
+        # in the non-null count means the column holds dirty, uncoercible data.
+        if non_null_count < series.drop_nulls().len():
+            profile.flags.append(BooleanFlag.FormatMismatch)
         if non_null_count == 0:
             if series.drop_nulls().len() > 0 and col_name in user_overrides:
                 from ._base import OverrideCoercionError

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_config.py RENAMED Viewed

@@ -658,15 +658,18 @@ class ProfileConfig:
     numeric_sentinels: InitVar[Optional[dict[str, list[float]]]] = None
     string_sentinels: InitVar[Optional[dict[str, list[str]]]] = None
     datetime_epoch_units: InitVar[Optional[dict[str, Union[str, EpochUnit]]]] = None
+    datetime_formats: InitVar[Optional[dict[str, str]]] = None
     _numeric_sentinels: dict[str, list[float]] = field(default_factory=dict, init=False)
     _string_sentinels: dict[str, list[str]] = field(default_factory=dict, init=False)
     _datetime_epoch_units: dict[str, EpochUnit] = field(default_factory=dict, init=False)
+    _datetime_formats: dict[str, str] = field(default_factory=dict, init=False)
     def __post_init__(
         self,
         numeric_sentinels: Optional[dict[str, list[float]]],
         string_sentinels: Optional[dict[str, list[str]]],
         datetime_epoch_units: Optional[dict[str, Union[str, EpochUnit]]] = None,
+        datetime_formats: Optional[dict[str, str]] = None,
     ) -> None:
         if numeric_sentinels is not None and not isinstance(numeric_sentinels, property):
             for k, vals in numeric_sentinels.items():
@@ -677,6 +680,9 @@ class ProfileConfig:
         if datetime_epoch_units is not None and not isinstance(datetime_epoch_units, property):
             for k, val in datetime_epoch_units.items():
                 self.set_datetime_epoch_unit(k, val)
+        if datetime_formats is not None and not isinstance(datetime_formats, property):
+            for k, fmt in datetime_formats.items():
+                self.set_datetime_format(k, fmt)
     @property
     def numeric_sentinels(self) -> MappingProxyType[str, list[float]]:
@@ -731,6 +737,27 @@ class ProfileConfig:
         """
         return MappingProxyType(self._datetime_epoch_units)
+    @property
+    def datetime_formats(self) -> MappingProxyType[str, str]:
+        """
+        Get the per-column declared datetime format strings.
+        Keys are column names; values are strftime-style format strings (e.g.
+        ``{"Year": "%Y"}``) applied by ``DatetimeProfiler`` with
+        ``strict=False`` when coercing that column to Datetime. A declaration
+        applies to any column profiled as Datetime, whether overridden or
+        auto-detected. Format strings are not validated against strftime
+        grammar at declaration time — a bad format surfaces at profiling time.
+        Defaults to an empty dict — columns with no declaration fall back to
+        Polars format inference.
+        Returns
+        -------
+        MappingProxyType[str, str]
+            Read-only mapping of column names to declared datetime formats.
+        """
+        return MappingProxyType(self._datetime_formats)
     def set_numeric_sentinel(self, column: str | list[str], values: list[float]) -> None:
         """
         Set numeric sentinel values for one or more columns.
@@ -809,6 +836,38 @@ class ProfileConfig:
         for c in columns:
             self._datetime_epoch_units[c] = enum_unit
+    def set_datetime_format(self, column: str | list[str], format: str) -> None:
+        """
+        Declare a datetime format string for one or more columns.
+        The format is applied by ``DatetimeProfiler`` with ``strict=False``
+        when coercing the column to Datetime, and is not validated against
+        strftime grammar or the data at declaration time — a bad format
+        surfaces at profiling time, consistent with ``set_column_type`` and
+        ``set_datetime_epoch_unit``.
+        Parameters
+        ----------
+        column : str or list of str
+            Column name or list of column names to apply the format to.
+        format : str
+            A non-empty strftime-style format string (e.g. ``"%Y"``).
+        Raises
+        ------
+        ValueError
+            If any column name is empty, or if `format` is not a non-empty
+            string.
+        """
+        if not isinstance(format, str) or not format:
+            raise ValueError("format must be a non-empty string.")
+        columns = [column] if isinstance(column, str) else column
+        for c in columns:
+            if not isinstance(c, str) or not c:
+                raise ValueError("column name must be a non-empty string.")
+            self._datetime_formats[c] = format
     def to_dict(self) -> dict:
         """
         Serialise the config to a plain dictionary.
@@ -837,6 +896,7 @@ class ProfileConfig:
             "numeric_sentinels": {k: list(v) for k, v in self.numeric_sentinels.items()},
             "string_sentinels": {k: list(v) for k, v in self.string_sentinels.items()},
             "datetime_epoch_units": {k: v.value for k, v in self.datetime_epoch_units.items()},
+            "datetime_formats": {k: v for k, v in self.datetime_formats.items()},
         }
     @classmethod
@@ -884,6 +944,7 @@ class ProfileConfig:
             numeric_sentinels=data.get("numeric_sentinels", {}),
             string_sentinels=data.get("string_sentinels", {}),
             datetime_epoch_units=data.get("datetime_epoch_units", {}),
+            datetime_formats=data.get("datetime_formats", {}),
         )
         return config

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_datetime_config.py RENAMED Viewed

@@ -28,70 +28,7 @@ class DatetimeFlag(StrEnum):
     HighGapVariance = "high_gap_variance"
     MnarSuspected = "mnar_suspected"
     RecentDateMissing = "recent_date_missing"
-@dataclass
-class TemporalSignals:
-    """Which time-component features are present in a Datetime column.
-    Each boolean field indicates that the corresponding granularity was
-    detected as non-constant, making it a candidate for feature extraction
-    in Phase 5 Encoding.
-    """
-    has_year: bool = False
-    has_month: bool = False
-    has_day: bool = False
-    has_day_of_week: bool = False
-    has_hour: bool = False
-    has_is_weekend: bool = False
-    has_is_month_end: bool = False
-    def extractable_features(self) -> list[str]:
-        """Return the names of all time-component features that can be extracted.
-        Returns
-        -------
-        list[str]
-            Feature names corresponding to every ``has_*`` field that is
-            ``True``.  An empty list means no temporal variation was detected.
-        """
-        features = []
-        if self.has_year:
-            features.append("year")
-        if self.has_month:
-            features.append("month")
-        if self.has_day:
-            features.append("day_of_month")
-        if self.has_day_of_week:
-            features.append("day_of_week")
-        if self.has_hour:
-            features.append("hour")
-        if self.has_is_weekend:
-            features.append("is_weekend")
-        if self.has_is_month_end:
-            features.append("is_month_end")
-        return features
-    def to_dict(self) -> dict:
-        """Serialise the temporal signals to a plain dictionary.
-        Returns
-        -------
-        dict
-            All ``has_*`` flags plus an ``extractable_features`` key
-            containing the result of :meth:`extractable_features`.
-        """
-        return {
-            "has_year": self.has_year,
-            "has_month": self.has_month,
-            "has_day": self.has_day,
-            "has_day_of_week": self.has_day_of_week,
-            "has_hour": self.has_hour,
-            "has_is_weekend": self.has_is_weekend,
-            "has_is_month_end": self.has_is_month_end,
-            "extractable_features": self.extractable_features(),
-        }
+    FormatMismatch = "format_mismatch"
 @dataclass
@@ -99,8 +36,7 @@ class DatetimeStats:
     """Statistical summary of a single Datetime column.
     Produced by ``DatetimeProfiler`` for each opted-in column.  Stores
-    range, gap regularity, inferred granularity, and ``TemporalSignals``
-    indicating which time components are available for feature extraction.
+    range, gap regularity, and inferred granularity.
     """
     min_date: Optional[str] = None
@@ -110,7 +46,6 @@ class DatetimeStats:
     inferred_granularity: Optional[InferredGranularity] = None
     median_gap_seconds: Optional[float] = None
     gap_cv: Optional[float] = None
-    signals: TemporalSignals = field(default_factory=TemporalSignals)
     flags: list[DatetimeFlag] = field(default_factory=list)
     def has_flag(self, flag: DatetimeFlag) -> bool:
@@ -136,8 +71,7 @@ class DatetimeStats:
         -------
         dict
             All fields keyed by field name.  ``inferred_granularity`` is
-            serialised as its string value; ``signals`` is expanded via
-            :meth:`TemporalSignals.to_dict`; ``flags`` are serialised as
+            serialised as its string value; ``flags`` are serialised as
             their string values.
         """
         return {
@@ -148,7 +82,6 @@ class DatetimeStats:
             "inferred_granularity": str(self.inferred_granularity) if self.inferred_granularity else None,
             "median_gap_seconds": self.median_gap_seconds,
             "gap_cv": self.gap_cv,
-            "signals": self.signals.to_dict(),
             "flags": [str(f) for f in self.flags],
         }

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_datetime_profiler.py RENAMED Viewed

@@ -7,9 +7,10 @@ Per-column metrics (opt-in via ProfileConfig.datetime_columns):
   3. Future dates       – count of values > now, with context note
   4. Granularity        – inferred periodicity from median consecutive gap;
                           high gap-CV flagged as irregular
-  5. Temporal signals   – audit which of {year, month, day, day-of-week,
-                          hour, is-weekend, is-month-end} vary in the data,
-                          to guide downstream feature engineering
+Temporal-component variance (whether year, month, day-of-week, etc. vary) is
+intentionally not profiled here; Phase 5 Encoding derives it on demand from
+the column.
 Granularity inference bands (median gap in seconds):
   < 90 s        → secondly
@@ -34,7 +35,6 @@ from ._datetime_config import (
     DatetimeStats,
     InferredGranularity,
     DatetimeFlag,
-    TemporalSignals,
 )
 # Granularity bands — upper bound (exclusive) in seconds for each label.
@@ -68,9 +68,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
         self,
         config: DatetimeProfileConfig | None = None,
         epoch_units: dict[str, str] | None = None,
+        formats: dict[str, str] | None = None,
     ) -> None:
         self._config = config if config is not None else DatetimeProfileConfig()
         self._epoch_units = epoch_units or {}
+        self._formats = formats or {}
     # ------------------------------------------------------------------
     # Public API
@@ -112,8 +114,14 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
     def _coerce_to_datetime(self, series: pl.Series, col_name: str) -> pl.Series | None:
         if series.dtype in (pl.Utf8, pl.String):
+            declared_format = self._formats.get(col_name)
             try:
-                coerced = series.str.to_datetime(strict=False)
+                if declared_format is not None:
+                    coerced = series.str.to_datetime(
+                        format=declared_format, strict=False
+                    )
+                else:
+                    coerced = series.str.to_datetime(strict=False)
                 return coerced if coerced.drop_nulls().len() > 0 else None
             except pl.exceptions.ComputeError:
                 return None
@@ -139,22 +147,35 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
         available = []
         coerced_cache = {}
+        format_mismatch: dict[str, bool] = {}
         for col_name in self._resolve_columns(df.columns, columns):
-            series = self._coerce_to_datetime(df[col_name], col_name)
+            original = df[col_name]
+            series = self._coerce_to_datetime(original, col_name)
             if series is not None:
                 available.append(col_name)
                 coerced_cache[col_name] = series
+                # FormatMismatch: a value that is present (non-null after the
+                # orchestrator's Effective-Null normalization) but fails
+                # coercion becomes null here.  Compare non-null counts before
+                # and after coercion; a shortfall means dirty, uncoercible data.
+                format_mismatch[col_name] = (
+                    series.drop_nulls().len() < original.drop_nulls().len()
+                )
             elif col_name in user_overrides:
-                if df[col_name].drop_nulls().len() > 0:
+                if original.drop_nulls().len() > 0:
                     from ._base import OverrideCoercionError
                     raise OverrideCoercionError(
-                        f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime."
+                        f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime. "
+                        f"If Polars cannot infer the format, declare one explicitly via "
+                        f"ProfileConfig.set_datetime_format({col_name!r}, <format>) (e.g. '%Y' for bare years)."
                     )
         result.analysed_columns = available
         for col_name in available:
             profile = self._profile_column(coerced_cache[col_name], df.height, now)
+            if format_mismatch.get(col_name):
+                profile.flags.append(DatetimeFlag.FormatMismatch)
             result.columns[col_name] = profile
         return result
@@ -205,9 +226,6 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
         # 6. Granularity
         self._infer_granularity(clean, profile)
-        # 7. Temporal signals
-        self._audit_temporal_signals(clean, profile)
         return profile
     # ------------------------------------------------------------------
@@ -356,51 +374,3 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
                 break
         profile.inferred_granularity = granularity
-    # ------------------------------------------------------------------
-    # Step 7: Temporal signal audit
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _audit_temporal_signals(
-        clean: pl.Series,
-        profile: DatetimeStats,
-    ) -> None:
-        """
-        Check which temporal features vary across rows.
-        All checks are done via Polars expressions on the full clean series,
-        so no Python-level loops are required.
-        """
-        signals = TemporalSignals()
-        years = clean.dt.year()
-        months = clean.dt.month()
-        days = clean.dt.day()
-        dow = clean.dt.weekday()  # 0=Monday … 6=Sunday
-        hours = clean.dt.hour()
-        signals.has_year = years.n_unique() > 1
-        signals.has_month = months.n_unique() > 1
-        signals.has_day = days.n_unique() > 1
-        signals.has_day_of_week = dow.n_unique() > 1
-        signals.has_hour = int(hours.max()) > 0  # type: ignore[arg-type]
-        # Weekend signal is only meaningful when day-of-week varies
-        if signals.has_day_of_week:
-            weekend_mask = dow >= 5  # Saturday=5, Sunday=6
-            signals.has_is_weekend = bool(weekend_mask.any())
-        # Month-end: day == last day of the respective month
-        try:
-            month_end_ts = clean.dt.month_end()
-            is_month_end_mask = (
-                (clean.dt.year() == month_end_ts.dt.year())
-                & (clean.dt.month() == month_end_ts.dt.month())
-                & (clean.dt.day() == month_end_ts.dt.day())
-            )
-            signals.has_is_month_end = bool(is_month_end_mask.any())
-        except Exception:
-            signals.has_is_month_end = False
-        profile.signals = signals

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_numeric_config.py RENAMED Viewed

@@ -170,6 +170,7 @@ class NumericFlag(StrEnum):
     NearConstant = "near_constant"
     Bimodal = "bimodal"
     HighOutlierDensity = "high_outlier_density"
+    FormatMismatch = "format_mismatch"
 @dataclass

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_numeric_profiler.py RENAMED Viewed

@@ -144,6 +144,13 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
             clean = f64.drop_nulls()
             profile = NumericStats()
+            # FormatMismatch: a value that is present (non-null after the
+            # orchestrator's Effective-Null normalization) but fails the
+            # Float64 cast becomes null here.  A shortfall in the non-null
+            # count means the column holds dirty, uncoercible data.
+            if clean.len() < series.drop_nulls().len():
+                profile.flags.append(NumericFlag.FormatMismatch)
             if clean.len() == 0:
                 if series.drop_nulls().len() > 0 and col in user_overrides:
                     from ._base import OverrideCoercionError

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/orchestrator.py RENAMED Viewed

@@ -224,13 +224,22 @@ class StructuralProfiler:
             type_to_cols.setdefault(sem_type, []).append(col_name)
         pc = self.config.profiling
+        profiling_frame = _resolve_effective_nulls(
+            data,
+            numeric_sentinels=dict(pc.numeric_sentinels),
+            string_sentinels=dict(pc.string_sentinels),
+        )
         for sem_type, cols in type_to_cols.items():
             if sem_type == SemanticType.Numeric:
                 profiler = NumericProfiler(config=pc.numeric)
             elif sem_type == SemanticType.Categorical:
                 profiler = CategoricalProfiler(config=pc.categorical)
             elif sem_type == SemanticType.Datetime:
-                profiler = DatetimeProfiler(config=pc.datetime_, epoch_units=pc.datetime_epoch_units)
+                profiler = DatetimeProfiler(
+                    config=pc.datetime_,
+                    epoch_units=pc.datetime_epoch_units,
+                    formats=pc.datetime_formats,
+                )
             else:
                 profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type)  # type: ignore[arg-type]
                 if profiler_cls is None:
@@ -241,7 +250,7 @@ class StructuralProfiler:
                     c for c in cols
                     if result.columns.get(c) and TypeFlag.UserOverride in result.columns[c].type_flags
                 }
-                batch = profiler.profile(data, columns=cols, user_overrides=user_overrides)
+                batch = profiler.profile(profiling_frame, columns=cols, user_overrides=user_overrides)
                 for col_name in batch.analysed_columns:
                     if col_name in result.columns:
                         result.columns[col_name].stats = batch.columns.get(col_name)

{dataforge_ml-2.0.6 → dataforge_ml-2.0.8/src/dataforge_ml.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 2.0.6
+Version: 2.0.8
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License