PyPI - dataforge-ml - Versions diffs - 2.0.5__tar.gz → 2.0.7__tar.gz - Mend

dataforge-ml 2.0.5 (tar.gz) → 2.0.7 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

{dataforge_ml-2.0.5/src/dataforge_ml.egg-info → dataforge_ml-2.0.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 2.0.5
+Version: 2.0.7
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "2.0.5"
+version = "2.0.7"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">3.10"

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_boolean_config.py RENAMED Viewed

@@ -7,9 +7,14 @@ Populated by BooleanProfiler.
 from __future__ import annotations
 from dataclasses import dataclass, field
+from enum import StrEnum
 from typing import Optional
+class BooleanFlag(StrEnum):
+    FormatMismatch = "format_mismatch"
 @dataclass
 class BooleanStats:
     """Value distribution statistics for a single Boolean column.
@@ -24,6 +29,23 @@ class BooleanStats:
     true_ratio: float = 0.0
     false_ratio: float = 0.0
     mode: Optional[bool] = None
+    flags: list[BooleanFlag] = field(default_factory=list)
+    def has_flag(self, flag: BooleanFlag) -> bool:
+        """Check whether a specific ``BooleanFlag`` is set on this column.
+        Parameters
+        ----------
+        flag : BooleanFlag
+            The flag to test.
+        Returns
+        -------
+        bool
+            ``True`` if ``flag`` is present in :attr:`flags`, ``False``
+            otherwise.
+        """
+        return flag in self.flags
     def to_dict(self) -> dict:
         """Serialise the boolean statistics to a plain dictionary.
@@ -31,7 +53,8 @@ class BooleanStats:
         Returns
         -------
         dict
-            All fields keyed by field name.
+            All fields keyed by field name.  ``flags`` are serialised as their
+            string values.
         """
         return {
             "true_count": self.true_count,
@@ -39,6 +62,7 @@ class BooleanStats:
             "true_ratio": self.true_ratio,
             "false_ratio": self.false_ratio,
             "mode": self.mode,
+            "flags": [str(f) for f in self.flags],
         }

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_boolean_profiler.py RENAMED Viewed

@@ -23,7 +23,7 @@ import polars as pl
 from ._base import ColumnBatchProfiler
 from ._config import BooleanStats
-from ._boolean_config import BooleanProfileResult
+from ._boolean_config import BooleanFlag, BooleanProfileResult
 from ..models._data_types import _INT_DTYPES
 # ---------------------------------------------------------------------------
@@ -114,6 +114,13 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
         bool_series = self._to_bool_series(series)
         non_null_count = bool_series.len()
+        # FormatMismatch: a value that is present (non-null after the
+        # orchestrator's Effective-Null normalization) but falls outside the
+        # recognized true/false vocabulary is dropped by coercion.  A shortfall
+        # in the non-null count means the column holds dirty, uncoercible data.
+        if non_null_count < series.drop_nulls().len():
+            profile.flags.append(BooleanFlag.FormatMismatch)
         if non_null_count == 0:
             if series.drop_nulls().len() > 0 and col_name in user_overrides:
                 from ._base import OverrideCoercionError

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_config.py RENAMED Viewed

@@ -658,15 +658,18 @@ class ProfileConfig:
     numeric_sentinels: InitVar[Optional[dict[str, list[float]]]] = None
     string_sentinels: InitVar[Optional[dict[str, list[str]]]] = None
     datetime_epoch_units: InitVar[Optional[dict[str, Union[str, EpochUnit]]]] = None
+    datetime_formats: InitVar[Optional[dict[str, str]]] = None
     _numeric_sentinels: dict[str, list[float]] = field(default_factory=dict, init=False)
     _string_sentinels: dict[str, list[str]] = field(default_factory=dict, init=False)
     _datetime_epoch_units: dict[str, EpochUnit] = field(default_factory=dict, init=False)
+    _datetime_formats: dict[str, str] = field(default_factory=dict, init=False)
     def __post_init__(
         self,
         numeric_sentinels: Optional[dict[str, list[float]]],
         string_sentinels: Optional[dict[str, list[str]]],
         datetime_epoch_units: Optional[dict[str, Union[str, EpochUnit]]] = None,
+        datetime_formats: Optional[dict[str, str]] = None,
     ) -> None:
         if numeric_sentinels is not None and not isinstance(numeric_sentinels, property):
             for k, vals in numeric_sentinels.items():
@@ -677,6 +680,9 @@ class ProfileConfig:
         if datetime_epoch_units is not None and not isinstance(datetime_epoch_units, property):
             for k, val in datetime_epoch_units.items():
                 self.set_datetime_epoch_unit(k, val)
+        if datetime_formats is not None and not isinstance(datetime_formats, property):
+            for k, fmt in datetime_formats.items():
+                self.set_datetime_format(k, fmt)
     @property
     def numeric_sentinels(self) -> MappingProxyType[str, list[float]]:
@@ -731,6 +737,27 @@ class ProfileConfig:
         """
         return MappingProxyType(self._datetime_epoch_units)
+    @property
+    def datetime_formats(self) -> MappingProxyType[str, str]:
+        """
+        Get the per-column declared datetime format strings.
+        Keys are column names; values are strftime-style format strings (e.g.
+        ``{"Year": "%Y"}``) applied by ``DatetimeProfiler`` with
+        ``strict=False`` when coercing that column to Datetime. A declaration
+        applies to any column profiled as Datetime, whether overridden or
+        auto-detected. Format strings are not validated against strftime
+        grammar at declaration time — a bad format surfaces at profiling time.
+        Defaults to an empty dict — columns with no declaration fall back to
+        Polars format inference.
+        Returns
+        -------
+        MappingProxyType[str, str]
+            Read-only mapping of column names to declared datetime formats.
+        """
+        return MappingProxyType(self._datetime_formats)
     def set_numeric_sentinel(self, column: str | list[str], values: list[float]) -> None:
         """
         Set numeric sentinel values for one or more columns.
@@ -809,6 +836,38 @@ class ProfileConfig:
         for c in columns:
             self._datetime_epoch_units[c] = enum_unit
+    def set_datetime_format(self, column: str | list[str], format: str) -> None:
+        """
+        Declare a datetime format string for one or more columns.
+        The format is applied by ``DatetimeProfiler`` with ``strict=False``
+        when coercing the column to Datetime, and is not validated against
+        strftime grammar or the data at declaration time — a bad format
+        surfaces at profiling time, consistent with ``set_column_type`` and
+        ``set_datetime_epoch_unit``.
+        Parameters
+        ----------
+        column : str or list of str
+            Column name or list of column names to apply the format to.
+        format : str
+            A non-empty strftime-style format string (e.g. ``"%Y"``).
+        Raises
+        ------
+        ValueError
+            If any column name is empty, or if `format` is not a non-empty
+            string.
+        """
+        if not isinstance(format, str) or not format:
+            raise ValueError("format must be a non-empty string.")
+        columns = [column] if isinstance(column, str) else column
+        for c in columns:
+            if not isinstance(c, str) or not c:
+                raise ValueError("column name must be a non-empty string.")
+            self._datetime_formats[c] = format
     def to_dict(self) -> dict:
         """
         Serialise the config to a plain dictionary.
@@ -837,6 +896,7 @@ class ProfileConfig:
             "numeric_sentinels": {k: list(v) for k, v in self.numeric_sentinels.items()},
             "string_sentinels": {k: list(v) for k, v in self.string_sentinels.items()},
             "datetime_epoch_units": {k: v.value for k, v in self.datetime_epoch_units.items()},
+            "datetime_formats": {k: v for k, v in self.datetime_formats.items()},
         }
     @classmethod
@@ -884,6 +944,7 @@ class ProfileConfig:
             numeric_sentinels=data.get("numeric_sentinels", {}),
             string_sentinels=data.get("string_sentinels", {}),
             datetime_epoch_units=data.get("datetime_epoch_units", {}),
+            datetime_formats=data.get("datetime_formats", {}),
         )
         return config

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_datetime_config.py RENAMED Viewed

@@ -28,6 +28,7 @@ class DatetimeFlag(StrEnum):
     HighGapVariance = "high_gap_variance"
     MnarSuspected = "mnar_suspected"
     RecentDateMissing = "recent_date_missing"
+    FormatMismatch = "format_mismatch"
 @dataclass

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_datetime_profiler.py RENAMED Viewed

@@ -68,9 +68,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
         self,
         config: DatetimeProfileConfig | None = None,
         epoch_units: dict[str, str] | None = None,
+        formats: dict[str, str] | None = None,
     ) -> None:
         self._config = config if config is not None else DatetimeProfileConfig()
         self._epoch_units = epoch_units or {}
+        self._formats = formats or {}
     # ------------------------------------------------------------------
     # Public API
@@ -112,8 +114,14 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
     def _coerce_to_datetime(self, series: pl.Series, col_name: str) -> pl.Series | None:
         if series.dtype in (pl.Utf8, pl.String):
+            declared_format = self._formats.get(col_name)
             try:
-                coerced = series.str.to_datetime(strict=False)
+                if declared_format is not None:
+                    coerced = series.str.to_datetime(
+                        format=declared_format, strict=False
+                    )
+                else:
+                    coerced = series.str.to_datetime(strict=False)
                 return coerced if coerced.drop_nulls().len() > 0 else None
             except pl.exceptions.ComputeError:
                 return None
@@ -139,22 +147,35 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
         available = []
         coerced_cache = {}
+        format_mismatch: dict[str, bool] = {}
         for col_name in self._resolve_columns(df.columns, columns):
-            series = self._coerce_to_datetime(df[col_name], col_name)
+            original = df[col_name]
+            series = self._coerce_to_datetime(original, col_name)
             if series is not None:
                 available.append(col_name)
                 coerced_cache[col_name] = series
+                # FormatMismatch: a value that is present (non-null after the
+                # orchestrator's Effective-Null normalization) but fails
+                # coercion becomes null here.  Compare non-null counts before
+                # and after coercion; a shortfall means dirty, uncoercible data.
+                format_mismatch[col_name] = (
+                    series.drop_nulls().len() < original.drop_nulls().len()
+                )
             elif col_name in user_overrides:
-                if df[col_name].drop_nulls().len() > 0:
+                if original.drop_nulls().len() > 0:
                     from ._base import OverrideCoercionError
                     raise OverrideCoercionError(
-                        f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime."
+                        f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime. "
+                        f"If Polars cannot infer the format, declare one explicitly via "
+                        f"ProfileConfig.set_datetime_format({col_name!r}, <format>) (e.g. '%Y' for bare years)."
                     )
         result.analysed_columns = available
         for col_name in available:
             profile = self._profile_column(coerced_cache[col_name], df.height, now)
+            if format_mismatch.get(col_name):
+                profile.flags.append(DatetimeFlag.FormatMismatch)
             result.columns[col_name] = profile
         return result

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_numeric_config.py RENAMED Viewed

@@ -170,6 +170,7 @@ class NumericFlag(StrEnum):
     NearConstant = "near_constant"
     Bimodal = "bimodal"
     HighOutlierDensity = "high_outlier_density"
+    FormatMismatch = "format_mismatch"
 @dataclass

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_numeric_profiler.py RENAMED Viewed

@@ -144,6 +144,13 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
             clean = f64.drop_nulls()
             profile = NumericStats()
+            # FormatMismatch: a value that is present (non-null after the
+            # orchestrator's Effective-Null normalization) but fails the
+            # Float64 cast becomes null here.  A shortfall in the non-null
+            # count means the column holds dirty, uncoercible data.
+            if clean.len() < series.drop_nulls().len():
+                profile.flags.append(NumericFlag.FormatMismatch)
             if clean.len() == 0:
                 if series.drop_nulls().len() > 0 and col in user_overrides:
                     from ._base import OverrideCoercionError

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/orchestrator.py RENAMED Viewed

@@ -28,7 +28,7 @@ from typing import Any
 import numpy as np
 import polars as pl
-from ._base import ModalityProfiler, ColumnBatchProfiler
+from ._base import ModalityProfiler, ColumnBatchProfiler, OverrideCoercionError
 from ._tabular import TabularProfiler
 from ._categorical import CategoricalProfiler
 from ._datetime_profiler import DatetimeProfiler
@@ -114,6 +114,10 @@ class StructuralProfiler:
         ------
         TypeError
             When ``data`` is not a ``polars.DataFrame``.
+        OverrideCoercionError
+            When a column carrying ``TypeFlag.UserOverride`` completely fails
+            coercion to its overridden ``SemanticType`` (zero usable values
+            remain despite the original column having non-null data).
         """
         if not isinstance(data, pl.DataFrame):
             raise TypeError(
@@ -220,13 +224,22 @@ class StructuralProfiler:
             type_to_cols.setdefault(sem_type, []).append(col_name)
         pc = self.config.profiling
+        profiling_frame = _resolve_effective_nulls(
+            data,
+            numeric_sentinels=dict(pc.numeric_sentinels),
+            string_sentinels=dict(pc.string_sentinels),
+        )
         for sem_type, cols in type_to_cols.items():
             if sem_type == SemanticType.Numeric:
                 profiler = NumericProfiler(config=pc.numeric)
             elif sem_type == SemanticType.Categorical:
                 profiler = CategoricalProfiler(config=pc.categorical)
             elif sem_type == SemanticType.Datetime:
-                profiler = DatetimeProfiler(config=pc.datetime_, epoch_units=pc.datetime_epoch_units)
+                profiler = DatetimeProfiler(
+                    config=pc.datetime_,
+                    epoch_units=pc.datetime_epoch_units,
+                    formats=pc.datetime_formats,
+                )
             else:
                 profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type)  # type: ignore[arg-type]
                 if profiler_cls is None:
@@ -237,10 +250,12 @@ class StructuralProfiler:
                     c for c in cols
                     if result.columns.get(c) and TypeFlag.UserOverride in result.columns[c].type_flags
                 }
-                batch = profiler.profile(data, columns=cols, user_overrides=user_overrides)
+                batch = profiler.profile(profiling_frame, columns=cols, user_overrides=user_overrides)
                 for col_name in batch.analysed_columns:
                     if col_name in result.columns:
                         result.columns[col_name].stats = batch.columns.get(col_name)
+            except OverrideCoercionError:
+                raise
             except Exception:
                 pass

{dataforge_ml-2.0.5 → dataforge_ml-2.0.7/src/dataforge_ml.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 2.0.5
+Version: 2.0.7
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License