dataforge-ml 2.0.6__tar.gz → 2.0.8__tar.gz

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (55)
  1. {dataforge_ml-2.0.6/src/dataforge_ml.egg-info → dataforge_ml-2.0.8}/PKG-INFO +1 -1
  2. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/pyproject.toml +1 -1
  3. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_boolean_config.py +25 -1
  4. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_boolean_profiler.py +8 -1
  5. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_config.py +61 -0
  6. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_datetime_config.py +3 -70
  7. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_datetime_profiler.py +29 -59
  8. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_numeric_config.py +1 -0
  9. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_numeric_profiler.py +7 -0
  10. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/orchestrator.py +11 -2
  11. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
  12. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/LICENSE +0 -0
  13. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/README.md +0 -0
  14. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/setup.cfg +0 -0
  15. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/__init__.py +0 -0
  16. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/config.py +0 -0
  17. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/__init__.py +0 -0
  18. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_config.py +0 -0
  19. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
  20. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
  21. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
  22. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
  23. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_utils.py +0 -0
  24. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/orchestrator.py +0 -0
  25. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/models/__init__.py +0 -0
  26. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/models/_data_structure.py +0 -0
  27. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/models/_data_types.py +0 -0
  28. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/__init__.py +0 -0
  29. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_base.py +0 -0
  30. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_categorical.py +0 -0
  31. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  32. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  33. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  34. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  35. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  36. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
  37. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_tabular.py +0 -0
  38. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_target_config.py +0 -0
  39. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  40. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_text_config.py +0 -0
  41. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  42. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
  43. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  44. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/__init__.py +0 -0
  45. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_config.py +0 -0
  46. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
  47. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_splitter.py +0 -0
  48. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/__init__.py +0 -0
  49. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/_null_detection.py +0 -0
  50. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/_null_normalization.py +0 -0
  51. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/data_loader.py +0 -0
  52. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  53. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  54. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/requires.txt +0 -0
  55. {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.6
3
+ Version: 2.0.8
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "2.0.6"
7
+ version = "2.0.8"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">3.10"
@@ -7,9 +7,14 @@ Populated by BooleanProfiler.
7
7
  from __future__ import annotations
8
8
 
9
9
  from dataclasses import dataclass, field
10
+ from enum import StrEnum
10
11
  from typing import Optional
11
12
 
12
13
 
14
+ class BooleanFlag(StrEnum):
15
+ FormatMismatch = "format_mismatch"
16
+
17
+
13
18
  @dataclass
14
19
  class BooleanStats:
15
20
  """Value distribution statistics for a single Boolean column.
@@ -24,6 +29,23 @@ class BooleanStats:
24
29
  true_ratio: float = 0.0
25
30
  false_ratio: float = 0.0
26
31
  mode: Optional[bool] = None
32
+ flags: list[BooleanFlag] = field(default_factory=list)
33
+
34
+ def has_flag(self, flag: BooleanFlag) -> bool:
35
+ """Check whether a specific ``BooleanFlag`` is set on this column.
36
+
37
+ Parameters
38
+ ----------
39
+ flag : BooleanFlag
40
+ The flag to test.
41
+
42
+ Returns
43
+ -------
44
+ bool
45
+ ``True`` if ``flag`` is present in :attr:`flags`, ``False``
46
+ otherwise.
47
+ """
48
+ return flag in self.flags
27
49
 
28
50
  def to_dict(self) -> dict:
29
51
  """Serialise the boolean statistics to a plain dictionary.
@@ -31,7 +53,8 @@ class BooleanStats:
31
53
  Returns
32
54
  -------
33
55
  dict
34
- All fields keyed by field name.
56
+ All fields keyed by field name. ``flags`` are serialised as their
57
+ string values.
35
58
  """
36
59
  return {
37
60
  "true_count": self.true_count,
@@ -39,6 +62,7 @@ class BooleanStats:
39
62
  "true_ratio": self.true_ratio,
40
63
  "false_ratio": self.false_ratio,
41
64
  "mode": self.mode,
65
+ "flags": [str(f) for f in self.flags],
42
66
  }
43
67
 
44
68
 
@@ -23,7 +23,7 @@ import polars as pl
23
23
 
24
24
  from ._base import ColumnBatchProfiler
25
25
  from ._config import BooleanStats
26
- from ._boolean_config import BooleanProfileResult
26
+ from ._boolean_config import BooleanFlag, BooleanProfileResult
27
27
  from ..models._data_types import _INT_DTYPES
28
28
 
29
29
  # ---------------------------------------------------------------------------
@@ -114,6 +114,13 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
114
114
  bool_series = self._to_bool_series(series)
115
115
  non_null_count = bool_series.len()
116
116
 
117
+ # FormatMismatch: a value that is present (non-null after the
118
+ # orchestrator's Effective-Null normalization) but falls outside the
119
+ # recognized true/false vocabulary is dropped by coercion. A shortfall
120
+ # in the non-null count means the column holds dirty, uncoercible data.
121
+ if non_null_count < series.drop_nulls().len():
122
+ profile.flags.append(BooleanFlag.FormatMismatch)
123
+
117
124
  if non_null_count == 0:
118
125
  if series.drop_nulls().len() > 0 and col_name in user_overrides:
119
126
  from ._base import OverrideCoercionError
@@ -658,15 +658,18 @@ class ProfileConfig:
658
658
  numeric_sentinels: InitVar[Optional[dict[str, list[float]]]] = None
659
659
  string_sentinels: InitVar[Optional[dict[str, list[str]]]] = None
660
660
  datetime_epoch_units: InitVar[Optional[dict[str, Union[str, EpochUnit]]]] = None
661
+ datetime_formats: InitVar[Optional[dict[str, str]]] = None
661
662
  _numeric_sentinels: dict[str, list[float]] = field(default_factory=dict, init=False)
662
663
  _string_sentinels: dict[str, list[str]] = field(default_factory=dict, init=False)
663
664
  _datetime_epoch_units: dict[str, EpochUnit] = field(default_factory=dict, init=False)
665
+ _datetime_formats: dict[str, str] = field(default_factory=dict, init=False)
664
666
 
665
667
  def __post_init__(
666
668
  self,
667
669
  numeric_sentinels: Optional[dict[str, list[float]]],
668
670
  string_sentinels: Optional[dict[str, list[str]]],
669
671
  datetime_epoch_units: Optional[dict[str, Union[str, EpochUnit]]] = None,
672
+ datetime_formats: Optional[dict[str, str]] = None,
670
673
  ) -> None:
671
674
  if numeric_sentinels is not None and not isinstance(numeric_sentinels, property):
672
675
  for k, vals in numeric_sentinels.items():
@@ -677,6 +680,9 @@ class ProfileConfig:
677
680
  if datetime_epoch_units is not None and not isinstance(datetime_epoch_units, property):
678
681
  for k, val in datetime_epoch_units.items():
679
682
  self.set_datetime_epoch_unit(k, val)
683
+ if datetime_formats is not None and not isinstance(datetime_formats, property):
684
+ for k, fmt in datetime_formats.items():
685
+ self.set_datetime_format(k, fmt)
680
686
 
681
687
  @property
682
688
  def numeric_sentinels(self) -> MappingProxyType[str, list[float]]:
@@ -731,6 +737,27 @@ class ProfileConfig:
731
737
  """
732
738
  return MappingProxyType(self._datetime_epoch_units)
733
739
 
740
+ @property
741
+ def datetime_formats(self) -> MappingProxyType[str, str]:
742
+ """
743
+ Get the per-column declared datetime format strings.
744
+
745
+ Keys are column names; values are strftime-style format strings (e.g.
746
+ ``{"Year": "%Y"}``) applied by ``DatetimeProfiler`` with
747
+ ``strict=False`` when coercing that column to Datetime. A declaration
748
+ applies to any string column profiled as Datetime, whether overridden or
749
+ auto-detected. Format strings are not validated against strftime
750
+ grammar at declaration time — a bad format surfaces at profiling time.
751
+ Defaults to an empty dict — columns with no declaration fall back to
752
+ Polars format inference.
753
+
754
+ Returns
755
+ -------
756
+ MappingProxyType[str, str]
757
+ Read-only mapping of column names to declared datetime formats.
758
+ """
759
+ return MappingProxyType(self._datetime_formats)
760
+
734
761
  def set_numeric_sentinel(self, column: str | list[str], values: list[float]) -> None:
735
762
  """
736
763
  Set numeric sentinel values for one or more columns.
@@ -809,6 +836,38 @@ class ProfileConfig:
809
836
  for c in columns:
810
837
  self._datetime_epoch_units[c] = enum_unit
811
838
 
839
+ def set_datetime_format(self, column: str | list[str], format: str) -> None:
840
+ """
841
+ Declare a datetime format string for one or more columns.
842
+
843
+ The format is applied by ``DatetimeProfiler`` with ``strict=False``
844
+ when coercing the column to Datetime, and is not validated against
845
+ strftime grammar or the data at declaration time — a bad format
846
+ surfaces at profiling time, consistent with ``set_column_type`` and
847
+ ``set_datetime_epoch_unit``.
848
+
849
+ Parameters
850
+ ----------
851
+ column : str or list of str
852
+ Column name or list of column names to apply the format to.
853
+ format : str
854
+ A non-empty strftime-style format string (e.g. ``"%Y"``).
855
+
856
+ Raises
857
+ ------
858
+ ValueError
859
+ If any column name or `format` is not a non-empty
860
+ string.
861
+ """
862
+ if not isinstance(format, str) or not format:
863
+ raise ValueError("format must be a non-empty string.")
864
+
865
+ columns = [column] if isinstance(column, str) else column
866
+ for c in columns:
867
+ if not isinstance(c, str) or not c:
868
+ raise ValueError("column name must be a non-empty string.")
869
+ self._datetime_formats[c] = format
870
+
812
871
  def to_dict(self) -> dict:
813
872
  """
814
873
  Serialise the config to a plain dictionary.
@@ -837,6 +896,7 @@ class ProfileConfig:
837
896
  "numeric_sentinels": {k: list(v) for k, v in self.numeric_sentinels.items()},
838
897
  "string_sentinels": {k: list(v) for k, v in self.string_sentinels.items()},
839
898
  "datetime_epoch_units": {k: v.value for k, v in self.datetime_epoch_units.items()},
899
+ "datetime_formats": {k: v for k, v in self.datetime_formats.items()},
840
900
  }
841
901
 
842
902
  @classmethod
@@ -884,6 +944,7 @@ class ProfileConfig:
884
944
  numeric_sentinels=data.get("numeric_sentinels", {}),
885
945
  string_sentinels=data.get("string_sentinels", {}),
886
946
  datetime_epoch_units=data.get("datetime_epoch_units", {}),
947
+ datetime_formats=data.get("datetime_formats", {}),
887
948
  )
888
949
 
889
950
  return config
@@ -28,70 +28,7 @@ class DatetimeFlag(StrEnum):
28
28
  HighGapVariance = "high_gap_variance"
29
29
  MnarSuspected = "mnar_suspected"
30
30
  RecentDateMissing = "recent_date_missing"
31
-
32
-
33
- @dataclass
34
- class TemporalSignals:
35
- """Which time-component features are present in a Datetime column.
36
-
37
- Each boolean field indicates that the corresponding granularity was
38
- detected as non-constant, making it a candidate for feature extraction
39
- in Phase 5 Encoding.
40
- """
41
-
42
- has_year: bool = False
43
- has_month: bool = False
44
- has_day: bool = False
45
- has_day_of_week: bool = False
46
- has_hour: bool = False
47
- has_is_weekend: bool = False
48
- has_is_month_end: bool = False
49
-
50
- def extractable_features(self) -> list[str]:
51
- """Return the names of all time-component features that can be extracted.
52
-
53
- Returns
54
- -------
55
- list[str]
56
- Feature names corresponding to every ``has_*`` field that is
57
- ``True``. An empty list means no temporal variation was detected.
58
- """
59
- features = []
60
- if self.has_year:
61
- features.append("year")
62
- if self.has_month:
63
- features.append("month")
64
- if self.has_day:
65
- features.append("day_of_month")
66
- if self.has_day_of_week:
67
- features.append("day_of_week")
68
- if self.has_hour:
69
- features.append("hour")
70
- if self.has_is_weekend:
71
- features.append("is_weekend")
72
- if self.has_is_month_end:
73
- features.append("is_month_end")
74
- return features
75
-
76
- def to_dict(self) -> dict:
77
- """Serialise the temporal signals to a plain dictionary.
78
-
79
- Returns
80
- -------
81
- dict
82
- All ``has_*`` flags plus an ``extractable_features`` key
83
- containing the result of :meth:`extractable_features`.
84
- """
85
- return {
86
- "has_year": self.has_year,
87
- "has_month": self.has_month,
88
- "has_day": self.has_day,
89
- "has_day_of_week": self.has_day_of_week,
90
- "has_hour": self.has_hour,
91
- "has_is_weekend": self.has_is_weekend,
92
- "has_is_month_end": self.has_is_month_end,
93
- "extractable_features": self.extractable_features(),
94
- }
31
+ FormatMismatch = "format_mismatch"
95
32
 
96
33
 
97
34
  @dataclass
@@ -99,8 +36,7 @@ class DatetimeStats:
99
36
  """Statistical summary of a single Datetime column.
100
37
 
101
38
  Produced by ``DatetimeProfiler`` for each opted-in column. Stores
102
- range, gap regularity, inferred granularity, and ``TemporalSignals``
103
- indicating which time components are available for feature extraction.
39
+ range, gap regularity, and inferred granularity.
104
40
  """
105
41
 
106
42
  min_date: Optional[str] = None
@@ -110,7 +46,6 @@ class DatetimeStats:
110
46
  inferred_granularity: Optional[InferredGranularity] = None
111
47
  median_gap_seconds: Optional[float] = None
112
48
  gap_cv: Optional[float] = None
113
- signals: TemporalSignals = field(default_factory=TemporalSignals)
114
49
  flags: list[DatetimeFlag] = field(default_factory=list)
115
50
 
116
51
  def has_flag(self, flag: DatetimeFlag) -> bool:
@@ -136,8 +71,7 @@ class DatetimeStats:
136
71
  -------
137
72
  dict
138
73
  All fields keyed by field name. ``inferred_granularity`` is
139
- serialised as its string value; ``signals`` is expanded via
140
- :meth:`TemporalSignals.to_dict`; ``flags`` are serialised as
74
+ serialised as its string value; ``flags`` are serialised as
141
75
  their string values.
142
76
  """
143
77
  return {
@@ -148,7 +82,6 @@ class DatetimeStats:
148
82
  "inferred_granularity": str(self.inferred_granularity) if self.inferred_granularity else None,
149
83
  "median_gap_seconds": self.median_gap_seconds,
150
84
  "gap_cv": self.gap_cv,
151
- "signals": self.signals.to_dict(),
152
85
  "flags": [str(f) for f in self.flags],
153
86
  }
154
87
 
@@ -7,9 +7,10 @@ Per-column metrics (opt-in via ProfileConfig.datetime_columns):
7
7
  3. Future dates – count of values > now, with context note
8
8
  4. Granularity – inferred periodicity from median consecutive gap;
9
9
  high gap-CV flagged as irregular
10
- 5. Temporal signals – audit which of {year, month, day, day-of-week,
11
- hour, is-weekend, is-month-end} vary in the data,
12
- to guide downstream feature engineering
10
+
11
+ Temporal-component variance (whether year, month, day-of-week, etc. vary) is
12
+ intentionally not profiled here; Phase 5 Encoding derives it on demand from
13
+ the column.
13
14
 
14
15
  Granularity inference bands (median gap in seconds):
15
16
  < 90 s → secondly
@@ -34,7 +35,6 @@ from ._datetime_config import (
34
35
  DatetimeStats,
35
36
  InferredGranularity,
36
37
  DatetimeFlag,
37
- TemporalSignals,
38
38
  )
39
39
 
40
40
  # Granularity bands — upper bound (exclusive) in seconds for each label.
@@ -68,9 +68,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
68
68
  self,
69
69
  config: DatetimeProfileConfig | None = None,
70
70
  epoch_units: dict[str, str] | None = None,
71
+ formats: dict[str, str] | None = None,
71
72
  ) -> None:
72
73
  self._config = config if config is not None else DatetimeProfileConfig()
73
74
  self._epoch_units = epoch_units or {}
75
+ self._formats = formats or {}
74
76
 
75
77
  # ------------------------------------------------------------------
76
78
  # Public API
@@ -112,8 +114,14 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
112
114
 
113
115
  def _coerce_to_datetime(self, series: pl.Series, col_name: str) -> pl.Series | None:
114
116
  if series.dtype in (pl.Utf8, pl.String):
117
+ declared_format = self._formats.get(col_name)
115
118
  try:
116
- coerced = series.str.to_datetime(strict=False)
119
+ if declared_format is not None:
120
+ coerced = series.str.to_datetime(
121
+ format=declared_format, strict=False
122
+ )
123
+ else:
124
+ coerced = series.str.to_datetime(strict=False)
117
125
  return coerced if coerced.drop_nulls().len() > 0 else None
118
126
  except pl.exceptions.ComputeError:
119
127
  return None
@@ -139,22 +147,35 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
139
147
 
140
148
  available = []
141
149
  coerced_cache = {}
150
+ format_mismatch: dict[str, bool] = {}
142
151
  for col_name in self._resolve_columns(df.columns, columns):
143
- series = self._coerce_to_datetime(df[col_name], col_name)
152
+ original = df[col_name]
153
+ series = self._coerce_to_datetime(original, col_name)
144
154
  if series is not None:
145
155
  available.append(col_name)
146
156
  coerced_cache[col_name] = series
157
+ # FormatMismatch: a value that is present (non-null after the
158
+ # orchestrator's Effective-Null normalization) but fails
159
+ # coercion becomes null here. Compare non-null counts before
160
+ # and after coercion; a shortfall means dirty, uncoercible data.
161
+ format_mismatch[col_name] = (
162
+ series.drop_nulls().len() < original.drop_nulls().len()
163
+ )
147
164
  elif col_name in user_overrides:
148
- if df[col_name].drop_nulls().len() > 0:
165
+ if original.drop_nulls().len() > 0:
149
166
  from ._base import OverrideCoercionError
150
167
  raise OverrideCoercionError(
151
- f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime."
168
+ f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime. "
169
+ f"If Polars cannot infer the format, declare one explicitly via "
170
+ f"ProfileConfig.set_datetime_format({col_name!r}, <format>) (e.g. '%Y' for bare years)."
152
171
  )
153
172
 
154
173
  result.analysed_columns = available
155
174
 
156
175
  for col_name in available:
157
176
  profile = self._profile_column(coerced_cache[col_name], df.height, now)
177
+ if format_mismatch.get(col_name):
178
+ profile.flags.append(DatetimeFlag.FormatMismatch)
158
179
  result.columns[col_name] = profile
159
180
 
160
181
  return result
@@ -205,9 +226,6 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
205
226
  # 6. Granularity
206
227
  self._infer_granularity(clean, profile)
207
228
 
208
- # 7. Temporal signals
209
- self._audit_temporal_signals(clean, profile)
210
-
211
229
  return profile
212
230
 
213
231
  # ------------------------------------------------------------------
@@ -356,51 +374,3 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
356
374
  break
357
375
 
358
376
  profile.inferred_granularity = granularity
359
-
360
- # ------------------------------------------------------------------
361
- # Step 7: Temporal signal audit
362
- # ------------------------------------------------------------------
363
-
364
- @staticmethod
365
- def _audit_temporal_signals(
366
- clean: pl.Series,
367
- profile: DatetimeStats,
368
- ) -> None:
369
- """
370
- Check which temporal features vary across rows.
371
-
372
- All checks are done via Polars expressions on the full clean series,
373
- so no Python-level loops are required.
374
- """
375
- signals = TemporalSignals()
376
-
377
- years = clean.dt.year()
378
- months = clean.dt.month()
379
- days = clean.dt.day()
380
- dow = clean.dt.weekday() # 0=Monday … 6=Sunday
381
- hours = clean.dt.hour()
382
-
383
- signals.has_year = years.n_unique() > 1
384
- signals.has_month = months.n_unique() > 1
385
- signals.has_day = days.n_unique() > 1
386
- signals.has_day_of_week = dow.n_unique() > 1
387
- signals.has_hour = int(hours.max()) > 0 # type: ignore[arg-type]
388
-
389
- # Weekend signal is only meaningful when day-of-week varies
390
- if signals.has_day_of_week:
391
- weekend_mask = dow >= 5 # Saturday=5, Sunday=6
392
- signals.has_is_weekend = bool(weekend_mask.any())
393
-
394
- # Month-end: day == last day of the respective month
395
- try:
396
- month_end_ts = clean.dt.month_end()
397
- is_month_end_mask = (
398
- (clean.dt.year() == month_end_ts.dt.year())
399
- & (clean.dt.month() == month_end_ts.dt.month())
400
- & (clean.dt.day() == month_end_ts.dt.day())
401
- )
402
- signals.has_is_month_end = bool(is_month_end_mask.any())
403
- except Exception:
404
- signals.has_is_month_end = False
405
-
406
- profile.signals = signals
@@ -170,6 +170,7 @@ class NumericFlag(StrEnum):
170
170
  NearConstant = "near_constant"
171
171
  Bimodal = "bimodal"
172
172
  HighOutlierDensity = "high_outlier_density"
173
+ FormatMismatch = "format_mismatch"
173
174
 
174
175
 
175
176
  @dataclass
@@ -144,6 +144,13 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
144
144
  clean = f64.drop_nulls()
145
145
  profile = NumericStats()
146
146
 
147
+ # FormatMismatch: a value that is present (non-null after the
148
+ # orchestrator's Effective-Null normalization) but fails the
149
+ # Float64 cast becomes null here. A shortfall in the non-null
150
+ # count means the column holds dirty, uncoercible data.
151
+ if clean.len() < series.drop_nulls().len():
152
+ profile.flags.append(NumericFlag.FormatMismatch)
153
+
147
154
  if clean.len() == 0:
148
155
  if series.drop_nulls().len() > 0 and col in user_overrides:
149
156
  from ._base import OverrideCoercionError
@@ -224,13 +224,22 @@ class StructuralProfiler:
224
224
  type_to_cols.setdefault(sem_type, []).append(col_name)
225
225
 
226
226
  pc = self.config.profiling
227
+ profiling_frame = _resolve_effective_nulls(
228
+ data,
229
+ numeric_sentinels=dict(pc.numeric_sentinels),
230
+ string_sentinels=dict(pc.string_sentinels),
231
+ )
227
232
  for sem_type, cols in type_to_cols.items():
228
233
  if sem_type == SemanticType.Numeric:
229
234
  profiler = NumericProfiler(config=pc.numeric)
230
235
  elif sem_type == SemanticType.Categorical:
231
236
  profiler = CategoricalProfiler(config=pc.categorical)
232
237
  elif sem_type == SemanticType.Datetime:
233
- profiler = DatetimeProfiler(config=pc.datetime_, epoch_units=pc.datetime_epoch_units)
238
+ profiler = DatetimeProfiler(
239
+ config=pc.datetime_,
240
+ epoch_units=pc.datetime_epoch_units,
241
+ formats=pc.datetime_formats,
242
+ )
234
243
  else:
235
244
  profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
236
245
  if profiler_cls is None:
@@ -241,7 +250,7 @@ class StructuralProfiler:
241
250
  c for c in cols
242
251
  if result.columns.get(c) and TypeFlag.UserOverride in result.columns[c].type_flags
243
252
  }
244
- batch = profiler.profile(data, columns=cols, user_overrides=user_overrides)
253
+ batch = profiler.profile(profiling_frame, columns=cols, user_overrides=user_overrides)
245
254
  for col_name in batch.analysed_columns:
246
255
  if col_name in result.columns:
247
256
  result.columns[col_name].stats = batch.columns.get(col_name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.6
3
+ Version: 2.0.8
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes