dataforge-ml 2.0.6__tar.gz → 2.0.7__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (55)
  1. {dataforge_ml-2.0.6/src/dataforge_ml.egg-info → dataforge_ml-2.0.7}/PKG-INFO +1 -1
  2. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/pyproject.toml +1 -1
  3. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_boolean_config.py +25 -1
  4. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_boolean_profiler.py +8 -1
  5. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_config.py +61 -0
  6. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_datetime_config.py +1 -0
  7. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_datetime_profiler.py +25 -4
  8. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_numeric_config.py +1 -0
  9. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_numeric_profiler.py +7 -0
  10. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/orchestrator.py +11 -2
  11. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
  12. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/LICENSE +0 -0
  13. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/README.md +0 -0
  14. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/setup.cfg +0 -0
  15. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/__init__.py +0 -0
  16. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/config.py +0 -0
  17. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/__init__.py +0 -0
  18. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_config.py +0 -0
  19. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
  20. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
  21. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
  22. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
  23. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_utils.py +0 -0
  24. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/orchestrator.py +0 -0
  25. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/models/__init__.py +0 -0
  26. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/models/_data_structure.py +0 -0
  27. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/models/_data_types.py +0 -0
  28. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/__init__.py +0 -0
  29. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_base.py +0 -0
  30. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_categorical.py +0 -0
  31. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  32. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  33. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  34. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  35. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  36. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
  37. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_tabular.py +0 -0
  38. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_target_config.py +0 -0
  39. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  40. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_text_config.py +0 -0
  41. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  42. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
  43. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  44. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/splitting/__init__.py +0 -0
  45. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/splitting/_config.py +0 -0
  46. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
  47. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/splitting/_splitter.py +0 -0
  48. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/utils/__init__.py +0 -0
  49. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/utils/_null_detection.py +0 -0
  50. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/utils/_null_normalization.py +0 -0
  51. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml/utils/data_loader.py +0 -0
  52. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  53. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  54. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml.egg-info/requires.txt +0 -0
  55. {dataforge_ml-2.0.6 → dataforge_ml-2.0.7}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.6
3
+ Version: 2.0.7
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "2.0.6"
7
+ version = "2.0.7"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">3.10"
@@ -7,9 +7,14 @@ Populated by BooleanProfiler.
7
7
  from __future__ import annotations
8
8
 
9
9
  from dataclasses import dataclass, field
10
+ from enum import StrEnum
10
11
  from typing import Optional
11
12
 
12
13
 
14
+ class BooleanFlag(StrEnum):
15
+ FormatMismatch = "format_mismatch"
16
+
17
+
13
18
  @dataclass
14
19
  class BooleanStats:
15
20
  """Value distribution statistics for a single Boolean column.
@@ -24,6 +29,23 @@ class BooleanStats:
24
29
  true_ratio: float = 0.0
25
30
  false_ratio: float = 0.0
26
31
  mode: Optional[bool] = None
32
+ flags: list[BooleanFlag] = field(default_factory=list)
33
+
34
+ def has_flag(self, flag: BooleanFlag) -> bool:
35
+ """Check whether a specific ``BooleanFlag`` is set on this column.
36
+
37
+ Parameters
38
+ ----------
39
+ flag : BooleanFlag
40
+ The flag to test.
41
+
42
+ Returns
43
+ -------
44
+ bool
45
+ ``True`` if ``flag`` is present in :attr:`flags`, ``False``
46
+ otherwise.
47
+ """
48
+ return flag in self.flags
27
49
 
28
50
  def to_dict(self) -> dict:
29
51
  """Serialise the boolean statistics to a plain dictionary.
@@ -31,7 +53,8 @@ class BooleanStats:
31
53
  Returns
32
54
  -------
33
55
  dict
34
- All fields keyed by field name.
56
+ All fields keyed by field name. ``flags`` are serialised as their
57
+ string values.
35
58
  """
36
59
  return {
37
60
  "true_count": self.true_count,
@@ -39,6 +62,7 @@ class BooleanStats:
39
62
  "true_ratio": self.true_ratio,
40
63
  "false_ratio": self.false_ratio,
41
64
  "mode": self.mode,
65
+ "flags": [str(f) for f in self.flags],
42
66
  }
43
67
 
44
68
 
@@ -23,7 +23,7 @@ import polars as pl
23
23
 
24
24
  from ._base import ColumnBatchProfiler
25
25
  from ._config import BooleanStats
26
- from ._boolean_config import BooleanProfileResult
26
+ from ._boolean_config import BooleanFlag, BooleanProfileResult
27
27
  from ..models._data_types import _INT_DTYPES
28
28
 
29
29
  # ---------------------------------------------------------------------------
@@ -114,6 +114,13 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
114
114
  bool_series = self._to_bool_series(series)
115
115
  non_null_count = bool_series.len()
116
116
 
117
+ # FormatMismatch: a value that is present (non-null after the
118
+ # orchestrator's Effective-Null normalization) but falls outside the
119
+ # recognized true/false vocabulary is dropped by coercion. A shortfall
120
+ # in the non-null count means the column holds dirty, uncoercible data.
121
+ if non_null_count < series.drop_nulls().len():
122
+ profile.flags.append(BooleanFlag.FormatMismatch)
123
+
117
124
  if non_null_count == 0:
118
125
  if series.drop_nulls().len() > 0 and col_name in user_overrides:
119
126
  from ._base import OverrideCoercionError
@@ -658,15 +658,18 @@ class ProfileConfig:
658
658
  numeric_sentinels: InitVar[Optional[dict[str, list[float]]]] = None
659
659
  string_sentinels: InitVar[Optional[dict[str, list[str]]]] = None
660
660
  datetime_epoch_units: InitVar[Optional[dict[str, Union[str, EpochUnit]]]] = None
661
+ datetime_formats: InitVar[Optional[dict[str, str]]] = None
661
662
  _numeric_sentinels: dict[str, list[float]] = field(default_factory=dict, init=False)
662
663
  _string_sentinels: dict[str, list[str]] = field(default_factory=dict, init=False)
663
664
  _datetime_epoch_units: dict[str, EpochUnit] = field(default_factory=dict, init=False)
665
+ _datetime_formats: dict[str, str] = field(default_factory=dict, init=False)
664
666
 
665
667
  def __post_init__(
666
668
  self,
667
669
  numeric_sentinels: Optional[dict[str, list[float]]],
668
670
  string_sentinels: Optional[dict[str, list[str]]],
669
671
  datetime_epoch_units: Optional[dict[str, Union[str, EpochUnit]]] = None,
672
+ datetime_formats: Optional[dict[str, str]] = None,
670
673
  ) -> None:
671
674
  if numeric_sentinels is not None and not isinstance(numeric_sentinels, property):
672
675
  for k, vals in numeric_sentinels.items():
@@ -677,6 +680,9 @@ class ProfileConfig:
677
680
  if datetime_epoch_units is not None and not isinstance(datetime_epoch_units, property):
678
681
  for k, val in datetime_epoch_units.items():
679
682
  self.set_datetime_epoch_unit(k, val)
683
+ if datetime_formats is not None and not isinstance(datetime_formats, property):
684
+ for k, fmt in datetime_formats.items():
685
+ self.set_datetime_format(k, fmt)
680
686
 
681
687
  @property
682
688
  def numeric_sentinels(self) -> MappingProxyType[str, list[float]]:
@@ -731,6 +737,27 @@ class ProfileConfig:
731
737
  """
732
738
  return MappingProxyType(self._datetime_epoch_units)
733
739
 
740
+ @property
741
+ def datetime_formats(self) -> MappingProxyType[str, str]:
742
+ """
743
+ Get the per-column declared datetime format strings.
744
+
745
+ Keys are column names; values are strftime-style format strings (e.g.
746
+ ``{"Year": "%Y"}``) applied by ``DatetimeProfiler`` with
747
+ ``strict=False`` when coercing that column to Datetime. A declaration
748
+ applies to any column profiled as Datetime, whether overridden or
749
+ auto-detected. Format strings are not validated against strftime
750
+ grammar at declaration time — a bad format surfaces at profiling time.
751
+ Defaults to an empty dict — columns with no declaration fall back to
752
+ Polars format inference.
753
+
754
+ Returns
755
+ -------
756
+ MappingProxyType[str, str]
757
+ Read-only mapping of column names to declared datetime formats.
758
+ """
759
+ return MappingProxyType(self._datetime_formats)
760
+
734
761
  def set_numeric_sentinel(self, column: str | list[str], values: list[float]) -> None:
735
762
  """
736
763
  Set numeric sentinel values for one or more columns.
@@ -809,6 +836,38 @@ class ProfileConfig:
809
836
  for c in columns:
810
837
  self._datetime_epoch_units[c] = enum_unit
811
838
 
839
+ def set_datetime_format(self, column: str | list[str], format: str) -> None:
840
+ """
841
+ Declare a datetime format string for one or more columns.
842
+
843
+ The format is applied by ``DatetimeProfiler`` with ``strict=False``
844
+ when coercing the column to Datetime, and is not validated against
845
+ strftime grammar or the data at declaration time — a bad format
846
+ surfaces at profiling time, consistent with ``set_column_type`` and
847
+ ``set_datetime_epoch_unit``.
848
+
849
+ Parameters
850
+ ----------
851
+ column : str or list of str
852
+ Column name or list of column names to apply the format to.
853
+ format : str
854
+ A non-empty strftime-style format string (e.g. ``"%Y"``).
855
+
856
+ Raises
857
+ ------
858
+ ValueError
859
+ If any column name is empty, or if `format` is not a non-empty
860
+ string.
861
+ """
862
+ if not isinstance(format, str) or not format:
863
+ raise ValueError("format must be a non-empty string.")
864
+
865
+ columns = [column] if isinstance(column, str) else column
866
+ for c in columns:
867
+ if not isinstance(c, str) or not c:
868
+ raise ValueError("column name must be a non-empty string.")
869
+ self._datetime_formats[c] = format
870
+
812
871
  def to_dict(self) -> dict:
813
872
  """
814
873
  Serialise the config to a plain dictionary.
@@ -837,6 +896,7 @@ class ProfileConfig:
837
896
  "numeric_sentinels": {k: list(v) for k, v in self.numeric_sentinels.items()},
838
897
  "string_sentinels": {k: list(v) for k, v in self.string_sentinels.items()},
839
898
  "datetime_epoch_units": {k: v.value for k, v in self.datetime_epoch_units.items()},
899
+ "datetime_formats": {k: v for k, v in self.datetime_formats.items()},
840
900
  }
841
901
 
842
902
  @classmethod
@@ -884,6 +944,7 @@ class ProfileConfig:
884
944
  numeric_sentinels=data.get("numeric_sentinels", {}),
885
945
  string_sentinels=data.get("string_sentinels", {}),
886
946
  datetime_epoch_units=data.get("datetime_epoch_units", {}),
947
+ datetime_formats=data.get("datetime_formats", {}),
887
948
  )
888
949
 
889
950
  return config
@@ -28,6 +28,7 @@ class DatetimeFlag(StrEnum):
28
28
  HighGapVariance = "high_gap_variance"
29
29
  MnarSuspected = "mnar_suspected"
30
30
  RecentDateMissing = "recent_date_missing"
31
+ FormatMismatch = "format_mismatch"
31
32
 
32
33
 
33
34
  @dataclass
@@ -68,9 +68,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
68
68
  self,
69
69
  config: DatetimeProfileConfig | None = None,
70
70
  epoch_units: dict[str, str] | None = None,
71
+ formats: dict[str, str] | None = None,
71
72
  ) -> None:
72
73
  self._config = config if config is not None else DatetimeProfileConfig()
73
74
  self._epoch_units = epoch_units or {}
75
+ self._formats = formats or {}
74
76
 
75
77
  # ------------------------------------------------------------------
76
78
  # Public API
@@ -112,8 +114,14 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
112
114
 
113
115
  def _coerce_to_datetime(self, series: pl.Series, col_name: str) -> pl.Series | None:
114
116
  if series.dtype in (pl.Utf8, pl.String):
117
+ declared_format = self._formats.get(col_name)
115
118
  try:
116
- coerced = series.str.to_datetime(strict=False)
119
+ if declared_format is not None:
120
+ coerced = series.str.to_datetime(
121
+ format=declared_format, strict=False
122
+ )
123
+ else:
124
+ coerced = series.str.to_datetime(strict=False)
117
125
  return coerced if coerced.drop_nulls().len() > 0 else None
118
126
  except pl.exceptions.ComputeError:
119
127
  return None
@@ -139,22 +147,35 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
139
147
 
140
148
  available = []
141
149
  coerced_cache = {}
150
+ format_mismatch: dict[str, bool] = {}
142
151
  for col_name in self._resolve_columns(df.columns, columns):
143
- series = self._coerce_to_datetime(df[col_name], col_name)
152
+ original = df[col_name]
153
+ series = self._coerce_to_datetime(original, col_name)
144
154
  if series is not None:
145
155
  available.append(col_name)
146
156
  coerced_cache[col_name] = series
157
+ # FormatMismatch: a value that is present (non-null after the
158
+ # orchestrator's Effective-Null normalization) but fails
159
+ # coercion becomes null here. Compare non-null counts before
160
+ # and after coercion; a shortfall means dirty, uncoercible data.
161
+ format_mismatch[col_name] = (
162
+ series.drop_nulls().len() < original.drop_nulls().len()
163
+ )
147
164
  elif col_name in user_overrides:
148
- if df[col_name].drop_nulls().len() > 0:
165
+ if original.drop_nulls().len() > 0:
149
166
  from ._base import OverrideCoercionError
150
167
  raise OverrideCoercionError(
151
- f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime."
168
+ f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime. "
169
+ f"If Polars cannot infer the format, declare one explicitly via "
170
+ f"ProfileConfig.set_datetime_format({col_name!r}, <format>) (e.g. '%Y' for bare years)."
152
171
  )
153
172
 
154
173
  result.analysed_columns = available
155
174
 
156
175
  for col_name in available:
157
176
  profile = self._profile_column(coerced_cache[col_name], df.height, now)
177
+ if format_mismatch.get(col_name):
178
+ profile.flags.append(DatetimeFlag.FormatMismatch)
158
179
  result.columns[col_name] = profile
159
180
 
160
181
  return result
@@ -170,6 +170,7 @@ class NumericFlag(StrEnum):
170
170
  NearConstant = "near_constant"
171
171
  Bimodal = "bimodal"
172
172
  HighOutlierDensity = "high_outlier_density"
173
+ FormatMismatch = "format_mismatch"
173
174
 
174
175
 
175
176
  @dataclass
@@ -144,6 +144,13 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
144
144
  clean = f64.drop_nulls()
145
145
  profile = NumericStats()
146
146
 
147
+ # FormatMismatch: a value that is present (non-null after the
148
+ # orchestrator's Effective-Null normalization) but fails the
149
+ # Float64 cast becomes null here. A shortfall in the non-null
150
+ # count means the column holds dirty, uncoercible data.
151
+ if clean.len() < series.drop_nulls().len():
152
+ profile.flags.append(NumericFlag.FormatMismatch)
153
+
147
154
  if clean.len() == 0:
148
155
  if series.drop_nulls().len() > 0 and col in user_overrides:
149
156
  from ._base import OverrideCoercionError
@@ -224,13 +224,22 @@ class StructuralProfiler:
224
224
  type_to_cols.setdefault(sem_type, []).append(col_name)
225
225
 
226
226
  pc = self.config.profiling
227
+ profiling_frame = _resolve_effective_nulls(
228
+ data,
229
+ numeric_sentinels=dict(pc.numeric_sentinels),
230
+ string_sentinels=dict(pc.string_sentinels),
231
+ )
227
232
  for sem_type, cols in type_to_cols.items():
228
233
  if sem_type == SemanticType.Numeric:
229
234
  profiler = NumericProfiler(config=pc.numeric)
230
235
  elif sem_type == SemanticType.Categorical:
231
236
  profiler = CategoricalProfiler(config=pc.categorical)
232
237
  elif sem_type == SemanticType.Datetime:
233
- profiler = DatetimeProfiler(config=pc.datetime_, epoch_units=pc.datetime_epoch_units)
238
+ profiler = DatetimeProfiler(
239
+ config=pc.datetime_,
240
+ epoch_units=pc.datetime_epoch_units,
241
+ formats=pc.datetime_formats,
242
+ )
234
243
  else:
235
244
  profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
236
245
  if profiler_cls is None:
@@ -241,7 +250,7 @@ class StructuralProfiler:
241
250
  c for c in cols
242
251
  if result.columns.get(c) and TypeFlag.UserOverride in result.columns[c].type_flags
243
252
  }
244
- batch = profiler.profile(data, columns=cols, user_overrides=user_overrides)
253
+ batch = profiler.profile(profiling_frame, columns=cols, user_overrides=user_overrides)
245
254
  for col_name in batch.analysed_columns:
246
255
  if col_name in result.columns:
247
256
  result.columns[col_name].stats = batch.columns.get(col_name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.6
3
+ Version: 2.0.7
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes