dataforge-ml 2.0.6__tar.gz → 2.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-2.0.6/src/dataforge_ml.egg-info → dataforge_ml-2.0.8}/PKG-INFO +1 -1
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/pyproject.toml +1 -1
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_boolean_config.py +25 -1
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_boolean_profiler.py +8 -1
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_config.py +61 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_datetime_config.py +3 -70
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_datetime_profiler.py +29 -59
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_numeric_config.py +1 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_numeric_profiler.py +7 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/orchestrator.py +11 -2
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/LICENSE +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/README.md +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/setup.cfg +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/__init__.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_utils.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/orchestrator.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/_null_detection.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/_null_normalization.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -7,9 +7,14 @@ Populated by BooleanProfiler.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from dataclasses import dataclass, field
|
|
10
|
+
from enum import StrEnum
|
|
10
11
|
from typing import Optional
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
class BooleanFlag(StrEnum):
|
|
15
|
+
FormatMismatch = "format_mismatch"
|
|
16
|
+
|
|
17
|
+
|
|
13
18
|
@dataclass
|
|
14
19
|
class BooleanStats:
|
|
15
20
|
"""Value distribution statistics for a single Boolean column.
|
|
@@ -24,6 +29,23 @@ class BooleanStats:
|
|
|
24
29
|
true_ratio: float = 0.0
|
|
25
30
|
false_ratio: float = 0.0
|
|
26
31
|
mode: Optional[bool] = None
|
|
32
|
+
flags: list[BooleanFlag] = field(default_factory=list)
|
|
33
|
+
|
|
34
|
+
def has_flag(self, flag: BooleanFlag) -> bool:
|
|
35
|
+
"""Check whether a specific ``BooleanFlag`` is set on this column.
|
|
36
|
+
|
|
37
|
+
Parameters
|
|
38
|
+
----------
|
|
39
|
+
flag : BooleanFlag
|
|
40
|
+
The flag to test.
|
|
41
|
+
|
|
42
|
+
Returns
|
|
43
|
+
-------
|
|
44
|
+
bool
|
|
45
|
+
``True`` if ``flag`` is present in :attr:`flags`, ``False``
|
|
46
|
+
otherwise.
|
|
47
|
+
"""
|
|
48
|
+
return flag in self.flags
|
|
27
49
|
|
|
28
50
|
def to_dict(self) -> dict:
|
|
29
51
|
"""Serialise the boolean statistics to a plain dictionary.
|
|
@@ -31,7 +53,8 @@ class BooleanStats:
|
|
|
31
53
|
Returns
|
|
32
54
|
-------
|
|
33
55
|
dict
|
|
34
|
-
All fields keyed by field name.
|
|
56
|
+
All fields keyed by field name. ``flags`` are serialised as their
|
|
57
|
+
string values.
|
|
35
58
|
"""
|
|
36
59
|
return {
|
|
37
60
|
"true_count": self.true_count,
|
|
@@ -39,6 +62,7 @@ class BooleanStats:
|
|
|
39
62
|
"true_ratio": self.true_ratio,
|
|
40
63
|
"false_ratio": self.false_ratio,
|
|
41
64
|
"mode": self.mode,
|
|
65
|
+
"flags": [str(f) for f in self.flags],
|
|
42
66
|
}
|
|
43
67
|
|
|
44
68
|
|
|
@@ -23,7 +23,7 @@ import polars as pl
|
|
|
23
23
|
|
|
24
24
|
from ._base import ColumnBatchProfiler
|
|
25
25
|
from ._config import BooleanStats
|
|
26
|
-
from ._boolean_config import BooleanProfileResult
|
|
26
|
+
from ._boolean_config import BooleanFlag, BooleanProfileResult
|
|
27
27
|
from ..models._data_types import _INT_DTYPES
|
|
28
28
|
|
|
29
29
|
# ---------------------------------------------------------------------------
|
|
@@ -114,6 +114,13 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
114
114
|
bool_series = self._to_bool_series(series)
|
|
115
115
|
non_null_count = bool_series.len()
|
|
116
116
|
|
|
117
|
+
# FormatMismatch: a value that is present (non-null after the
|
|
118
|
+
# orchestrator's Effective-Null normalization) but falls outside the
|
|
119
|
+
# recognized true/false vocabulary is dropped by coercion. A shortfall
|
|
120
|
+
# in the non-null count means the column holds dirty, uncoercible data.
|
|
121
|
+
if non_null_count < series.drop_nulls().len():
|
|
122
|
+
profile.flags.append(BooleanFlag.FormatMismatch)
|
|
123
|
+
|
|
117
124
|
if non_null_count == 0:
|
|
118
125
|
if series.drop_nulls().len() > 0 and col_name in user_overrides:
|
|
119
126
|
from ._base import OverrideCoercionError
|
|
@@ -658,15 +658,18 @@ class ProfileConfig:
|
|
|
658
658
|
numeric_sentinels: InitVar[Optional[dict[str, list[float]]]] = None
|
|
659
659
|
string_sentinels: InitVar[Optional[dict[str, list[str]]]] = None
|
|
660
660
|
datetime_epoch_units: InitVar[Optional[dict[str, Union[str, EpochUnit]]]] = None
|
|
661
|
+
datetime_formats: InitVar[Optional[dict[str, str]]] = None
|
|
661
662
|
_numeric_sentinels: dict[str, list[float]] = field(default_factory=dict, init=False)
|
|
662
663
|
_string_sentinels: dict[str, list[str]] = field(default_factory=dict, init=False)
|
|
663
664
|
_datetime_epoch_units: dict[str, EpochUnit] = field(default_factory=dict, init=False)
|
|
665
|
+
_datetime_formats: dict[str, str] = field(default_factory=dict, init=False)
|
|
664
666
|
|
|
665
667
|
def __post_init__(
|
|
666
668
|
self,
|
|
667
669
|
numeric_sentinels: Optional[dict[str, list[float]]],
|
|
668
670
|
string_sentinels: Optional[dict[str, list[str]]],
|
|
669
671
|
datetime_epoch_units: Optional[dict[str, Union[str, EpochUnit]]] = None,
|
|
672
|
+
datetime_formats: Optional[dict[str, str]] = None,
|
|
670
673
|
) -> None:
|
|
671
674
|
if numeric_sentinels is not None and not isinstance(numeric_sentinels, property):
|
|
672
675
|
for k, vals in numeric_sentinels.items():
|
|
@@ -677,6 +680,9 @@ class ProfileConfig:
|
|
|
677
680
|
if datetime_epoch_units is not None and not isinstance(datetime_epoch_units, property):
|
|
678
681
|
for k, val in datetime_epoch_units.items():
|
|
679
682
|
self.set_datetime_epoch_unit(k, val)
|
|
683
|
+
if datetime_formats is not None and not isinstance(datetime_formats, property):
|
|
684
|
+
for k, fmt in datetime_formats.items():
|
|
685
|
+
self.set_datetime_format(k, fmt)
|
|
680
686
|
|
|
681
687
|
@property
|
|
682
688
|
def numeric_sentinels(self) -> MappingProxyType[str, list[float]]:
|
|
@@ -731,6 +737,27 @@ class ProfileConfig:
|
|
|
731
737
|
"""
|
|
732
738
|
return MappingProxyType(self._datetime_epoch_units)
|
|
733
739
|
|
|
740
|
+
@property
|
|
741
|
+
def datetime_formats(self) -> MappingProxyType[str, str]:
|
|
742
|
+
"""
|
|
743
|
+
Get the per-column declared datetime format strings.
|
|
744
|
+
|
|
745
|
+
Keys are column names; values are strftime-style format strings (e.g.
|
|
746
|
+
``{"Year": "%Y"}``) applied by ``DatetimeProfiler`` with
|
|
747
|
+
``strict=False`` when coercing that column to Datetime. A declaration
|
|
748
|
+
applies to any column profiled as Datetime, whether overridden or
|
|
749
|
+
auto-detected. Format strings are not validated against strftime
|
|
750
|
+
grammar at declaration time — a bad format surfaces at profiling time.
|
|
751
|
+
Defaults to an empty dict — columns with no declaration fall back to
|
|
752
|
+
Polars format inference.
|
|
753
|
+
|
|
754
|
+
Returns
|
|
755
|
+
-------
|
|
756
|
+
MappingProxyType[str, str]
|
|
757
|
+
Read-only mapping of column names to declared datetime formats.
|
|
758
|
+
"""
|
|
759
|
+
return MappingProxyType(self._datetime_formats)
|
|
760
|
+
|
|
734
761
|
def set_numeric_sentinel(self, column: str | list[str], values: list[float]) -> None:
|
|
735
762
|
"""
|
|
736
763
|
Set numeric sentinel values for one or more columns.
|
|
@@ -809,6 +836,38 @@ class ProfileConfig:
|
|
|
809
836
|
for c in columns:
|
|
810
837
|
self._datetime_epoch_units[c] = enum_unit
|
|
811
838
|
|
|
839
|
+
def set_datetime_format(self, column: str | list[str], format: str) -> None:
|
|
840
|
+
"""
|
|
841
|
+
Declare a datetime format string for one or more columns.
|
|
842
|
+
|
|
843
|
+
The format is applied by ``DatetimeProfiler`` with ``strict=False``
|
|
844
|
+
when coercing the column to Datetime, and is not validated against
|
|
845
|
+
strftime grammar or the data at declaration time — a bad format
|
|
846
|
+
surfaces at profiling time, consistent with ``set_column_type`` and
|
|
847
|
+
``set_datetime_epoch_unit``.
|
|
848
|
+
|
|
849
|
+
Parameters
|
|
850
|
+
----------
|
|
851
|
+
column : str or list of str
|
|
852
|
+
Column name or list of column names to apply the format to.
|
|
853
|
+
format : str
|
|
854
|
+
A non-empty strftime-style format string (e.g. ``"%Y"``).
|
|
855
|
+
|
|
856
|
+
Raises
|
|
857
|
+
------
|
|
858
|
+
ValueError
|
|
859
|
+
If any column name is empty, or if `format` is not a non-empty
|
|
860
|
+
string.
|
|
861
|
+
"""
|
|
862
|
+
if not isinstance(format, str) or not format:
|
|
863
|
+
raise ValueError("format must be a non-empty string.")
|
|
864
|
+
|
|
865
|
+
columns = [column] if isinstance(column, str) else column
|
|
866
|
+
for c in columns:
|
|
867
|
+
if not isinstance(c, str) or not c:
|
|
868
|
+
raise ValueError("column name must be a non-empty string.")
|
|
869
|
+
self._datetime_formats[c] = format
|
|
870
|
+
|
|
812
871
|
def to_dict(self) -> dict:
|
|
813
872
|
"""
|
|
814
873
|
Serialise the config to a plain dictionary.
|
|
@@ -837,6 +896,7 @@ class ProfileConfig:
|
|
|
837
896
|
"numeric_sentinels": {k: list(v) for k, v in self.numeric_sentinels.items()},
|
|
838
897
|
"string_sentinels": {k: list(v) for k, v in self.string_sentinels.items()},
|
|
839
898
|
"datetime_epoch_units": {k: v.value for k, v in self.datetime_epoch_units.items()},
|
|
899
|
+
"datetime_formats": {k: v for k, v in self.datetime_formats.items()},
|
|
840
900
|
}
|
|
841
901
|
|
|
842
902
|
@classmethod
|
|
@@ -884,6 +944,7 @@ class ProfileConfig:
|
|
|
884
944
|
numeric_sentinels=data.get("numeric_sentinels", {}),
|
|
885
945
|
string_sentinels=data.get("string_sentinels", {}),
|
|
886
946
|
datetime_epoch_units=data.get("datetime_epoch_units", {}),
|
|
947
|
+
datetime_formats=data.get("datetime_formats", {}),
|
|
887
948
|
)
|
|
888
949
|
|
|
889
950
|
return config
|
|
@@ -28,70 +28,7 @@ class DatetimeFlag(StrEnum):
|
|
|
28
28
|
HighGapVariance = "high_gap_variance"
|
|
29
29
|
MnarSuspected = "mnar_suspected"
|
|
30
30
|
RecentDateMissing = "recent_date_missing"
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@dataclass
|
|
34
|
-
class TemporalSignals:
|
|
35
|
-
"""Which time-component features are present in a Datetime column.
|
|
36
|
-
|
|
37
|
-
Each boolean field indicates that the corresponding granularity was
|
|
38
|
-
detected as non-constant, making it a candidate for feature extraction
|
|
39
|
-
in Phase 5 Encoding.
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
has_year: bool = False
|
|
43
|
-
has_month: bool = False
|
|
44
|
-
has_day: bool = False
|
|
45
|
-
has_day_of_week: bool = False
|
|
46
|
-
has_hour: bool = False
|
|
47
|
-
has_is_weekend: bool = False
|
|
48
|
-
has_is_month_end: bool = False
|
|
49
|
-
|
|
50
|
-
def extractable_features(self) -> list[str]:
|
|
51
|
-
"""Return the names of all time-component features that can be extracted.
|
|
52
|
-
|
|
53
|
-
Returns
|
|
54
|
-
-------
|
|
55
|
-
list[str]
|
|
56
|
-
Feature names corresponding to every ``has_*`` field that is
|
|
57
|
-
``True``. An empty list means no temporal variation was detected.
|
|
58
|
-
"""
|
|
59
|
-
features = []
|
|
60
|
-
if self.has_year:
|
|
61
|
-
features.append("year")
|
|
62
|
-
if self.has_month:
|
|
63
|
-
features.append("month")
|
|
64
|
-
if self.has_day:
|
|
65
|
-
features.append("day_of_month")
|
|
66
|
-
if self.has_day_of_week:
|
|
67
|
-
features.append("day_of_week")
|
|
68
|
-
if self.has_hour:
|
|
69
|
-
features.append("hour")
|
|
70
|
-
if self.has_is_weekend:
|
|
71
|
-
features.append("is_weekend")
|
|
72
|
-
if self.has_is_month_end:
|
|
73
|
-
features.append("is_month_end")
|
|
74
|
-
return features
|
|
75
|
-
|
|
76
|
-
def to_dict(self) -> dict:
|
|
77
|
-
"""Serialise the temporal signals to a plain dictionary.
|
|
78
|
-
|
|
79
|
-
Returns
|
|
80
|
-
-------
|
|
81
|
-
dict
|
|
82
|
-
All ``has_*`` flags plus an ``extractable_features`` key
|
|
83
|
-
containing the result of :meth:`extractable_features`.
|
|
84
|
-
"""
|
|
85
|
-
return {
|
|
86
|
-
"has_year": self.has_year,
|
|
87
|
-
"has_month": self.has_month,
|
|
88
|
-
"has_day": self.has_day,
|
|
89
|
-
"has_day_of_week": self.has_day_of_week,
|
|
90
|
-
"has_hour": self.has_hour,
|
|
91
|
-
"has_is_weekend": self.has_is_weekend,
|
|
92
|
-
"has_is_month_end": self.has_is_month_end,
|
|
93
|
-
"extractable_features": self.extractable_features(),
|
|
94
|
-
}
|
|
31
|
+
FormatMismatch = "format_mismatch"
|
|
95
32
|
|
|
96
33
|
|
|
97
34
|
@dataclass
|
|
@@ -99,8 +36,7 @@ class DatetimeStats:
|
|
|
99
36
|
"""Statistical summary of a single Datetime column.
|
|
100
37
|
|
|
101
38
|
Produced by ``DatetimeProfiler`` for each opted-in column. Stores
|
|
102
|
-
range, gap regularity, inferred granularity, and temporal signals
|
|
103
|
-
indicating which time components are available for feature extraction.
|
|
39
|
+
range, gap regularity, and inferred granularity.
|
|
104
40
|
"""
|
|
105
41
|
|
|
106
42
|
min_date: Optional[str] = None
|
|
@@ -110,7 +46,6 @@ class DatetimeStats:
|
|
|
110
46
|
inferred_granularity: Optional[InferredGranularity] = None
|
|
111
47
|
median_gap_seconds: Optional[float] = None
|
|
112
48
|
gap_cv: Optional[float] = None
|
|
113
|
-
signals: TemporalSignals = field(default_factory=TemporalSignals)
|
|
114
49
|
flags: list[DatetimeFlag] = field(default_factory=list)
|
|
115
50
|
|
|
116
51
|
def has_flag(self, flag: DatetimeFlag) -> bool:
|
|
@@ -136,8 +71,7 @@ class DatetimeStats:
|
|
|
136
71
|
-------
|
|
137
72
|
dict
|
|
138
73
|
All fields keyed by field name. ``inferred_granularity`` is
|
|
139
|
-
serialised as its string value; ``signals`` is serialised via
|
|
140
|
-
:meth:`TemporalSignals.to_dict`; ``flags`` are serialised as
|
|
74
|
+
serialised as its string value; ``flags`` are serialised as
|
|
141
75
|
their string values.
|
|
142
76
|
"""
|
|
143
77
|
return {
|
|
@@ -148,7 +82,6 @@ class DatetimeStats:
|
|
|
148
82
|
"inferred_granularity": str(self.inferred_granularity) if self.inferred_granularity else None,
|
|
149
83
|
"median_gap_seconds": self.median_gap_seconds,
|
|
150
84
|
"gap_cv": self.gap_cv,
|
|
151
|
-
"signals": self.signals.to_dict(),
|
|
152
85
|
"flags": [str(f) for f in self.flags],
|
|
153
86
|
}
|
|
154
87
|
|
|
@@ -7,9 +7,10 @@ Per-column metrics (opt-in via ProfileConfig.datetime_columns):
|
|
|
7
7
|
3. Future dates – count of values > now, with context note
|
|
8
8
|
4. Granularity – inferred periodicity from median consecutive gap;
|
|
9
9
|
high gap-CV flagged as irregular
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
|
|
11
|
+
Temporal-component variance (whether year, month, day-of-week, etc. vary) is
|
|
12
|
+
intentionally not profiled here; Phase 5 Encoding derives it on demand from
|
|
13
|
+
the column.
|
|
13
14
|
|
|
14
15
|
Granularity inference bands (median gap in seconds):
|
|
15
16
|
< 90 s → secondly
|
|
@@ -34,7 +35,6 @@ from ._datetime_config import (
|
|
|
34
35
|
DatetimeStats,
|
|
35
36
|
InferredGranularity,
|
|
36
37
|
DatetimeFlag,
|
|
37
|
-
TemporalSignals,
|
|
38
38
|
)
|
|
39
39
|
|
|
40
40
|
# Granularity bands — upper bound (exclusive) in seconds for each label.
|
|
@@ -68,9 +68,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
68
68
|
self,
|
|
69
69
|
config: DatetimeProfileConfig | None = None,
|
|
70
70
|
epoch_units: dict[str, str] | None = None,
|
|
71
|
+
formats: dict[str, str] | None = None,
|
|
71
72
|
) -> None:
|
|
72
73
|
self._config = config if config is not None else DatetimeProfileConfig()
|
|
73
74
|
self._epoch_units = epoch_units or {}
|
|
75
|
+
self._formats = formats or {}
|
|
74
76
|
|
|
75
77
|
# ------------------------------------------------------------------
|
|
76
78
|
# Public API
|
|
@@ -112,8 +114,14 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
112
114
|
|
|
113
115
|
def _coerce_to_datetime(self, series: pl.Series, col_name: str) -> pl.Series | None:
|
|
114
116
|
if series.dtype in (pl.Utf8, pl.String):
|
|
117
|
+
declared_format = self._formats.get(col_name)
|
|
115
118
|
try:
|
|
116
|
-
|
|
119
|
+
if declared_format is not None:
|
|
120
|
+
coerced = series.str.to_datetime(
|
|
121
|
+
format=declared_format, strict=False
|
|
122
|
+
)
|
|
123
|
+
else:
|
|
124
|
+
coerced = series.str.to_datetime(strict=False)
|
|
117
125
|
return coerced if coerced.drop_nulls().len() > 0 else None
|
|
118
126
|
except pl.exceptions.ComputeError:
|
|
119
127
|
return None
|
|
@@ -139,22 +147,35 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
139
147
|
|
|
140
148
|
available = []
|
|
141
149
|
coerced_cache = {}
|
|
150
|
+
format_mismatch: dict[str, bool] = {}
|
|
142
151
|
for col_name in self._resolve_columns(df.columns, columns):
|
|
143
|
-
|
|
152
|
+
original = df[col_name]
|
|
153
|
+
series = self._coerce_to_datetime(original, col_name)
|
|
144
154
|
if series is not None:
|
|
145
155
|
available.append(col_name)
|
|
146
156
|
coerced_cache[col_name] = series
|
|
157
|
+
# FormatMismatch: a value that is present (non-null after the
|
|
158
|
+
# orchestrator's Effective-Null normalization) but fails
|
|
159
|
+
# coercion becomes null here. Compare non-null counts before
|
|
160
|
+
# and after coercion; a shortfall means dirty, uncoercible data.
|
|
161
|
+
format_mismatch[col_name] = (
|
|
162
|
+
series.drop_nulls().len() < original.drop_nulls().len()
|
|
163
|
+
)
|
|
147
164
|
elif col_name in user_overrides:
|
|
148
|
-
if
|
|
165
|
+
if original.drop_nulls().len() > 0:
|
|
149
166
|
from ._base import OverrideCoercionError
|
|
150
167
|
raise OverrideCoercionError(
|
|
151
|
-
f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime."
|
|
168
|
+
f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime. "
|
|
169
|
+
f"If Polars cannot infer the format, declare one explicitly via "
|
|
170
|
+
f"ProfileConfig.set_datetime_format({col_name!r}, <format>) (e.g. '%Y' for bare years)."
|
|
152
171
|
)
|
|
153
172
|
|
|
154
173
|
result.analysed_columns = available
|
|
155
174
|
|
|
156
175
|
for col_name in available:
|
|
157
176
|
profile = self._profile_column(coerced_cache[col_name], df.height, now)
|
|
177
|
+
if format_mismatch.get(col_name):
|
|
178
|
+
profile.flags.append(DatetimeFlag.FormatMismatch)
|
|
158
179
|
result.columns[col_name] = profile
|
|
159
180
|
|
|
160
181
|
return result
|
|
@@ -205,9 +226,6 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
205
226
|
# 6. Granularity
|
|
206
227
|
self._infer_granularity(clean, profile)
|
|
207
228
|
|
|
208
|
-
# 7. Temporal signals
|
|
209
|
-
self._audit_temporal_signals(clean, profile)
|
|
210
|
-
|
|
211
229
|
return profile
|
|
212
230
|
|
|
213
231
|
# ------------------------------------------------------------------
|
|
@@ -356,51 +374,3 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
356
374
|
break
|
|
357
375
|
|
|
358
376
|
profile.inferred_granularity = granularity
|
|
359
|
-
|
|
360
|
-
# ------------------------------------------------------------------
|
|
361
|
-
# Step 7: Temporal signal audit
|
|
362
|
-
# ------------------------------------------------------------------
|
|
363
|
-
|
|
364
|
-
@staticmethod
|
|
365
|
-
def _audit_temporal_signals(
|
|
366
|
-
clean: pl.Series,
|
|
367
|
-
profile: DatetimeStats,
|
|
368
|
-
) -> None:
|
|
369
|
-
"""
|
|
370
|
-
Check which temporal features vary across rows.
|
|
371
|
-
|
|
372
|
-
All checks are done via Polars expressions on the full clean series,
|
|
373
|
-
so no Python-level loops are required.
|
|
374
|
-
"""
|
|
375
|
-
signals = TemporalSignals()
|
|
376
|
-
|
|
377
|
-
years = clean.dt.year()
|
|
378
|
-
months = clean.dt.month()
|
|
379
|
-
days = clean.dt.day()
|
|
380
|
-
dow = clean.dt.weekday() # 0=Monday … 6=Sunday
|
|
381
|
-
hours = clean.dt.hour()
|
|
382
|
-
|
|
383
|
-
signals.has_year = years.n_unique() > 1
|
|
384
|
-
signals.has_month = months.n_unique() > 1
|
|
385
|
-
signals.has_day = days.n_unique() > 1
|
|
386
|
-
signals.has_day_of_week = dow.n_unique() > 1
|
|
387
|
-
signals.has_hour = int(hours.max()) > 0 # type: ignore[arg-type]
|
|
388
|
-
|
|
389
|
-
# Weekend signal is only meaningful when day-of-week varies
|
|
390
|
-
if signals.has_day_of_week:
|
|
391
|
-
weekend_mask = dow >= 5 # Saturday=5, Sunday=6
|
|
392
|
-
signals.has_is_weekend = bool(weekend_mask.any())
|
|
393
|
-
|
|
394
|
-
# Month-end: day == last day of the respective month
|
|
395
|
-
try:
|
|
396
|
-
month_end_ts = clean.dt.month_end()
|
|
397
|
-
is_month_end_mask = (
|
|
398
|
-
(clean.dt.year() == month_end_ts.dt.year())
|
|
399
|
-
& (clean.dt.month() == month_end_ts.dt.month())
|
|
400
|
-
& (clean.dt.day() == month_end_ts.dt.day())
|
|
401
|
-
)
|
|
402
|
-
signals.has_is_month_end = bool(is_month_end_mask.any())
|
|
403
|
-
except Exception:
|
|
404
|
-
signals.has_is_month_end = False
|
|
405
|
-
|
|
406
|
-
profile.signals = signals
|
|
@@ -144,6 +144,13 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
144
144
|
clean = f64.drop_nulls()
|
|
145
145
|
profile = NumericStats()
|
|
146
146
|
|
|
147
|
+
# FormatMismatch: a value that is present (non-null after the
|
|
148
|
+
# orchestrator's Effective-Null normalization) but fails the
|
|
149
|
+
# Float64 cast becomes null here. A shortfall in the non-null
|
|
150
|
+
# count means the column holds dirty, uncoercible data.
|
|
151
|
+
if clean.len() < series.drop_nulls().len():
|
|
152
|
+
profile.flags.append(NumericFlag.FormatMismatch)
|
|
153
|
+
|
|
147
154
|
if clean.len() == 0:
|
|
148
155
|
if series.drop_nulls().len() > 0 and col in user_overrides:
|
|
149
156
|
from ._base import OverrideCoercionError
|
|
@@ -224,13 +224,22 @@ class StructuralProfiler:
|
|
|
224
224
|
type_to_cols.setdefault(sem_type, []).append(col_name)
|
|
225
225
|
|
|
226
226
|
pc = self.config.profiling
|
|
227
|
+
profiling_frame = _resolve_effective_nulls(
|
|
228
|
+
data,
|
|
229
|
+
numeric_sentinels=dict(pc.numeric_sentinels),
|
|
230
|
+
string_sentinels=dict(pc.string_sentinels),
|
|
231
|
+
)
|
|
227
232
|
for sem_type, cols in type_to_cols.items():
|
|
228
233
|
if sem_type == SemanticType.Numeric:
|
|
229
234
|
profiler = NumericProfiler(config=pc.numeric)
|
|
230
235
|
elif sem_type == SemanticType.Categorical:
|
|
231
236
|
profiler = CategoricalProfiler(config=pc.categorical)
|
|
232
237
|
elif sem_type == SemanticType.Datetime:
|
|
233
|
-
profiler = DatetimeProfiler(
|
|
238
|
+
profiler = DatetimeProfiler(
|
|
239
|
+
config=pc.datetime_,
|
|
240
|
+
epoch_units=pc.datetime_epoch_units,
|
|
241
|
+
formats=pc.datetime_formats,
|
|
242
|
+
)
|
|
234
243
|
else:
|
|
235
244
|
profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
|
|
236
245
|
if profiler_cls is None:
|
|
@@ -241,7 +250,7 @@ class StructuralProfiler:
|
|
|
241
250
|
c for c in cols
|
|
242
251
|
if result.columns.get(c) and TypeFlag.UserOverride in result.columns[c].type_flags
|
|
243
252
|
}
|
|
244
|
-
batch = profiler.profile(
|
|
253
|
+
batch = profiler.profile(profiling_frame, columns=cols, user_overrides=user_overrides)
|
|
245
254
|
for col_name in batch.analysed_columns:
|
|
246
255
|
if col_name in result.columns:
|
|
247
256
|
result.columns[col_name].stats = batch.columns.get(col_name)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_nonlinearity_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.6 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_type_detection_config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|