dataforge-ml 2.0.5__tar.gz → 2.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-2.0.5/src/dataforge_ml.egg-info → dataforge_ml-2.0.7}/PKG-INFO +1 -1
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/pyproject.toml +1 -1
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_boolean_config.py +25 -1
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_boolean_profiler.py +8 -1
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_config.py +61 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_datetime_config.py +1 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_datetime_profiler.py +25 -4
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_numeric_config.py +1 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_numeric_profiler.py +7 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/orchestrator.py +18 -3
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/LICENSE +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/README.md +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/setup.cfg +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/__init__.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/_utils.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/imputation/orchestrator.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/utils/_null_detection.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/utils/_null_normalization.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -7,9 +7,14 @@ Populated by BooleanProfiler.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
from dataclasses import dataclass, field
|
|
10
|
+
from enum import StrEnum
|
|
10
11
|
from typing import Optional
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
class BooleanFlag(StrEnum):
|
|
15
|
+
FormatMismatch = "format_mismatch"
|
|
16
|
+
|
|
17
|
+
|
|
13
18
|
@dataclass
|
|
14
19
|
class BooleanStats:
|
|
15
20
|
"""Value distribution statistics for a single Boolean column.
|
|
@@ -24,6 +29,23 @@ class BooleanStats:
|
|
|
24
29
|
true_ratio: float = 0.0
|
|
25
30
|
false_ratio: float = 0.0
|
|
26
31
|
mode: Optional[bool] = None
|
|
32
|
+
flags: list[BooleanFlag] = field(default_factory=list)
|
|
33
|
+
|
|
34
|
+
def has_flag(self, flag: BooleanFlag) -> bool:
|
|
35
|
+
"""Check whether a specific ``BooleanFlag`` is set on this column.
|
|
36
|
+
|
|
37
|
+
Parameters
|
|
38
|
+
----------
|
|
39
|
+
flag : BooleanFlag
|
|
40
|
+
The flag to test.
|
|
41
|
+
|
|
42
|
+
Returns
|
|
43
|
+
-------
|
|
44
|
+
bool
|
|
45
|
+
``True`` if ``flag`` is present in :attr:`flags`, ``False``
|
|
46
|
+
otherwise.
|
|
47
|
+
"""
|
|
48
|
+
return flag in self.flags
|
|
27
49
|
|
|
28
50
|
def to_dict(self) -> dict:
|
|
29
51
|
"""Serialise the boolean statistics to a plain dictionary.
|
|
@@ -31,7 +53,8 @@ class BooleanStats:
|
|
|
31
53
|
Returns
|
|
32
54
|
-------
|
|
33
55
|
dict
|
|
34
|
-
All fields keyed by field name.
|
|
56
|
+
All fields keyed by field name. ``flags`` are serialised as their
|
|
57
|
+
string values.
|
|
35
58
|
"""
|
|
36
59
|
return {
|
|
37
60
|
"true_count": self.true_count,
|
|
@@ -39,6 +62,7 @@ class BooleanStats:
|
|
|
39
62
|
"true_ratio": self.true_ratio,
|
|
40
63
|
"false_ratio": self.false_ratio,
|
|
41
64
|
"mode": self.mode,
|
|
65
|
+
"flags": [str(f) for f in self.flags],
|
|
42
66
|
}
|
|
43
67
|
|
|
44
68
|
|
|
@@ -23,7 +23,7 @@ import polars as pl
|
|
|
23
23
|
|
|
24
24
|
from ._base import ColumnBatchProfiler
|
|
25
25
|
from ._config import BooleanStats
|
|
26
|
-
from ._boolean_config import BooleanProfileResult
|
|
26
|
+
from ._boolean_config import BooleanFlag, BooleanProfileResult
|
|
27
27
|
from ..models._data_types import _INT_DTYPES
|
|
28
28
|
|
|
29
29
|
# ---------------------------------------------------------------------------
|
|
@@ -114,6 +114,13 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
114
114
|
bool_series = self._to_bool_series(series)
|
|
115
115
|
non_null_count = bool_series.len()
|
|
116
116
|
|
|
117
|
+
# FormatMismatch: a value that is present (non-null after the
|
|
118
|
+
# orchestrator's Effective-Null normalization) but falls outside the
|
|
119
|
+
# recognized true/false vocabulary is dropped by coercion. A shortfall
|
|
120
|
+
# in the non-null count means the column holds dirty, uncoercible data.
|
|
121
|
+
if non_null_count < series.drop_nulls().len():
|
|
122
|
+
profile.flags.append(BooleanFlag.FormatMismatch)
|
|
123
|
+
|
|
117
124
|
if non_null_count == 0:
|
|
118
125
|
if series.drop_nulls().len() > 0 and col_name in user_overrides:
|
|
119
126
|
from ._base import OverrideCoercionError
|
|
@@ -658,15 +658,18 @@ class ProfileConfig:
|
|
|
658
658
|
numeric_sentinels: InitVar[Optional[dict[str, list[float]]]] = None
|
|
659
659
|
string_sentinels: InitVar[Optional[dict[str, list[str]]]] = None
|
|
660
660
|
datetime_epoch_units: InitVar[Optional[dict[str, Union[str, EpochUnit]]]] = None
|
|
661
|
+
datetime_formats: InitVar[Optional[dict[str, str]]] = None
|
|
661
662
|
_numeric_sentinels: dict[str, list[float]] = field(default_factory=dict, init=False)
|
|
662
663
|
_string_sentinels: dict[str, list[str]] = field(default_factory=dict, init=False)
|
|
663
664
|
_datetime_epoch_units: dict[str, EpochUnit] = field(default_factory=dict, init=False)
|
|
665
|
+
_datetime_formats: dict[str, str] = field(default_factory=dict, init=False)
|
|
664
666
|
|
|
665
667
|
def __post_init__(
|
|
666
668
|
self,
|
|
667
669
|
numeric_sentinels: Optional[dict[str, list[float]]],
|
|
668
670
|
string_sentinels: Optional[dict[str, list[str]]],
|
|
669
671
|
datetime_epoch_units: Optional[dict[str, Union[str, EpochUnit]]] = None,
|
|
672
|
+
datetime_formats: Optional[dict[str, str]] = None,
|
|
670
673
|
) -> None:
|
|
671
674
|
if numeric_sentinels is not None and not isinstance(numeric_sentinels, property):
|
|
672
675
|
for k, vals in numeric_sentinels.items():
|
|
@@ -677,6 +680,9 @@ class ProfileConfig:
|
|
|
677
680
|
if datetime_epoch_units is not None and not isinstance(datetime_epoch_units, property):
|
|
678
681
|
for k, val in datetime_epoch_units.items():
|
|
679
682
|
self.set_datetime_epoch_unit(k, val)
|
|
683
|
+
if datetime_formats is not None and not isinstance(datetime_formats, property):
|
|
684
|
+
for k, fmt in datetime_formats.items():
|
|
685
|
+
self.set_datetime_format(k, fmt)
|
|
680
686
|
|
|
681
687
|
@property
|
|
682
688
|
def numeric_sentinels(self) -> MappingProxyType[str, list[float]]:
|
|
@@ -731,6 +737,27 @@ class ProfileConfig:
|
|
|
731
737
|
"""
|
|
732
738
|
return MappingProxyType(self._datetime_epoch_units)
|
|
733
739
|
|
|
740
|
+
@property
|
|
741
|
+
def datetime_formats(self) -> MappingProxyType[str, str]:
|
|
742
|
+
"""
|
|
743
|
+
Get the per-column declared datetime format strings.
|
|
744
|
+
|
|
745
|
+
Keys are column names; values are strftime-style format strings (e.g.
|
|
746
|
+
``{"Year": "%Y"}``) applied by ``DatetimeProfiler`` with
|
|
747
|
+
``strict=False`` when coercing that column to Datetime. A declaration
|
|
748
|
+
applies to any column profiled as Datetime, whether overridden or
|
|
749
|
+
auto-detected. Format strings are not validated against strftime
|
|
750
|
+
grammar at declaration time — a bad format surfaces at profiling time.
|
|
751
|
+
Defaults to an empty dict — columns with no declaration fall back to
|
|
752
|
+
Polars format inference.
|
|
753
|
+
|
|
754
|
+
Returns
|
|
755
|
+
-------
|
|
756
|
+
MappingProxyType[str, str]
|
|
757
|
+
Read-only mapping of column names to declared datetime formats.
|
|
758
|
+
"""
|
|
759
|
+
return MappingProxyType(self._datetime_formats)
|
|
760
|
+
|
|
734
761
|
def set_numeric_sentinel(self, column: str | list[str], values: list[float]) -> None:
|
|
735
762
|
"""
|
|
736
763
|
Set numeric sentinel values for one or more columns.
|
|
@@ -809,6 +836,38 @@ class ProfileConfig:
|
|
|
809
836
|
for c in columns:
|
|
810
837
|
self._datetime_epoch_units[c] = enum_unit
|
|
811
838
|
|
|
839
|
+
def set_datetime_format(self, column: str | list[str], format: str) -> None:
|
|
840
|
+
"""
|
|
841
|
+
Declare a datetime format string for one or more columns.
|
|
842
|
+
|
|
843
|
+
The format is applied by ``DatetimeProfiler`` with ``strict=False``
|
|
844
|
+
when coercing the column to Datetime, and is not validated against
|
|
845
|
+
strftime grammar or the data at declaration time — a bad format
|
|
846
|
+
surfaces at profiling time, consistent with ``set_column_type`` and
|
|
847
|
+
``set_datetime_epoch_unit``.
|
|
848
|
+
|
|
849
|
+
Parameters
|
|
850
|
+
----------
|
|
851
|
+
column : str or list of str
|
|
852
|
+
Column name or list of column names to apply the format to.
|
|
853
|
+
format : str
|
|
854
|
+
A non-empty strftime-style format string (e.g. ``"%Y"``).
|
|
855
|
+
|
|
856
|
+
Raises
|
|
857
|
+
------
|
|
858
|
+
ValueError
|
|
859
|
+
If any column name is empty, or if `format` is not a non-empty
|
|
860
|
+
string.
|
|
861
|
+
"""
|
|
862
|
+
if not isinstance(format, str) or not format:
|
|
863
|
+
raise ValueError("format must be a non-empty string.")
|
|
864
|
+
|
|
865
|
+
columns = [column] if isinstance(column, str) else column
|
|
866
|
+
for c in columns:
|
|
867
|
+
if not isinstance(c, str) or not c:
|
|
868
|
+
raise ValueError("column name must be a non-empty string.")
|
|
869
|
+
self._datetime_formats[c] = format
|
|
870
|
+
|
|
812
871
|
def to_dict(self) -> dict:
|
|
813
872
|
"""
|
|
814
873
|
Serialise the config to a plain dictionary.
|
|
@@ -837,6 +896,7 @@ class ProfileConfig:
|
|
|
837
896
|
"numeric_sentinels": {k: list(v) for k, v in self.numeric_sentinels.items()},
|
|
838
897
|
"string_sentinels": {k: list(v) for k, v in self.string_sentinels.items()},
|
|
839
898
|
"datetime_epoch_units": {k: v.value for k, v in self.datetime_epoch_units.items()},
|
|
899
|
+
"datetime_formats": {k: v for k, v in self.datetime_formats.items()},
|
|
840
900
|
}
|
|
841
901
|
|
|
842
902
|
@classmethod
|
|
@@ -884,6 +944,7 @@ class ProfileConfig:
|
|
|
884
944
|
numeric_sentinels=data.get("numeric_sentinels", {}),
|
|
885
945
|
string_sentinels=data.get("string_sentinels", {}),
|
|
886
946
|
datetime_epoch_units=data.get("datetime_epoch_units", {}),
|
|
947
|
+
datetime_formats=data.get("datetime_formats", {}),
|
|
887
948
|
)
|
|
888
949
|
|
|
889
950
|
return config
|
|
@@ -68,9 +68,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
68
68
|
self,
|
|
69
69
|
config: DatetimeProfileConfig | None = None,
|
|
70
70
|
epoch_units: dict[str, str] | None = None,
|
|
71
|
+
formats: dict[str, str] | None = None,
|
|
71
72
|
) -> None:
|
|
72
73
|
self._config = config if config is not None else DatetimeProfileConfig()
|
|
73
74
|
self._epoch_units = epoch_units or {}
|
|
75
|
+
self._formats = formats or {}
|
|
74
76
|
|
|
75
77
|
# ------------------------------------------------------------------
|
|
76
78
|
# Public API
|
|
@@ -112,8 +114,14 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
112
114
|
|
|
113
115
|
def _coerce_to_datetime(self, series: pl.Series, col_name: str) -> pl.Series | None:
|
|
114
116
|
if series.dtype in (pl.Utf8, pl.String):
|
|
117
|
+
declared_format = self._formats.get(col_name)
|
|
115
118
|
try:
|
|
116
|
-
|
|
119
|
+
if declared_format is not None:
|
|
120
|
+
coerced = series.str.to_datetime(
|
|
121
|
+
format=declared_format, strict=False
|
|
122
|
+
)
|
|
123
|
+
else:
|
|
124
|
+
coerced = series.str.to_datetime(strict=False)
|
|
117
125
|
return coerced if coerced.drop_nulls().len() > 0 else None
|
|
118
126
|
except pl.exceptions.ComputeError:
|
|
119
127
|
return None
|
|
@@ -139,22 +147,35 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
139
147
|
|
|
140
148
|
available = []
|
|
141
149
|
coerced_cache = {}
|
|
150
|
+
format_mismatch: dict[str, bool] = {}
|
|
142
151
|
for col_name in self._resolve_columns(df.columns, columns):
|
|
143
|
-
|
|
152
|
+
original = df[col_name]
|
|
153
|
+
series = self._coerce_to_datetime(original, col_name)
|
|
144
154
|
if series is not None:
|
|
145
155
|
available.append(col_name)
|
|
146
156
|
coerced_cache[col_name] = series
|
|
157
|
+
# FormatMismatch: a value that is present (non-null after the
|
|
158
|
+
# orchestrator's Effective-Null normalization) but fails
|
|
159
|
+
# coercion becomes null here. Compare non-null counts before
|
|
160
|
+
# and after coercion; a shortfall means dirty, uncoercible data.
|
|
161
|
+
format_mismatch[col_name] = (
|
|
162
|
+
series.drop_nulls().len() < original.drop_nulls().len()
|
|
163
|
+
)
|
|
147
164
|
elif col_name in user_overrides:
|
|
148
|
-
if
|
|
165
|
+
if original.drop_nulls().len() > 0:
|
|
149
166
|
from ._base import OverrideCoercionError
|
|
150
167
|
raise OverrideCoercionError(
|
|
151
|
-
f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime."
|
|
168
|
+
f"Column {col_name!r} with TypeFlag.UserOverride completely failed coercion to Datetime. "
|
|
169
|
+
f"If Polars cannot infer the format, declare one explicitly via "
|
|
170
|
+
f"ProfileConfig.set_datetime_format({col_name!r}, <format>) (e.g. '%Y' for bare years)."
|
|
152
171
|
)
|
|
153
172
|
|
|
154
173
|
result.analysed_columns = available
|
|
155
174
|
|
|
156
175
|
for col_name in available:
|
|
157
176
|
profile = self._profile_column(coerced_cache[col_name], df.height, now)
|
|
177
|
+
if format_mismatch.get(col_name):
|
|
178
|
+
profile.flags.append(DatetimeFlag.FormatMismatch)
|
|
158
179
|
result.columns[col_name] = profile
|
|
159
180
|
|
|
160
181
|
return result
|
|
@@ -144,6 +144,13 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
144
144
|
clean = f64.drop_nulls()
|
|
145
145
|
profile = NumericStats()
|
|
146
146
|
|
|
147
|
+
# FormatMismatch: a value that is present (non-null after the
|
|
148
|
+
# orchestrator's Effective-Null normalization) but fails the
|
|
149
|
+
# Float64 cast becomes null here. A shortfall in the non-null
|
|
150
|
+
# count means the column holds dirty, uncoercible data.
|
|
151
|
+
if clean.len() < series.drop_nulls().len():
|
|
152
|
+
profile.flags.append(NumericFlag.FormatMismatch)
|
|
153
|
+
|
|
147
154
|
if clean.len() == 0:
|
|
148
155
|
if series.drop_nulls().len() > 0 and col in user_overrides:
|
|
149
156
|
from ._base import OverrideCoercionError
|
|
@@ -28,7 +28,7 @@ from typing import Any
|
|
|
28
28
|
import numpy as np
|
|
29
29
|
import polars as pl
|
|
30
30
|
|
|
31
|
-
from ._base import ModalityProfiler, ColumnBatchProfiler
|
|
31
|
+
from ._base import ModalityProfiler, ColumnBatchProfiler, OverrideCoercionError
|
|
32
32
|
from ._tabular import TabularProfiler
|
|
33
33
|
from ._categorical import CategoricalProfiler
|
|
34
34
|
from ._datetime_profiler import DatetimeProfiler
|
|
@@ -114,6 +114,10 @@ class StructuralProfiler:
|
|
|
114
114
|
------
|
|
115
115
|
TypeError
|
|
116
116
|
When ``data`` is not a ``polars.DataFrame``.
|
|
117
|
+
OverrideCoercionError
|
|
118
|
+
When a column carrying ``TypeFlag.UserOverride`` completely fails
|
|
119
|
+
coercion to its overridden ``SemanticType`` (zero usable values
|
|
120
|
+
remain despite the original column having non-null data).
|
|
117
121
|
"""
|
|
118
122
|
if not isinstance(data, pl.DataFrame):
|
|
119
123
|
raise TypeError(
|
|
@@ -220,13 +224,22 @@ class StructuralProfiler:
|
|
|
220
224
|
type_to_cols.setdefault(sem_type, []).append(col_name)
|
|
221
225
|
|
|
222
226
|
pc = self.config.profiling
|
|
227
|
+
profiling_frame = _resolve_effective_nulls(
|
|
228
|
+
data,
|
|
229
|
+
numeric_sentinels=dict(pc.numeric_sentinels),
|
|
230
|
+
string_sentinels=dict(pc.string_sentinels),
|
|
231
|
+
)
|
|
223
232
|
for sem_type, cols in type_to_cols.items():
|
|
224
233
|
if sem_type == SemanticType.Numeric:
|
|
225
234
|
profiler = NumericProfiler(config=pc.numeric)
|
|
226
235
|
elif sem_type == SemanticType.Categorical:
|
|
227
236
|
profiler = CategoricalProfiler(config=pc.categorical)
|
|
228
237
|
elif sem_type == SemanticType.Datetime:
|
|
229
|
-
profiler = DatetimeProfiler(
|
|
238
|
+
profiler = DatetimeProfiler(
|
|
239
|
+
config=pc.datetime_,
|
|
240
|
+
epoch_units=pc.datetime_epoch_units,
|
|
241
|
+
formats=pc.datetime_formats,
|
|
242
|
+
)
|
|
230
243
|
else:
|
|
231
244
|
profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
|
|
232
245
|
if profiler_cls is None:
|
|
@@ -237,10 +250,12 @@ class StructuralProfiler:
|
|
|
237
250
|
c for c in cols
|
|
238
251
|
if result.columns.get(c) and TypeFlag.UserOverride in result.columns[c].type_flags
|
|
239
252
|
}
|
|
240
|
-
batch = profiler.profile(
|
|
253
|
+
batch = profiler.profile(profiling_frame, columns=cols, user_overrides=user_overrides)
|
|
241
254
|
for col_name in batch.analysed_columns:
|
|
242
255
|
if col_name in result.columns:
|
|
243
256
|
result.columns[col_name].stats = batch.columns.get(col_name)
|
|
257
|
+
except OverrideCoercionError:
|
|
258
|
+
raise
|
|
244
259
|
except Exception:
|
|
245
260
|
pass
|
|
246
261
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_nonlinearity_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.5 → dataforge_ml-2.0.7}/src/dataforge_ml/profiling/_type_detection_config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|