dataforge-ml 0.8.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/PKG-INFO +1 -1
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/pyproject.toml +1 -1
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/__init__.py +4 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_base.py +11 -14
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_profiler.py +4 -41
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical.py +3 -44
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_profiler.py +1 -1
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_profiler.py +7 -34
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_profiler.py +8 -69
- dataforge_ml-0.9.0/src/dataforge_ml/profiling/_null_detection.py +22 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_profiler.py +3 -32
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_tabular.py +25 -26
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_profiler.py +3 -3
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_profiler.py +4 -42
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/config.py +102 -1
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/structural.py +36 -20
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/SOURCES.txt +1 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/LICENSE +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/README.md +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/setup.cfg +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from .structural import StructuralProfiler
|
|
2
2
|
from .config import (
|
|
3
3
|
ProfileConfig,
|
|
4
|
+
PipelineConfig,
|
|
5
|
+
PipelinePhase,
|
|
4
6
|
SemanticType,
|
|
5
7
|
Modality,
|
|
6
8
|
TypeFlag,
|
|
@@ -19,6 +21,8 @@ from ._base import ModalityProfiler
|
|
|
19
21
|
__all__ = [
|
|
20
22
|
"StructuralProfiler",
|
|
21
23
|
"ProfileConfig",
|
|
24
|
+
"PipelineConfig",
|
|
25
|
+
"PipelinePhase",
|
|
22
26
|
"SemanticType",
|
|
23
27
|
"Modality",
|
|
24
28
|
"TypeFlag",
|
|
@@ -3,9 +3,9 @@ Abstract base classes for all structural profilers.
|
|
|
3
3
|
|
|
4
4
|
Hierarchy
|
|
5
5
|
---------
|
|
6
|
-
Profiling[R] — root:
|
|
7
|
-
├── ColumnBatchProfiler[R] — registry tier:
|
|
8
|
-
│ │
|
|
6
|
+
Profiling[R] — root: thin ABC, provides _resolve_columns
|
|
7
|
+
├── ColumnBatchProfiler[R] — registry tier: profile(df, columns) processes a
|
|
8
|
+
│ │ typed column batch; no config, no eligibility gates
|
|
9
9
|
│ ├── NumericProfiler
|
|
10
10
|
│ ├── CategoricalProfiler
|
|
11
11
|
│ ├── DatetimeProfiler
|
|
@@ -26,22 +26,19 @@ import polars as pl
|
|
|
26
26
|
from abc import abstractmethod, ABC
|
|
27
27
|
from typing import Generic, TypeVar
|
|
28
28
|
|
|
29
|
-
from .config import DatasetStats
|
|
29
|
+
from .config import DatasetStats
|
|
30
30
|
|
|
31
31
|
R = TypeVar("R")
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
class Profiling(ABC, Generic[R]):
|
|
35
35
|
"""
|
|
36
|
-
Root base for all profilers.
|
|
36
|
+
Root base for all profilers. Thin ABC — no config state.
|
|
37
37
|
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
Sub-processors are pure batch processors: given a DataFrame and a column
|
|
39
|
+
list, return a result. No routing, no scoping, no config.
|
|
40
40
|
"""
|
|
41
41
|
|
|
42
|
-
def __init__(self, config: ProfileConfig | None = None):
|
|
43
|
-
self.config = config or ProfileConfig()
|
|
44
|
-
|
|
45
42
|
@abstractmethod
|
|
46
43
|
def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
|
|
47
44
|
|
|
@@ -62,11 +59,11 @@ class ColumnBatchProfiler(Profiling[R]):
|
|
|
62
59
|
|
|
63
60
|
Contract
|
|
64
61
|
--------
|
|
65
|
-
- __init__
|
|
66
|
-
StructuralProfiler to instantiate any registered profiler uniformly via
|
|
67
|
-
profiler_cls(config=self.config)
|
|
62
|
+
- __init__ takes no arguments (instantiated as profiler_cls()).
|
|
68
63
|
- profile(df, columns) receives the full DataFrame and the list of same-type
|
|
69
|
-
column names to process.
|
|
64
|
+
column names to process. Profiles every column in the list without any
|
|
65
|
+
internal eligibility gate or config consultation.
|
|
66
|
+
- Returns a result with:
|
|
70
67
|
.columns: dict[str, <Stats>] — per-column stats
|
|
71
68
|
.analysed_columns: list[str] — columns actually profiled
|
|
72
69
|
"""
|
|
@@ -22,11 +22,7 @@ from __future__ import annotations
|
|
|
22
22
|
import polars as pl
|
|
23
23
|
|
|
24
24
|
from ._base import ColumnBatchProfiler
|
|
25
|
-
from .config import (
|
|
26
|
-
ProfileConfig,
|
|
27
|
-
BooleanStats,
|
|
28
|
-
SemanticType,
|
|
29
|
-
)
|
|
25
|
+
from .config import BooleanStats
|
|
30
26
|
from ._boolean_config import BooleanProfileResult
|
|
31
27
|
from ..models._data_types import _INT_DTYPES
|
|
32
28
|
|
|
@@ -42,22 +38,10 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
42
38
|
"""
|
|
43
39
|
Boolean column profiler for Polars DataFrames.
|
|
44
40
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
- Its dtype is an integer with values exclusively in {0, 1}, OR
|
|
48
|
-
- It has a SemanticType.Boolean override in ProfileConfig.column_overrides
|
|
49
|
-
|
|
50
|
-
Non-eligible columns in the provided list are silently skipped.
|
|
51
|
-
|
|
52
|
-
Parameters
|
|
53
|
-
----------
|
|
54
|
-
config : ProfileConfig | None
|
|
55
|
-
Shared profiling configuration.
|
|
41
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
42
|
+
no internal eligibility gate.
|
|
56
43
|
"""
|
|
57
44
|
|
|
58
|
-
def __init__(self, config: ProfileConfig | None = None) -> None:
|
|
59
|
-
super().__init__(config)
|
|
60
|
-
|
|
61
45
|
# ------------------------------------------------------------------
|
|
62
46
|
# Public API
|
|
63
47
|
# ------------------------------------------------------------------
|
|
@@ -69,23 +53,6 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
69
53
|
) -> BooleanProfileResult:
|
|
70
54
|
return self._run(data, columns)
|
|
71
55
|
|
|
72
|
-
# ------------------------------------------------------------------
|
|
73
|
-
# Eligibility
|
|
74
|
-
# ------------------------------------------------------------------
|
|
75
|
-
|
|
76
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
77
|
-
override = self.config.column_overrides.get(series.name)
|
|
78
|
-
|
|
79
|
-
# Explicit override — trust it
|
|
80
|
-
if override == SemanticType.Boolean:
|
|
81
|
-
return True
|
|
82
|
-
|
|
83
|
-
# Another override takes precedence over auto-detection
|
|
84
|
-
if override is not None:
|
|
85
|
-
return False
|
|
86
|
-
|
|
87
|
-
return True
|
|
88
|
-
|
|
89
56
|
# ------------------------------------------------------------------
|
|
90
57
|
# Orchestration
|
|
91
58
|
# ------------------------------------------------------------------
|
|
@@ -97,11 +64,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
97
64
|
) -> BooleanProfileResult:
|
|
98
65
|
result = BooleanProfileResult()
|
|
99
66
|
|
|
100
|
-
available = [
|
|
101
|
-
c
|
|
102
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
103
|
-
if self._eligible(df[c])
|
|
104
|
-
]
|
|
67
|
+
available = self._resolve_columns(df.columns, columns)
|
|
105
68
|
result.analysed_columns = available
|
|
106
69
|
|
|
107
70
|
for col_name in available:
|
|
@@ -45,10 +45,6 @@ from ._categorical_config import (
|
|
|
45
45
|
RareCategoryStats,
|
|
46
46
|
ImbalanceMetrics,
|
|
47
47
|
)
|
|
48
|
-
from .config import (
|
|
49
|
-
ProfileConfig,
|
|
50
|
-
SemanticType,
|
|
51
|
-
)
|
|
52
48
|
|
|
53
49
|
# ---------------------------------------------------------------------------
|
|
54
50
|
# Module-level thresholds (documented so callers can see what drives flags)
|
|
@@ -65,29 +61,10 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
65
61
|
"""
|
|
66
62
|
Categorical profiler for Polars DataFrames.
|
|
67
63
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
columns : list[str]
|
|
71
|
-
Columns to profile. The profiler intersects this list with
|
|
72
|
-
the DataFrame's actual columns at runtime.
|
|
73
|
-
config : ProfileConfig | None
|
|
74
|
-
Shared profiling configuration (used for chunk_size, etc.).
|
|
75
|
-
|
|
76
|
-
Usage
|
|
77
|
-
-----
|
|
78
|
-
>>> profiler = CategoricalProfiler(
|
|
79
|
-
... columns=["status", "country", "product_type"],
|
|
80
|
-
... )
|
|
81
|
-
>>> result = profiler.profile(df)
|
|
82
|
-
>>> print(result)
|
|
64
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
65
|
+
no internal eligibility gate.
|
|
83
66
|
"""
|
|
84
67
|
|
|
85
|
-
def __init__(
|
|
86
|
-
self,
|
|
87
|
-
config: ProfileConfig | None = None,
|
|
88
|
-
) -> None:
|
|
89
|
-
super().__init__(config)
|
|
90
|
-
|
|
91
68
|
# ------------------------------------------------------------------
|
|
92
69
|
# Public API
|
|
93
70
|
# ------------------------------------------------------------------
|
|
@@ -103,19 +80,6 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
103
80
|
# Orchestration
|
|
104
81
|
# ------------------------------------------------------------------
|
|
105
82
|
|
|
106
|
-
def _eligible(
|
|
107
|
-
self,
|
|
108
|
-
series: pl.Series,
|
|
109
|
-
) -> bool:
|
|
110
|
-
override = self.config.column_overrides.get(series.name)
|
|
111
|
-
if override == SemanticType.Categorical:
|
|
112
|
-
return True
|
|
113
|
-
|
|
114
|
-
if override is not None:
|
|
115
|
-
return False
|
|
116
|
-
|
|
117
|
-
return True
|
|
118
|
-
|
|
119
83
|
def _run(
|
|
120
84
|
self,
|
|
121
85
|
df: pl.DataFrame,
|
|
@@ -123,12 +87,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
123
87
|
) -> CategoricalProfileResult:
|
|
124
88
|
result = CategoricalProfileResult()
|
|
125
89
|
|
|
126
|
-
|
|
127
|
-
available = [
|
|
128
|
-
c
|
|
129
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
130
|
-
if self._eligible(df[c])
|
|
131
|
-
]
|
|
90
|
+
available = self._resolve_columns(df.columns, columns)
|
|
132
91
|
result.analysed_columns = available
|
|
133
92
|
|
|
134
93
|
n_rows = df.height
|
{dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
@@ -125,7 +125,7 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
125
125
|
near_redundant_threshold: float = _NEAR_REDUNDANT_THRESHOLD,
|
|
126
126
|
top_n_feature_target: int = _TOP_N_FEATURE_TARGET,
|
|
127
127
|
) -> None:
|
|
128
|
-
super().__init__(
|
|
128
|
+
super().__init__()
|
|
129
129
|
self._numeric_columns = numeric_columns
|
|
130
130
|
self._categorical_columns = categorical_columns or []
|
|
131
131
|
self._threshold = near_redundant_threshold
|
|
@@ -43,10 +43,6 @@ from datetime import datetime, timezone
|
|
|
43
43
|
import polars as pl
|
|
44
44
|
|
|
45
45
|
from ._base import ColumnBatchProfiler
|
|
46
|
-
from .config import (
|
|
47
|
-
ProfileConfig,
|
|
48
|
-
SemanticType,
|
|
49
|
-
)
|
|
50
46
|
from ._datetime_config import (
|
|
51
47
|
DatetimeProfileResult,
|
|
52
48
|
DatetimeStats,
|
|
@@ -90,20 +86,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
90
86
|
"""
|
|
91
87
|
Datetime distribution profiler for Polars DataFrames.
|
|
92
88
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
columns
|
|
96
|
-
Columns to profile. Non-datetime columns are skipped with a warning.
|
|
97
|
-
config : ProfileConfig | None
|
|
98
|
-
Shared profiling configuration.
|
|
89
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
90
|
+
no internal eligibility gate. String columns are coerced to Datetime;
|
|
91
|
+
columns that cannot be coerced are silently skipped.
|
|
99
92
|
"""
|
|
100
93
|
|
|
101
|
-
def __init__(
|
|
102
|
-
self,
|
|
103
|
-
config: ProfileConfig | None = None,
|
|
104
|
-
) -> None:
|
|
105
|
-
super().__init__(config)
|
|
106
|
-
|
|
107
94
|
# ------------------------------------------------------------------
|
|
108
95
|
# Public API
|
|
109
96
|
# ------------------------------------------------------------------
|
|
@@ -119,35 +106,21 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
119
106
|
# Orchestration
|
|
120
107
|
# ------------------------------------------------------------------
|
|
121
108
|
|
|
122
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
123
|
-
override = self.config.column_overrides.get(series.name)
|
|
124
|
-
|
|
125
|
-
if override == SemanticType.Datetime:
|
|
126
|
-
return True
|
|
127
|
-
if override is not None:
|
|
128
|
-
return False
|
|
129
|
-
|
|
130
|
-
return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
|
|
131
|
-
|
|
132
109
|
def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
|
|
133
110
|
if series.dtype in (pl.Utf8, pl.String):
|
|
134
111
|
coerced = series.str.to_datetime(strict=False)
|
|
135
112
|
return coerced if coerced.drop_nulls().len() > 0 else None
|
|
136
|
-
|
|
113
|
+
if _is_datetime_dtype(series.dtype):
|
|
114
|
+
return series
|
|
115
|
+
return None
|
|
137
116
|
|
|
138
117
|
def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
|
|
139
118
|
result = DatetimeProfileResult()
|
|
140
119
|
now = datetime.now(tz=timezone.utc)
|
|
141
120
|
|
|
142
|
-
candidates = [
|
|
143
|
-
c
|
|
144
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
145
|
-
if self._eligible(df[c])
|
|
146
|
-
]
|
|
147
|
-
|
|
148
121
|
available = []
|
|
149
122
|
coerced_cache = {}
|
|
150
|
-
for col_name in candidates:
|
|
123
|
+
for col_name in self._resolve_columns(df.columns, columns):
|
|
151
124
|
series = self._coerce_to_datetime(df[col_name])
|
|
152
125
|
if series is not None:
|
|
153
126
|
available.append(col_name)
|
{dataforge_ml-0.8.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
@@ -3,19 +3,10 @@ MissingnessProfiler – Phase 1 extension: Missingness Profiling.
|
|
|
3
3
|
|
|
4
4
|
Eligibility model
|
|
5
5
|
-----------------
|
|
6
|
-
Effective-null detection is
|
|
7
|
-
overrides acting only as suppressors, not as enablers:
|
|
6
|
+
Effective-null detection is purely dtype-driven — no SemanticType overrides:
|
|
8
7
|
|
|
9
|
-
sentinel-string detection → runs
|
|
10
|
-
|
|
11
|
-
(those types cannot have meaningful sentinel strings)
|
|
12
|
-
|
|
13
|
-
Inf / NaN expansion → runs when dtype is Float32/Float64
|
|
14
|
-
never suppressed (Inf in a float column is always
|
|
15
|
-
effectively missing regardless of semantic label)
|
|
16
|
-
|
|
17
|
-
column_overrides is SPARSE — most columns will have no entry.
|
|
18
|
-
Absence of an override is not a signal; it means "trust the dtype".
|
|
8
|
+
sentinel-string detection → runs for every String/Utf8 column unconditionally
|
|
9
|
+
Inf / NaN expansion → runs for every Float32/Float64 column unconditionally
|
|
19
10
|
"""
|
|
20
11
|
|
|
21
12
|
from __future__ import annotations
|
|
@@ -24,13 +15,13 @@ from __future__ import annotations
|
|
|
24
15
|
import polars as pl
|
|
25
16
|
|
|
26
17
|
from ._base import DatasetLevelProfiler
|
|
27
|
-
from .config import ProfileConfig, SemanticType
|
|
28
18
|
from ._missingness_config import (
|
|
29
19
|
ColumnMissingnessProfile,
|
|
30
20
|
MissingnessFlag,
|
|
31
21
|
MissingnessProfileResult,
|
|
32
22
|
MissingSeverity,
|
|
33
23
|
)
|
|
24
|
+
from ._null_detection import _SENTINEL_STRINGS, _inf_eligible, _sentinel_eligible
|
|
34
25
|
|
|
35
26
|
# ---------------------------------------------------------------------------
|
|
36
27
|
# Thresholds
|
|
@@ -43,52 +34,12 @@ _SEVERITY_HIGH = 0.20
|
|
|
43
34
|
_MAR_CORRELATION_THRESHOLD = 0.60
|
|
44
35
|
_COL_DROP_THRESHOLD = 0.50
|
|
45
36
|
|
|
46
|
-
_SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
|
|
47
|
-
|
|
48
|
-
# Overrides that suppress sentinel-string detection on a String column.
|
|
49
|
-
# If a column is String but the user says "this is Numeric", treating
|
|
50
|
-
# "NA" as a sentinel is correct — but if they say Categorical or Text,
|
|
51
|
-
# sentinel detection still makes sense and should run.
|
|
52
|
-
_SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
|
|
53
|
-
{
|
|
54
|
-
SemanticType.Numeric,
|
|
55
|
-
SemanticType.Datetime,
|
|
56
|
-
SemanticType.Boolean,
|
|
57
|
-
SemanticType.Identifier,
|
|
58
|
-
}
|
|
59
|
-
)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
|
|
63
|
-
"""True when sentinel-string detection should run for this column."""
|
|
64
|
-
if dtype not in (pl.Utf8, pl.String):
|
|
65
|
-
return False
|
|
66
|
-
# Override present and it's a non-text semantic → suppress
|
|
67
|
-
if override is not None and override in _SENTINEL_SUPPRESSING_SEMANTICS:
|
|
68
|
-
return False
|
|
69
|
-
return True
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def _inf_eligible(dtype: pl.DataType) -> bool:
|
|
73
|
-
"""True when Inf/NaN expansion should run. Always dtype-driven, never suppressed."""
|
|
74
|
-
return dtype in (pl.Float32, pl.Float64)
|
|
75
|
-
|
|
76
37
|
|
|
77
38
|
class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
78
|
-
"""
|
|
79
|
-
Missingness profiler for Polars DataFrames.
|
|
80
|
-
|
|
81
|
-
Column scoping
|
|
82
|
-
--------------
|
|
83
|
-
Resolution priority (high → low):
|
|
84
|
-
1. Explicit ``columns`` argument to ``profile()``.
|
|
85
|
-
2. ``config.exclude_columns`` — always removed.
|
|
86
|
-
3. All remaining DataFrame columns.
|
|
87
|
-
"""
|
|
39
|
+
"""Missingness profiler for Polars DataFrames."""
|
|
88
40
|
|
|
89
|
-
def __init__(self
|
|
90
|
-
super().__init__(
|
|
91
|
-
self._config: ProfileConfig = config or ProfileConfig()
|
|
41
|
+
def __init__(self) -> None:
|
|
42
|
+
super().__init__()
|
|
92
43
|
|
|
93
44
|
# ------------------------------------------------------------------
|
|
94
45
|
# Public API
|
|
@@ -117,16 +68,13 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
117
68
|
if n_rows == 0 or not cols:
|
|
118
69
|
return result
|
|
119
70
|
|
|
120
|
-
overrides = self._config.column_overrides # sparse — most keys absent
|
|
121
71
|
indicator_cols: list[pl.Series] = []
|
|
122
72
|
|
|
123
73
|
for col_name in cols:
|
|
124
|
-
override = overrides.get(col_name) # None for most columns
|
|
125
74
|
col_profile, indicator = self._profile_column(
|
|
126
75
|
series=df[col_name],
|
|
127
76
|
col_name=col_name,
|
|
128
77
|
n_rows=n_rows,
|
|
129
|
-
override=override,
|
|
130
78
|
)
|
|
131
79
|
result.columns[col_name] = col_profile
|
|
132
80
|
indicator_cols.append(indicator)
|
|
@@ -173,21 +121,12 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
173
121
|
series: pl.Series,
|
|
174
122
|
col_name: str,
|
|
175
123
|
n_rows: int,
|
|
176
|
-
override: SemanticType | None = None, # sparse — None is the common case
|
|
177
124
|
) -> tuple[ColumnMissingnessProfile, pl.Series]:
|
|
178
|
-
"""
|
|
179
|
-
Compute standard + effective null counts for one column.
|
|
180
|
-
|
|
181
|
-
Eligibility is dtype-first:
|
|
182
|
-
- sentinel strings → String dtype, unless override suppresses it
|
|
183
|
-
- Inf/NaN → Float dtype, always (never suppressed)
|
|
184
|
-
- everything else → standard Polars null only
|
|
185
|
-
"""
|
|
186
125
|
profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
|
|
187
126
|
dtype = series.dtype
|
|
188
127
|
std_null = series.is_null()
|
|
189
128
|
|
|
190
|
-
if _sentinel_eligible(dtype, override):
|
|
129
|
+
if _sentinel_eligible(dtype):
|
|
191
130
|
eff_null = (
|
|
192
131
|
std_null
|
|
193
132
|
| (series.str.strip_chars() == "")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
_null_detection – shared dtype-driven null primitives for Phase 1.
|
|
3
|
+
|
|
4
|
+
Single authority for what counts as "effectively null" across the entire
|
|
5
|
+
Phase 1 implementation. No config, no SemanticType overrides, no state.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
|
|
12
|
+
_SENTINEL_STRINGS: frozenset[str] = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _sentinel_eligible(dtype: pl.DataType) -> bool:
|
|
16
|
+
"""True when sentinel-string detection should run for this column (String/Utf8 only)."""
|
|
17
|
+
return dtype in (pl.Utf8, pl.String)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _inf_eligible(dtype: pl.DataType) -> bool:
|
|
21
|
+
"""True when Inf/NaN expansion should run (Float32/Float64 only)."""
|
|
22
|
+
return dtype in (pl.Float32, pl.Float64)
|
|
@@ -35,10 +35,6 @@ from __future__ import annotations
|
|
|
35
35
|
import polars as pl
|
|
36
36
|
|
|
37
37
|
from ._base import ColumnBatchProfiler
|
|
38
|
-
from .config import (
|
|
39
|
-
ProfileConfig,
|
|
40
|
-
SemanticType,
|
|
41
|
-
)
|
|
42
38
|
from ._correlation_profiler import _INT_DTYPES
|
|
43
39
|
from ._numeric_config import (
|
|
44
40
|
NumericProfileResult,
|
|
@@ -80,21 +76,10 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
80
76
|
"""
|
|
81
77
|
Numeric distribution profiler for Polars DataFrames.
|
|
82
78
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
columns : list[str]
|
|
86
|
-
Columns to profile. Non-numeric or absent columns are skipped
|
|
87
|
-
with a warning; they do not raise.
|
|
88
|
-
config : ProfileConfig | None
|
|
89
|
-
Shared profiling configuration.
|
|
79
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
80
|
+
no internal eligibility gate.
|
|
90
81
|
"""
|
|
91
82
|
|
|
92
|
-
def __init__(
|
|
93
|
-
self,
|
|
94
|
-
config: ProfileConfig | None = None,
|
|
95
|
-
) -> None:
|
|
96
|
-
super().__init__(config)
|
|
97
|
-
|
|
98
83
|
# ------------------------------------------------------------------
|
|
99
84
|
# Public API
|
|
100
85
|
# ------------------------------------------------------------------
|
|
@@ -110,16 +95,6 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
110
95
|
# Orchestration
|
|
111
96
|
# ------------------------------------------------------------------
|
|
112
97
|
|
|
113
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
114
|
-
override = self.config.column_overrides.get(series.name)
|
|
115
|
-
if override == SemanticType.Numeric:
|
|
116
|
-
return True
|
|
117
|
-
|
|
118
|
-
if override is not None:
|
|
119
|
-
return False
|
|
120
|
-
|
|
121
|
-
return True
|
|
122
|
-
|
|
123
98
|
def _run(
|
|
124
99
|
self,
|
|
125
100
|
df: pl.DataFrame,
|
|
@@ -128,11 +103,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
128
103
|
result = NumericProfileResult()
|
|
129
104
|
n_rows = df.height
|
|
130
105
|
|
|
131
|
-
available = [
|
|
132
|
-
c
|
|
133
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
134
|
-
if self._eligible(df[c])
|
|
135
|
-
]
|
|
106
|
+
available = self._resolve_columns(df.columns, columns)
|
|
136
107
|
result.analysed_columns = available
|
|
137
108
|
|
|
138
109
|
if not available:
|
|
@@ -3,16 +3,18 @@ TabularProfiler – Phase 1: Structural Profiling for tabular datasets.
|
|
|
3
3
|
|
|
4
4
|
All DataFrame operations use Polars (no pandas dependency).
|
|
5
5
|
|
|
6
|
+
A pipeline-agnostic data-catalog tool: receives the full raw DataFrame and
|
|
7
|
+
computes dataset-level stats over every column — no exclusion logic, no
|
|
8
|
+
config dependency.
|
|
9
|
+
|
|
6
10
|
Computes:
|
|
7
|
-
• row / column count (
|
|
11
|
+
• row / column count (full dataset)
|
|
8
12
|
• memory usage + per-column breakdown when threshold exceeded
|
|
9
|
-
• duplicate row count & ratio (
|
|
10
|
-
• overall sparsity (
|
|
11
|
-
• data-type detection (scoped to config.type_detection_columns;
|
|
12
|
-
skipped entirely when None)
|
|
13
|
+
• duplicate row count & ratio (all columns)
|
|
14
|
+
• overall sparsity (all columns)
|
|
13
15
|
|
|
14
16
|
Chunked processing is activated automatically when the DataFrame's
|
|
15
|
-
estimated memory exceeds
|
|
17
|
+
estimated memory exceeds _MEMORY_THRESHOLD_MB.
|
|
16
18
|
"""
|
|
17
19
|
|
|
18
20
|
from __future__ import annotations
|
|
@@ -24,31 +26,32 @@ import polars as pl
|
|
|
24
26
|
from ._base import ModalityProfiler
|
|
25
27
|
from .config import (
|
|
26
28
|
MemoryBreakdown,
|
|
27
|
-
ProfileConfig,
|
|
28
29
|
DatasetStats,
|
|
29
30
|
)
|
|
30
31
|
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Module-level constants (previously sourced from ProfileConfig)
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
_MEMORY_THRESHOLD_MB: float = 500.0
|
|
37
|
+
_CHUNK_SIZE: int = 100_000
|
|
38
|
+
|
|
31
39
|
|
|
32
40
|
class TabularProfiler(ModalityProfiler):
|
|
33
41
|
"""
|
|
34
42
|
Structural profiler for Polars DataFrames.
|
|
35
43
|
|
|
44
|
+
Pipeline-agnostic: accepts no constructor arguments and applies no column
|
|
45
|
+
filtering. Computes dataset-level stats (row count, column count, memory,
|
|
46
|
+
duplicate ratio, overall sparsity) over the complete DataFrame it receives.
|
|
47
|
+
|
|
36
48
|
Usage
|
|
37
49
|
-----
|
|
38
|
-
>>>
|
|
39
|
-
... duplicate_columns=["user_id", "event_time"],
|
|
40
|
-
... sparsity_columns=["age", "income", "postcode"],
|
|
41
|
-
... type_detection_columns=["age", "income", "postcode", "created_at"],
|
|
42
|
-
... memory_threshold_mb=200,
|
|
43
|
-
... )
|
|
44
|
-
>>> profiler = TabularProfiler(config=cfg)
|
|
50
|
+
>>> profiler = TabularProfiler()
|
|
45
51
|
>>> result = profiler.profile(df)
|
|
46
52
|
>>> print(result)
|
|
47
53
|
"""
|
|
48
54
|
|
|
49
|
-
def __init__(self, config: ProfileConfig | None = None):
|
|
50
|
-
super().__init__(config)
|
|
51
|
-
|
|
52
55
|
# ------------------------------------------------------------------
|
|
53
56
|
# Public API
|
|
54
57
|
# ------------------------------------------------------------------
|
|
@@ -77,17 +80,13 @@ class TabularProfiler(ModalityProfiler):
|
|
|
77
80
|
if result.row_count == 0:
|
|
78
81
|
return result
|
|
79
82
|
|
|
80
|
-
# 3.
|
|
83
|
+
# 3. Operate on all columns — no exclusion logic
|
|
81
84
|
all_cols: list[str] = df.columns
|
|
82
|
-
analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]
|
|
83
|
-
|
|
84
|
-
dup_cols = analysed_cols
|
|
85
|
-
missingness_cols = analysed_cols
|
|
86
85
|
|
|
87
86
|
if use_chunks:
|
|
88
|
-
self._chunked_metrics(df, dup_cols, missingness_cols, result)
|
|
87
|
+
self._chunked_metrics(df, all_cols, all_cols, result)
|
|
89
88
|
else:
|
|
90
|
-
self._full_metrics(df, dup_cols, missingness_cols, result)
|
|
89
|
+
self._full_metrics(df, all_cols, all_cols, result)
|
|
91
90
|
|
|
92
91
|
return result
|
|
93
92
|
|
|
@@ -136,7 +135,7 @@ class TabularProfiler(ModalityProfiler):
|
|
|
136
135
|
total_bytes = sum(col_bytes.values())
|
|
137
136
|
|
|
138
137
|
result.memory_bytes = total_bytes
|
|
139
|
-
threshold_bytes =
|
|
138
|
+
threshold_bytes = _MEMORY_THRESHOLD_MB * 1024 * 1024
|
|
140
139
|
|
|
141
140
|
if total_bytes > threshold_bytes:
|
|
142
141
|
result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)
|
|
@@ -189,7 +188,7 @@ class TabularProfiler(ModalityProfiler):
|
|
|
189
188
|
seen hashes — semantics match keep='first'.
|
|
190
189
|
Sparsity is accumulated as (missing_cells, total_cells).
|
|
191
190
|
"""
|
|
192
|
-
chunk_size =
|
|
191
|
+
chunk_size = _CHUNK_SIZE
|
|
193
192
|
n_chunks = math.ceil(result.row_count / chunk_size)
|
|
194
193
|
|
|
195
194
|
seen_hashes: set[int] = set()
|
|
@@ -36,7 +36,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
36
36
|
"""
|
|
37
37
|
|
|
38
38
|
def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
|
|
39
|
-
super().__init__(
|
|
39
|
+
super().__init__()
|
|
40
40
|
self.target_column = target_column
|
|
41
41
|
|
|
42
42
|
def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
|
|
@@ -129,7 +129,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
129
129
|
self, series: pl.Series, n_rows: int, result: TargetProfileResult
|
|
130
130
|
) -> None:
|
|
131
131
|
"""Generates categorical metrics and checks for class imbalance."""
|
|
132
|
-
cat_profiler = CategoricalProfiler(
|
|
132
|
+
cat_profiler = CategoricalProfiler()
|
|
133
133
|
|
|
134
134
|
# Internally compute cardinality, top values, and imbalance metrics
|
|
135
135
|
cat_profile = cat_profiler._profile_column(series, self.target_column, n_rows)
|
|
@@ -146,7 +146,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
146
146
|
self, series: pl.Series, n_rows: int, result: TargetProfileResult
|
|
147
147
|
) -> None:
|
|
148
148
|
"""Generates numeric metrics and checks for target skewness."""
|
|
149
|
-
num_profiler = NumericProfiler(
|
|
149
|
+
num_profiler = NumericProfiler()
|
|
150
150
|
|
|
151
151
|
col_name = series.name
|
|
152
152
|
num_result = num_profiler.profile(series.to_frame(), [col_name])
|
|
@@ -54,11 +54,7 @@ from __future__ import annotations
|
|
|
54
54
|
import polars as pl
|
|
55
55
|
|
|
56
56
|
from ._base import ColumnBatchProfiler
|
|
57
|
-
from .config import (
|
|
58
|
-
ProfileConfig,
|
|
59
|
-
TextStats,
|
|
60
|
-
SemanticType,
|
|
61
|
-
)
|
|
57
|
+
from .config import TextStats
|
|
62
58
|
from ._text_config import TextProfileResult
|
|
63
59
|
|
|
64
60
|
# Regex that counts non-whitespace token runs — used with str.count_matches.
|
|
@@ -69,22 +65,10 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
69
65
|
"""
|
|
70
66
|
Free-text column profiler for Polars DataFrames.
|
|
71
67
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
``ProfileConfig.column_overrides``, OR
|
|
75
|
-
- Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.
|
|
76
|
-
|
|
77
|
-
Non-eligible columns are silently skipped.
|
|
78
|
-
|
|
79
|
-
Parameters
|
|
80
|
-
----------
|
|
81
|
-
config : ProfileConfig | None
|
|
82
|
-
Shared profiling configuration.
|
|
68
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
69
|
+
no internal eligibility gate.
|
|
83
70
|
"""
|
|
84
71
|
|
|
85
|
-
def __init__(self, config: ProfileConfig | None = None) -> None:
|
|
86
|
-
super().__init__(config)
|
|
87
|
-
|
|
88
72
|
# ------------------------------------------------------------------
|
|
89
73
|
# Public API
|
|
90
74
|
# ------------------------------------------------------------------
|
|
@@ -96,24 +80,6 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
96
80
|
) -> TextProfileResult:
|
|
97
81
|
return self._run(data, columns)
|
|
98
82
|
|
|
99
|
-
# ------------------------------------------------------------------
|
|
100
|
-
# Eligibility
|
|
101
|
-
# ------------------------------------------------------------------
|
|
102
|
-
|
|
103
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
104
|
-
override = self.config.column_overrides.get(series.name)
|
|
105
|
-
|
|
106
|
-
if override == SemanticType.Text:
|
|
107
|
-
return True
|
|
108
|
-
|
|
109
|
-
# Any other explicit override takes precedence
|
|
110
|
-
if override is not None:
|
|
111
|
-
return False
|
|
112
|
-
|
|
113
|
-
# Native string dtype (pl.Utf8 is the canonical name; pl.String is
|
|
114
|
-
# an alias in newer Polars — check both for cross-version safety)
|
|
115
|
-
return series.dtype in (pl.Utf8, pl.String)
|
|
116
|
-
|
|
117
83
|
# ------------------------------------------------------------------
|
|
118
84
|
# Orchestration
|
|
119
85
|
# ------------------------------------------------------------------
|
|
@@ -125,11 +91,7 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
125
91
|
) -> TextProfileResult:
|
|
126
92
|
result = TextProfileResult()
|
|
127
93
|
|
|
128
|
-
available =
|
|
129
|
-
c
|
|
130
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
131
|
-
if self._eligible(df[c])
|
|
132
|
-
]
|
|
94
|
+
available = self._resolve_columns(df.columns, columns)
|
|
133
95
|
result.analysed_columns = available
|
|
134
96
|
|
|
135
97
|
for col_name in available:
|
|
@@ -52,6 +52,15 @@ class Modality(StrEnum):
|
|
|
52
52
|
# TimeSeries = "time_series"
|
|
53
53
|
|
|
54
54
|
|
|
55
|
+
class PipelinePhase(StrEnum):
|
|
56
|
+
Profiling = "profiling"
|
|
57
|
+
Imputation = "imputation"
|
|
58
|
+
OutlierDetection = "outlier_detection"
|
|
59
|
+
Normalization = "normalization"
|
|
60
|
+
Encoding = "encoding"
|
|
61
|
+
Scaling = "scaling"
|
|
62
|
+
|
|
63
|
+
|
|
55
64
|
# ---------------------------------------------------------------------------
|
|
56
65
|
# Type-detection enums — kept for TypeDetector compatibility
|
|
57
66
|
# ---------------------------------------------------------------------------
|
|
@@ -285,7 +294,7 @@ class ProfileConfig:
|
|
|
285
294
|
def from_dict(cls, data: dict) -> ProfileConfig:
|
|
286
295
|
return cls(
|
|
287
296
|
modality=Modality(data.get("modality", Modality.Tabular)),
|
|
288
|
-
|
|
297
|
+
target_columns=list(data.get("target_columns", [])),
|
|
289
298
|
column_overrides={
|
|
290
299
|
k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
|
|
291
300
|
},
|
|
@@ -304,6 +313,98 @@ class ProfileConfig:
|
|
|
304
313
|
return cls.from_dict(json.loads(json_str))
|
|
305
314
|
|
|
306
315
|
|
|
316
|
+
@dataclass
|
|
317
|
+
class PipelineConfig:
|
|
318
|
+
"""
|
|
319
|
+
Master configuration for the full 6-phase feature engineering pipeline.
|
|
320
|
+
|
|
321
|
+
Parameters
|
|
322
|
+
----------
|
|
323
|
+
exclude_columns : list[str]
|
|
324
|
+
Hard exclusions — columns dropped globally from every phase.
|
|
325
|
+
phase_exclusions : dict[PipelinePhase, list[str]]
|
|
326
|
+
Soft exclusions — columns bypassed for a specific phase but retained
|
|
327
|
+
in the dataset.
|
|
328
|
+
column_overrides : dict[str, SemanticType]
|
|
329
|
+
Explicit semantic type assignments respected by all downstream phases.
|
|
330
|
+
profiling : ProfileConfig
|
|
331
|
+
Phase 1-specific parameters (correlation, chunking, memory threshold).
|
|
332
|
+
"""
|
|
333
|
+
|
|
334
|
+
exclude_columns: list[str] = field(default_factory=list)
|
|
335
|
+
phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
|
|
336
|
+
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
337
|
+
profiling: ProfileConfig = field(default_factory=ProfileConfig)
|
|
338
|
+
|
|
339
|
+
def resolve_active_columns(
|
|
340
|
+
self, phase: PipelinePhase, available_columns: list[str]
|
|
341
|
+
) -> list[str]:
|
|
342
|
+
"""
|
|
343
|
+
Return the columns the given phase should operate on.
|
|
344
|
+
|
|
345
|
+
Hard exclusions are applied first, then phase-specific soft exclusions.
|
|
346
|
+
Columns absent from available_columns are silently ignored in both lists.
|
|
347
|
+
"""
|
|
348
|
+
hard_set = set(self.exclude_columns)
|
|
349
|
+
soft_set = set(self.phase_exclusions.get(phase, []))
|
|
350
|
+
excluded = hard_set | soft_set
|
|
351
|
+
return [c for c in available_columns if c not in excluded]
|
|
352
|
+
|
|
353
|
+
def set_column_type(
|
|
354
|
+
self, column: str, semantic_type: Union[str, "SemanticType"]
|
|
355
|
+
) -> None:
|
|
356
|
+
"""
|
|
357
|
+
Explicitly set the semantic type for a column, overriding auto-detection.
|
|
358
|
+
This override is respected by all downstream phases.
|
|
359
|
+
"""
|
|
360
|
+
if isinstance(semantic_type, str):
|
|
361
|
+
try:
|
|
362
|
+
semantic_type = SemanticType(semantic_type)
|
|
363
|
+
except ValueError:
|
|
364
|
+
valid = [e.value for e in SemanticType]
|
|
365
|
+
raise ValueError(
|
|
366
|
+
f"Unknown semantic type {semantic_type!r}. "
|
|
367
|
+
f"Valid values: {valid}"
|
|
368
|
+
)
|
|
369
|
+
self.column_overrides[column] = semantic_type
|
|
370
|
+
|
|
371
|
+
def to_dict(self) -> dict:
|
|
372
|
+
return {
|
|
373
|
+
"exclude_columns": list(self.exclude_columns),
|
|
374
|
+
"phase_exclusions": {
|
|
375
|
+
str(phase): list(cols)
|
|
376
|
+
for phase, cols in self.phase_exclusions.items()
|
|
377
|
+
},
|
|
378
|
+
"column_overrides": {
|
|
379
|
+
col: str(sem_type)
|
|
380
|
+
for col, sem_type in self.column_overrides.items()
|
|
381
|
+
},
|
|
382
|
+
"profiling": self.profiling.to_dict(),
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
@classmethod
|
|
386
|
+
def from_dict(cls, data: dict) -> "PipelineConfig":
|
|
387
|
+
return cls(
|
|
388
|
+
exclude_columns=list(data.get("exclude_columns", [])),
|
|
389
|
+
phase_exclusions={
|
|
390
|
+
PipelinePhase(phase_str): list(cols)
|
|
391
|
+
for phase_str, cols in data.get("phase_exclusions", {}).items()
|
|
392
|
+
},
|
|
393
|
+
column_overrides={
|
|
394
|
+
col: SemanticType(sem_str)
|
|
395
|
+
for col, sem_str in data.get("column_overrides", {}).items()
|
|
396
|
+
},
|
|
397
|
+
profiling=ProfileConfig.from_dict(data.get("profiling", {})),
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
def to_json(self, indent: int = 2) -> str:
|
|
401
|
+
return json.dumps(self.to_dict(), indent=indent)
|
|
402
|
+
|
|
403
|
+
@classmethod
|
|
404
|
+
def from_json(cls, json_str: str) -> "PipelineConfig":
|
|
405
|
+
return cls.from_dict(json.loads(json_str))
|
|
406
|
+
|
|
407
|
+
|
|
307
408
|
@dataclass
|
|
308
409
|
class ColumnTypeInfo:
|
|
309
410
|
column: str
|
|
@@ -35,7 +35,8 @@ from ._target_profiler import TargetProfiler
|
|
|
35
35
|
from ._correlation_profiler import CorrelationProfiler
|
|
36
36
|
from ._type_detector import TypeDetector
|
|
37
37
|
from .config import (
|
|
38
|
-
|
|
38
|
+
PipelineConfig,
|
|
39
|
+
PipelinePhase,
|
|
39
40
|
ColumnProfile,
|
|
40
41
|
StructuralProfileResult,
|
|
41
42
|
RowMissingnessDistribution,
|
|
@@ -64,14 +65,16 @@ _COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = { #
|
|
|
64
65
|
|
|
65
66
|
class StructuralProfiler:
|
|
66
67
|
|
|
67
|
-
def __init__(self, config:
|
|
68
|
-
self.config = config or
|
|
68
|
+
def __init__(self, config: PipelineConfig | None = None) -> None:
|
|
69
|
+
self.config: PipelineConfig = config or PipelineConfig()
|
|
70
|
+
# Keep sub-profilers aligned with the master column_overrides.
|
|
71
|
+
self.config.profiling.column_overrides = self.config.column_overrides
|
|
69
72
|
|
|
70
|
-
if self.config.modality == Modality.Tabular:
|
|
71
|
-
self.modality_profiler: ModalityProfiler = TabularProfiler(
|
|
73
|
+
if self.config.profiling.modality == Modality.Tabular:
|
|
74
|
+
self.modality_profiler: ModalityProfiler = TabularProfiler()
|
|
72
75
|
else:
|
|
73
76
|
raise NotImplementedError(
|
|
74
|
-
f"modality {self.config.modality} not supported yet"
|
|
77
|
+
f"modality {self.config.profiling.modality} not supported yet"
|
|
75
78
|
)
|
|
76
79
|
|
|
77
80
|
# ------------------------------------------------------------------
|
|
@@ -87,7 +90,17 @@ class StructuralProfiler:
|
|
|
87
90
|
|
|
88
91
|
result = StructuralProfileResult()
|
|
89
92
|
|
|
90
|
-
active_cols =
|
|
93
|
+
active_cols = self.config.resolve_active_columns(
|
|
94
|
+
PipelinePhase.Profiling, list(data.columns)
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Columns soft-excluded for Profiling: skipped but retained in the result.
|
|
98
|
+
hard_set = set(self.config.exclude_columns)
|
|
99
|
+
soft_retained = [
|
|
100
|
+
c for c in data.columns
|
|
101
|
+
if c in set(self.config.phase_exclusions.get(PipelinePhase.Profiling, []))
|
|
102
|
+
and c not in hard_set
|
|
103
|
+
]
|
|
91
104
|
|
|
92
105
|
# ── 1. Modality profiler ─────────────────────────────────────────
|
|
93
106
|
# Replaces default DatasetStats with the real one (row_count, memory,
|
|
@@ -97,7 +110,7 @@ class StructuralProfiler:
|
|
|
97
110
|
# ── 2. Missingness pre-pass ──────────────────────────────────────
|
|
98
111
|
# setdefault creates ColumnProfile entries; subsequent steps mutate
|
|
99
112
|
# the same objects via the same setdefault pattern.
|
|
100
|
-
missingness_result = MissingnessProfiler(
|
|
113
|
+
missingness_result = MissingnessProfiler().profile(
|
|
101
114
|
data, columns=active_cols
|
|
102
115
|
)
|
|
103
116
|
for col_name in missingness_result.analysed_columns:
|
|
@@ -112,7 +125,6 @@ class StructuralProfiler:
|
|
|
112
125
|
df=data,
|
|
113
126
|
cols=active_cols,
|
|
114
127
|
n_rows=data.height,
|
|
115
|
-
overrides=self.config.column_overrides,
|
|
116
128
|
)
|
|
117
129
|
|
|
118
130
|
# ── 4. Type detection ────────────────────────────────────────────
|
|
@@ -153,7 +165,7 @@ class StructuralProfiler:
|
|
|
153
165
|
profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
|
|
154
166
|
if profiler_cls is None:
|
|
155
167
|
continue
|
|
156
|
-
profiler = profiler_cls(
|
|
168
|
+
profiler = profiler_cls()
|
|
157
169
|
try:
|
|
158
170
|
batch = profiler.profile(data, columns=cols)
|
|
159
171
|
for col_name in batch.analysed_columns:
|
|
@@ -165,13 +177,13 @@ class StructuralProfiler:
|
|
|
165
177
|
# ── 7. Target columns ────────────────────────────────────────────
|
|
166
178
|
# TargetProfiler produces target-specific analysis stored in
|
|
167
179
|
# result.targets. cp.stats is NOT overwritten — step 6 already set it.
|
|
168
|
-
if self.config.target_columns:
|
|
169
|
-
for target in self.config.target_columns:
|
|
180
|
+
if self.config.profiling.target_columns:
|
|
181
|
+
for target in self.config.profiling.target_columns:
|
|
170
182
|
if target not in data.columns:
|
|
171
183
|
continue
|
|
172
184
|
target_result = TargetProfiler(
|
|
173
185
|
target_column=target,
|
|
174
|
-
config=self.config,
|
|
186
|
+
config=self.config.profiling,
|
|
175
187
|
).profile(data)
|
|
176
188
|
result.targets[target] = target_result
|
|
177
189
|
|
|
@@ -180,7 +192,7 @@ class StructuralProfiler:
|
|
|
180
192
|
cp.is_target = True
|
|
181
193
|
|
|
182
194
|
# ── 8. Correlation ───────────────────────────────────────────────
|
|
183
|
-
if self.config.compute_correlation:
|
|
195
|
+
if self.config.profiling.compute_correlation:
|
|
184
196
|
# Resolve column lists by detected SemanticType (post-override).
|
|
185
197
|
numeric_cols = [
|
|
186
198
|
c
|
|
@@ -198,7 +210,7 @@ class StructuralProfiler:
|
|
|
198
210
|
corr_profiler = CorrelationProfiler(
|
|
199
211
|
numeric_columns=numeric_cols,
|
|
200
212
|
categorical_columns=categorical_cols,
|
|
201
|
-
config=self.config,
|
|
213
|
+
config=self.config.profiling,
|
|
202
214
|
)
|
|
203
215
|
|
|
204
216
|
# 8a. Feature-feature matrices — computed ONCE, target-independent.
|
|
@@ -209,7 +221,7 @@ class StructuralProfiler:
|
|
|
209
221
|
|
|
210
222
|
# 8b. Per-target analysis — matrices are NOT recomputed; each call
|
|
211
223
|
# shallow-copies feature_corr and appends target-specific fields.
|
|
212
|
-
for target in self.config.target_columns:
|
|
224
|
+
for target in self.config.profiling.target_columns:
|
|
213
225
|
if target not in data.columns:
|
|
214
226
|
continue
|
|
215
227
|
result.dataset.target_correlations[target] = (
|
|
@@ -218,6 +230,12 @@ class StructuralProfiler:
|
|
|
218
230
|
)
|
|
219
231
|
)
|
|
220
232
|
|
|
233
|
+
# ── Soft-excluded placeholders ───────────────────────────────────────
|
|
234
|
+
# Columns soft-excluded for Profiling are not profiled but must still
|
|
235
|
+
# appear in the result so downstream phases can reference them.
|
|
236
|
+
for col in soft_retained:
|
|
237
|
+
result.columns.setdefault(col, ColumnProfile(name=col))
|
|
238
|
+
|
|
221
239
|
return result
|
|
222
240
|
|
|
223
241
|
# ------------------------------------------------------------------
|
|
@@ -229,9 +247,8 @@ class StructuralProfiler:
|
|
|
229
247
|
df: pl.DataFrame,
|
|
230
248
|
cols: list[str],
|
|
231
249
|
n_rows: int,
|
|
232
|
-
overrides: dict[str, SemanticType],
|
|
233
250
|
) -> RowMissingnessDistribution:
|
|
234
|
-
from .
|
|
251
|
+
from ._null_detection import (
|
|
235
252
|
_sentinel_eligible,
|
|
236
253
|
_inf_eligible,
|
|
237
254
|
_SENTINEL_STRINGS,
|
|
@@ -246,10 +263,9 @@ class StructuralProfiler:
|
|
|
246
263
|
|
|
247
264
|
for col_name in cols:
|
|
248
265
|
dtype = df[col_name].dtype
|
|
249
|
-
override = overrides.get(col_name)
|
|
250
266
|
null_e = pl.col(col_name).is_null()
|
|
251
267
|
|
|
252
|
-
if _sentinel_eligible(dtype
|
|
268
|
+
if _sentinel_eligible(dtype):
|
|
253
269
|
eff = (
|
|
254
270
|
null_e
|
|
255
271
|
| (pl.col(col_name).str.strip_chars() == "")
|
|
@@ -22,6 +22,7 @@ src/dataforge_ml/profiling/_datetime_config.py
|
|
|
22
22
|
src/dataforge_ml/profiling/_datetime_profiler.py
|
|
23
23
|
src/dataforge_ml/profiling/_missingness_config.py
|
|
24
24
|
src/dataforge_ml/profiling/_missingness_profiler.py
|
|
25
|
+
src/dataforge_ml/profiling/_null_detection.py
|
|
25
26
|
src/dataforge_ml/profiling/_numeric_config.py
|
|
26
27
|
src/dataforge_ml/profiling/_numeric_profiler.py
|
|
27
28
|
src/dataforge_ml/profiling/_tabular.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|