dataforge-ml 0.7.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/PKG-INFO +1 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/pyproject.toml +1 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/__init__.py +4 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_base.py +11 -14
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_profiler.py +4 -41
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical.py +3 -44
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_profiler.py +1 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_profiler.py +7 -34
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_profiler.py +11 -70
- dataforge_ml-0.9.0/src/dataforge_ml/profiling/_null_detection.py +22 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_profiler.py +6 -34
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_tabular.py +25 -26
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_profiler.py +3 -3
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_profiler.py +4 -42
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/config.py +131 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/structural.py +41 -21
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/data_loader.py +1 -3
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/SOURCES.txt +1 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/LICENSE +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/README.md +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/setup.cfg +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from .structural import StructuralProfiler
|
|
2
2
|
from .config import (
|
|
3
3
|
ProfileConfig,
|
|
4
|
+
PipelineConfig,
|
|
5
|
+
PipelinePhase,
|
|
4
6
|
SemanticType,
|
|
5
7
|
Modality,
|
|
6
8
|
TypeFlag,
|
|
@@ -19,6 +21,8 @@ from ._base import ModalityProfiler
|
|
|
19
21
|
__all__ = [
|
|
20
22
|
"StructuralProfiler",
|
|
21
23
|
"ProfileConfig",
|
|
24
|
+
"PipelineConfig",
|
|
25
|
+
"PipelinePhase",
|
|
22
26
|
"SemanticType",
|
|
23
27
|
"Modality",
|
|
24
28
|
"TypeFlag",
|
|
@@ -3,9 +3,9 @@ Abstract base classes for all structural profilers.
|
|
|
3
3
|
|
|
4
4
|
Hierarchy
|
|
5
5
|
---------
|
|
6
|
-
Profiling[R] — root:
|
|
7
|
-
├── ColumnBatchProfiler[R] — registry tier:
|
|
8
|
-
│ │
|
|
6
|
+
Profiling[R] — root: thin ABC, provides _resolve_columns
|
|
7
|
+
├── ColumnBatchProfiler[R] — registry tier: profile(df, columns) processes a
|
|
8
|
+
│ │ typed column batch; no config, no eligibility gates
|
|
9
9
|
│ ├── NumericProfiler
|
|
10
10
|
│ ├── CategoricalProfiler
|
|
11
11
|
│ ├── DatetimeProfiler
|
|
@@ -26,22 +26,19 @@ import polars as pl
|
|
|
26
26
|
from abc import abstractmethod, ABC
|
|
27
27
|
from typing import Generic, TypeVar
|
|
28
28
|
|
|
29
|
-
from .config import DatasetStats
|
|
29
|
+
from .config import DatasetStats
|
|
30
30
|
|
|
31
31
|
R = TypeVar("R")
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
class Profiling(ABC, Generic[R]):
|
|
35
35
|
"""
|
|
36
|
-
Root base for all profilers.
|
|
36
|
+
Root base for all profilers. Thin ABC — no config state.
|
|
37
37
|
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
Sub-processors are pure batch processors: given a DataFrame and a column
|
|
39
|
+
list, return a result. No routing, no scoping, no config.
|
|
40
40
|
"""
|
|
41
41
|
|
|
42
|
-
def __init__(self, config: ProfileConfig | None = None):
|
|
43
|
-
self.config = config or ProfileConfig()
|
|
44
|
-
|
|
45
42
|
@abstractmethod
|
|
46
43
|
def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
|
|
47
44
|
|
|
@@ -62,11 +59,11 @@ class ColumnBatchProfiler(Profiling[R]):
|
|
|
62
59
|
|
|
63
60
|
Contract
|
|
64
61
|
--------
|
|
65
|
-
- __init__
|
|
66
|
-
StructuralProfiler to instantiate any registered profiler uniformly via
|
|
67
|
-
profiler_cls(config=self.config)
|
|
62
|
+
- __init__ takes no arguments (instantiated as profiler_cls()).
|
|
68
63
|
- profile(df, columns) receives the full DataFrame and the list of same-type
|
|
69
|
-
column names to process.
|
|
64
|
+
column names to process. Profiles every column in the list without any
|
|
65
|
+
internal eligibility gate or config consultation.
|
|
66
|
+
- Returns a result with:
|
|
70
67
|
.columns: dict[str, <Stats>] — per-column stats
|
|
71
68
|
.analysed_columns: list[str] — columns actually profiled
|
|
72
69
|
"""
|
|
@@ -22,11 +22,7 @@ from __future__ import annotations
|
|
|
22
22
|
import polars as pl
|
|
23
23
|
|
|
24
24
|
from ._base import ColumnBatchProfiler
|
|
25
|
-
from .config import (
|
|
26
|
-
ProfileConfig,
|
|
27
|
-
BooleanStats,
|
|
28
|
-
SemanticType,
|
|
29
|
-
)
|
|
25
|
+
from .config import BooleanStats
|
|
30
26
|
from ._boolean_config import BooleanProfileResult
|
|
31
27
|
from ..models._data_types import _INT_DTYPES
|
|
32
28
|
|
|
@@ -42,22 +38,10 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
42
38
|
"""
|
|
43
39
|
Boolean column profiler for Polars DataFrames.
|
|
44
40
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
- Its dtype is an integer with values exclusively in {0, 1}, OR
|
|
48
|
-
- It has a SemanticType.Boolean override in ProfileConfig.column_overrides
|
|
49
|
-
|
|
50
|
-
Non-eligible columns in the provided list are silently skipped.
|
|
51
|
-
|
|
52
|
-
Parameters
|
|
53
|
-
----------
|
|
54
|
-
config : ProfileConfig | None
|
|
55
|
-
Shared profiling configuration.
|
|
41
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
42
|
+
no internal eligibility gate.
|
|
56
43
|
"""
|
|
57
44
|
|
|
58
|
-
def __init__(self, config: ProfileConfig | None = None) -> None:
|
|
59
|
-
super().__init__(config)
|
|
60
|
-
|
|
61
45
|
# ------------------------------------------------------------------
|
|
62
46
|
# Public API
|
|
63
47
|
# ------------------------------------------------------------------
|
|
@@ -69,23 +53,6 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
69
53
|
) -> BooleanProfileResult:
|
|
70
54
|
return self._run(data, columns)
|
|
71
55
|
|
|
72
|
-
# ------------------------------------------------------------------
|
|
73
|
-
# Eligibility
|
|
74
|
-
# ------------------------------------------------------------------
|
|
75
|
-
|
|
76
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
77
|
-
override = self.config.column_overrides.get(series.name)
|
|
78
|
-
|
|
79
|
-
# Explicit override — trust it
|
|
80
|
-
if override == SemanticType.Boolean:
|
|
81
|
-
return True
|
|
82
|
-
|
|
83
|
-
# Another override takes precedence over auto-detection
|
|
84
|
-
if override is not None:
|
|
85
|
-
return False
|
|
86
|
-
|
|
87
|
-
return True
|
|
88
|
-
|
|
89
56
|
# ------------------------------------------------------------------
|
|
90
57
|
# Orchestration
|
|
91
58
|
# ------------------------------------------------------------------
|
|
@@ -97,11 +64,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
97
64
|
) -> BooleanProfileResult:
|
|
98
65
|
result = BooleanProfileResult()
|
|
99
66
|
|
|
100
|
-
available = [
|
|
101
|
-
c
|
|
102
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
103
|
-
if self._eligible(df[c])
|
|
104
|
-
]
|
|
67
|
+
available = self._resolve_columns(df.columns, columns)
|
|
105
68
|
result.analysed_columns = available
|
|
106
69
|
|
|
107
70
|
for col_name in available:
|
|
@@ -45,10 +45,6 @@ from ._categorical_config import (
|
|
|
45
45
|
RareCategoryStats,
|
|
46
46
|
ImbalanceMetrics,
|
|
47
47
|
)
|
|
48
|
-
from .config import (
|
|
49
|
-
ProfileConfig,
|
|
50
|
-
SemanticType,
|
|
51
|
-
)
|
|
52
48
|
|
|
53
49
|
# ---------------------------------------------------------------------------
|
|
54
50
|
# Module-level thresholds (documented so callers can see what drives flags)
|
|
@@ -65,29 +61,10 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
65
61
|
"""
|
|
66
62
|
Categorical profiler for Polars DataFrames.
|
|
67
63
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
columns : list[str]
|
|
71
|
-
Columns to profile. The profiler intersects this list with
|
|
72
|
-
the DataFrame's actual columns at runtime.
|
|
73
|
-
config : ProfileConfig | None
|
|
74
|
-
Shared profiling configuration (used for chunk_size, etc.).
|
|
75
|
-
|
|
76
|
-
Usage
|
|
77
|
-
-----
|
|
78
|
-
>>> profiler = CategoricalProfiler(
|
|
79
|
-
... columns=["status", "country", "product_type"],
|
|
80
|
-
... )
|
|
81
|
-
>>> result = profiler.profile(df)
|
|
82
|
-
>>> print(result)
|
|
64
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
65
|
+
no internal eligibility gate.
|
|
83
66
|
"""
|
|
84
67
|
|
|
85
|
-
def __init__(
|
|
86
|
-
self,
|
|
87
|
-
config: ProfileConfig | None = None,
|
|
88
|
-
) -> None:
|
|
89
|
-
super().__init__(config)
|
|
90
|
-
|
|
91
68
|
# ------------------------------------------------------------------
|
|
92
69
|
# Public API
|
|
93
70
|
# ------------------------------------------------------------------
|
|
@@ -103,19 +80,6 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
103
80
|
# Orchestration
|
|
104
81
|
# ------------------------------------------------------------------
|
|
105
82
|
|
|
106
|
-
def _eligible(
|
|
107
|
-
self,
|
|
108
|
-
series: pl.Series,
|
|
109
|
-
) -> bool:
|
|
110
|
-
override = self.config.column_overrides.get(series.name)
|
|
111
|
-
if override == SemanticType.Categorical:
|
|
112
|
-
return True
|
|
113
|
-
|
|
114
|
-
if override is not None:
|
|
115
|
-
return False
|
|
116
|
-
|
|
117
|
-
return True
|
|
118
|
-
|
|
119
83
|
def _run(
|
|
120
84
|
self,
|
|
121
85
|
df: pl.DataFrame,
|
|
@@ -123,12 +87,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
123
87
|
) -> CategoricalProfileResult:
|
|
124
88
|
result = CategoricalProfileResult()
|
|
125
89
|
|
|
126
|
-
|
|
127
|
-
available = [
|
|
128
|
-
c
|
|
129
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
130
|
-
if self._eligible(df[c])
|
|
131
|
-
]
|
|
90
|
+
available = self._resolve_columns(df.columns, columns)
|
|
132
91
|
result.analysed_columns = available
|
|
133
92
|
|
|
134
93
|
n_rows = df.height
|
{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
@@ -125,7 +125,7 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
125
125
|
near_redundant_threshold: float = _NEAR_REDUNDANT_THRESHOLD,
|
|
126
126
|
top_n_feature_target: int = _TOP_N_FEATURE_TARGET,
|
|
127
127
|
) -> None:
|
|
128
|
-
super().__init__(
|
|
128
|
+
super().__init__()
|
|
129
129
|
self._numeric_columns = numeric_columns
|
|
130
130
|
self._categorical_columns = categorical_columns or []
|
|
131
131
|
self._threshold = near_redundant_threshold
|
|
@@ -43,10 +43,6 @@ from datetime import datetime, timezone
|
|
|
43
43
|
import polars as pl
|
|
44
44
|
|
|
45
45
|
from ._base import ColumnBatchProfiler
|
|
46
|
-
from .config import (
|
|
47
|
-
ProfileConfig,
|
|
48
|
-
SemanticType,
|
|
49
|
-
)
|
|
50
46
|
from ._datetime_config import (
|
|
51
47
|
DatetimeProfileResult,
|
|
52
48
|
DatetimeStats,
|
|
@@ -90,20 +86,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
90
86
|
"""
|
|
91
87
|
Datetime distribution profiler for Polars DataFrames.
|
|
92
88
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
columns
|
|
96
|
-
Columns to profile. Non-datetime columns are skipped with a warning.
|
|
97
|
-
config : ProfileConfig | None
|
|
98
|
-
Shared profiling configuration.
|
|
89
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
90
|
+
no internal eligibility gate. String columns are coerced to Datetime;
|
|
91
|
+
columns that cannot be coerced are silently skipped.
|
|
99
92
|
"""
|
|
100
93
|
|
|
101
|
-
def __init__(
|
|
102
|
-
self,
|
|
103
|
-
config: ProfileConfig | None = None,
|
|
104
|
-
) -> None:
|
|
105
|
-
super().__init__(config)
|
|
106
|
-
|
|
107
94
|
# ------------------------------------------------------------------
|
|
108
95
|
# Public API
|
|
109
96
|
# ------------------------------------------------------------------
|
|
@@ -119,35 +106,21 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
119
106
|
# Orchestration
|
|
120
107
|
# ------------------------------------------------------------------
|
|
121
108
|
|
|
122
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
123
|
-
override = self.config.column_overrides.get(series.name)
|
|
124
|
-
|
|
125
|
-
if override == SemanticType.Datetime:
|
|
126
|
-
return True
|
|
127
|
-
if override is not None:
|
|
128
|
-
return False
|
|
129
|
-
|
|
130
|
-
return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
|
|
131
|
-
|
|
132
109
|
def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
|
|
133
110
|
if series.dtype in (pl.Utf8, pl.String):
|
|
134
111
|
coerced = series.str.to_datetime(strict=False)
|
|
135
112
|
return coerced if coerced.drop_nulls().len() > 0 else None
|
|
136
|
-
|
|
113
|
+
if _is_datetime_dtype(series.dtype):
|
|
114
|
+
return series
|
|
115
|
+
return None
|
|
137
116
|
|
|
138
117
|
def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
|
|
139
118
|
result = DatetimeProfileResult()
|
|
140
119
|
now = datetime.now(tz=timezone.utc)
|
|
141
120
|
|
|
142
|
-
candidates = [
|
|
143
|
-
c
|
|
144
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
145
|
-
if self._eligible(df[c])
|
|
146
|
-
]
|
|
147
|
-
|
|
148
121
|
available = []
|
|
149
122
|
coerced_cache = {}
|
|
150
|
-
for col_name in candidates:
|
|
123
|
+
for col_name in self._resolve_columns(df.columns, columns):
|
|
151
124
|
series = self._coerce_to_datetime(df[col_name])
|
|
152
125
|
if series is not None:
|
|
153
126
|
available.append(col_name)
|
{dataforge_ml-0.7.0 → dataforge_ml-0.9.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
@@ -3,19 +3,10 @@ MissingnessProfiler – Phase 1 extension: Missingness Profiling.
|
|
|
3
3
|
|
|
4
4
|
Eligibility model
|
|
5
5
|
-----------------
|
|
6
|
-
Effective-null detection is
|
|
7
|
-
overrides acting only as suppressors, not as enablers:
|
|
6
|
+
Effective-null detection is purely dtype-driven — no SemanticType overrides:
|
|
8
7
|
|
|
9
|
-
sentinel-string detection → runs
|
|
10
|
-
|
|
11
|
-
(those types cannot have meaningful sentinel strings)
|
|
12
|
-
|
|
13
|
-
Inf / NaN expansion → runs when dtype is Float32/Float64
|
|
14
|
-
never suppressed (Inf in a float column is always
|
|
15
|
-
effectively missing regardless of semantic label)
|
|
16
|
-
|
|
17
|
-
column_overrides is SPARSE — most columns will have no entry.
|
|
18
|
-
Absence of an override is not a signal; it means "trust the dtype".
|
|
8
|
+
sentinel-string detection → runs for every String/Utf8 column unconditionally
|
|
9
|
+
Inf / NaN expansion → runs for every Float32/Float64 column unconditionally
|
|
19
10
|
"""
|
|
20
11
|
|
|
21
12
|
from __future__ import annotations
|
|
@@ -24,13 +15,13 @@ from __future__ import annotations
|
|
|
24
15
|
import polars as pl
|
|
25
16
|
|
|
26
17
|
from ._base import DatasetLevelProfiler
|
|
27
|
-
from .config import ProfileConfig, SemanticType
|
|
28
18
|
from ._missingness_config import (
|
|
29
19
|
ColumnMissingnessProfile,
|
|
30
20
|
MissingnessFlag,
|
|
31
21
|
MissingnessProfileResult,
|
|
32
22
|
MissingSeverity,
|
|
33
23
|
)
|
|
24
|
+
from ._null_detection import _SENTINEL_STRINGS, _inf_eligible, _sentinel_eligible
|
|
34
25
|
|
|
35
26
|
# ---------------------------------------------------------------------------
|
|
36
27
|
# Thresholds
|
|
@@ -43,52 +34,12 @@ _SEVERITY_HIGH = 0.20
|
|
|
43
34
|
_MAR_CORRELATION_THRESHOLD = 0.60
|
|
44
35
|
_COL_DROP_THRESHOLD = 0.50
|
|
45
36
|
|
|
46
|
-
_SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
|
|
47
|
-
|
|
48
|
-
# Overrides that suppress sentinel-string detection on a String column.
|
|
49
|
-
# If a column is String but the user says "this is Numeric", treating
|
|
50
|
-
# "NA" as a sentinel is correct — but if they say Categorical or Text,
|
|
51
|
-
# sentinel detection still makes sense and should run.
|
|
52
|
-
_SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
|
|
53
|
-
{
|
|
54
|
-
SemanticType.Numeric,
|
|
55
|
-
SemanticType.Datetime,
|
|
56
|
-
SemanticType.Boolean,
|
|
57
|
-
SemanticType.Identifier,
|
|
58
|
-
}
|
|
59
|
-
)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
|
|
63
|
-
"""True when sentinel-string detection should run for this column."""
|
|
64
|
-
if dtype not in (pl.Utf8, pl.String):
|
|
65
|
-
return False
|
|
66
|
-
# Override present and it's a non-text semantic → suppress
|
|
67
|
-
if override is not None and override in _SENTINEL_SUPPRESSING_SEMANTICS:
|
|
68
|
-
return False
|
|
69
|
-
return True
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def _inf_eligible(dtype: pl.DataType) -> bool:
|
|
73
|
-
"""True when Inf/NaN expansion should run. Always dtype-driven, never suppressed."""
|
|
74
|
-
return dtype in (pl.Float32, pl.Float64)
|
|
75
|
-
|
|
76
37
|
|
|
77
38
|
class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
78
|
-
"""
|
|
79
|
-
Missingness profiler for Polars DataFrames.
|
|
80
|
-
|
|
81
|
-
Column scoping
|
|
82
|
-
--------------
|
|
83
|
-
Resolution priority (high → low):
|
|
84
|
-
1. Explicit ``columns`` argument to ``profile()``.
|
|
85
|
-
2. ``config.exclude_columns`` — always removed.
|
|
86
|
-
3. All remaining DataFrame columns.
|
|
87
|
-
"""
|
|
39
|
+
"""Missingness profiler for Polars DataFrames."""
|
|
88
40
|
|
|
89
|
-
def __init__(self
|
|
90
|
-
super().__init__(
|
|
91
|
-
self._config: ProfileConfig = config or ProfileConfig()
|
|
41
|
+
def __init__(self) -> None:
|
|
42
|
+
super().__init__()
|
|
92
43
|
|
|
93
44
|
# ------------------------------------------------------------------
|
|
94
45
|
# Public API
|
|
@@ -117,16 +68,13 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
117
68
|
if n_rows == 0 or not cols:
|
|
118
69
|
return result
|
|
119
70
|
|
|
120
|
-
overrides = self._config.column_overrides # sparse — most keys absent
|
|
121
71
|
indicator_cols: list[pl.Series] = []
|
|
122
72
|
|
|
123
73
|
for col_name in cols:
|
|
124
|
-
override = overrides.get(col_name) # None for most columns
|
|
125
74
|
col_profile, indicator = self._profile_column(
|
|
126
75
|
series=df[col_name],
|
|
127
76
|
col_name=col_name,
|
|
128
77
|
n_rows=n_rows,
|
|
129
|
-
override=override,
|
|
130
78
|
)
|
|
131
79
|
result.columns[col_name] = col_profile
|
|
132
80
|
indicator_cols.append(indicator)
|
|
@@ -173,21 +121,12 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
173
121
|
series: pl.Series,
|
|
174
122
|
col_name: str,
|
|
175
123
|
n_rows: int,
|
|
176
|
-
override: SemanticType | None = None, # sparse — None is the common case
|
|
177
124
|
) -> tuple[ColumnMissingnessProfile, pl.Series]:
|
|
178
|
-
"""
|
|
179
|
-
Compute standard + effective null counts for one column.
|
|
180
|
-
|
|
181
|
-
Eligibility is dtype-first:
|
|
182
|
-
- sentinel strings → String dtype, unless override suppresses it
|
|
183
|
-
- Inf/NaN → Float dtype, always (never suppressed)
|
|
184
|
-
- everything else → standard Polars null only
|
|
185
|
-
"""
|
|
186
125
|
profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
|
|
187
126
|
dtype = series.dtype
|
|
188
127
|
std_null = series.is_null()
|
|
189
128
|
|
|
190
|
-
if _sentinel_eligible(dtype, override):
|
|
129
|
+
if _sentinel_eligible(dtype):
|
|
191
130
|
eff_null = (
|
|
192
131
|
std_null
|
|
193
132
|
| (series.str.strip_chars() == "")
|
|
@@ -208,7 +147,9 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
208
147
|
|
|
209
148
|
r = profile.effective_null_ratio
|
|
210
149
|
|
|
211
|
-
if r < _SEVERITY_MINOR:
|
|
150
|
+
if r == 0.0:
|
|
151
|
+
profile.severity = None
|
|
152
|
+
elif r < _SEVERITY_MINOR:
|
|
212
153
|
profile.severity = MissingSeverity.Minor
|
|
213
154
|
elif r < _SEVERITY_MODERATE:
|
|
214
155
|
profile.severity = MissingSeverity.Moderate
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
_null_detection – shared dtype-driven null primitives for Phase 1.
|
|
3
|
+
|
|
4
|
+
Single authority for what counts as "effectively null" across the entire
|
|
5
|
+
Phase 1 implementation. No config, no SemanticType overrides, no state.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
|
|
12
|
+
_SENTINEL_STRINGS: frozenset[str] = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _sentinel_eligible(dtype: pl.DataType) -> bool:
|
|
16
|
+
"""True when sentinel-string detection should run for this column (String/Utf8 only)."""
|
|
17
|
+
return dtype in (pl.Utf8, pl.String)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _inf_eligible(dtype: pl.DataType) -> bool:
|
|
21
|
+
"""True when Inf/NaN expansion should run (Float32/Float64 only)."""
|
|
22
|
+
return dtype in (pl.Float32, pl.Float64)
|
|
@@ -35,10 +35,6 @@ from __future__ import annotations
|
|
|
35
35
|
import polars as pl
|
|
36
36
|
|
|
37
37
|
from ._base import ColumnBatchProfiler
|
|
38
|
-
from .config import (
|
|
39
|
-
ProfileConfig,
|
|
40
|
-
SemanticType,
|
|
41
|
-
)
|
|
42
38
|
from ._correlation_profiler import _INT_DTYPES
|
|
43
39
|
from ._numeric_config import (
|
|
44
40
|
NumericProfileResult,
|
|
@@ -80,21 +76,10 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
80
76
|
"""
|
|
81
77
|
Numeric distribution profiler for Polars DataFrames.
|
|
82
78
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
columns : list[str]
|
|
86
|
-
Columns to profile. Non-numeric or absent columns are skipped
|
|
87
|
-
with a warning; they do not raise.
|
|
88
|
-
config : ProfileConfig | None
|
|
89
|
-
Shared profiling configuration.
|
|
79
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
80
|
+
no internal eligibility gate.
|
|
90
81
|
"""
|
|
91
82
|
|
|
92
|
-
def __init__(
|
|
93
|
-
self,
|
|
94
|
-
config: ProfileConfig | None = None,
|
|
95
|
-
) -> None:
|
|
96
|
-
super().__init__(config)
|
|
97
|
-
|
|
98
83
|
# ------------------------------------------------------------------
|
|
99
84
|
# Public API
|
|
100
85
|
# ------------------------------------------------------------------
|
|
@@ -110,16 +95,6 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
110
95
|
# Orchestration
|
|
111
96
|
# ------------------------------------------------------------------
|
|
112
97
|
|
|
113
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
114
|
-
override = self.config.column_overrides.get(series.name)
|
|
115
|
-
if override == SemanticType.Numeric:
|
|
116
|
-
return True
|
|
117
|
-
|
|
118
|
-
if override is not None:
|
|
119
|
-
return False
|
|
120
|
-
|
|
121
|
-
return True
|
|
122
|
-
|
|
123
98
|
def _run(
|
|
124
99
|
self,
|
|
125
100
|
df: pl.DataFrame,
|
|
@@ -128,11 +103,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
128
103
|
result = NumericProfileResult()
|
|
129
104
|
n_rows = df.height
|
|
130
105
|
|
|
131
|
-
available = [
|
|
132
|
-
c
|
|
133
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
134
|
-
if self._eligible(df[c])
|
|
135
|
-
]
|
|
106
|
+
available = self._resolve_columns(df.columns, columns)
|
|
136
107
|
result.analysed_columns = available
|
|
137
108
|
|
|
138
109
|
if not available:
|
|
@@ -254,16 +225,17 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
254
225
|
for i in range(top_rows)
|
|
255
226
|
]
|
|
256
227
|
else:
|
|
257
|
-
# ---
|
|
228
|
+
# --- Histogram Distribution (Continuous) ---
|
|
258
229
|
import numpy as np
|
|
259
230
|
|
|
260
231
|
counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
|
|
232
|
+
n_clean = clean_f64.len()
|
|
261
233
|
profile.histogram = [
|
|
262
234
|
HistogramBin(
|
|
263
235
|
lower_bound=float(bin_edges[i]),
|
|
264
236
|
upper_bound=float(bin_edges[i + 1]),
|
|
265
237
|
count=int(counts[i]),
|
|
266
|
-
percentage=int(counts[i]) /
|
|
238
|
+
percentage=int(counts[i]) / n_clean if n_clean > 0 else 0.0,
|
|
267
239
|
)
|
|
268
240
|
for i in range(len(counts))
|
|
269
241
|
]
|
|
@@ -3,16 +3,18 @@ TabularProfiler – Phase 1: Structural Profiling for tabular datasets.
|
|
|
3
3
|
|
|
4
4
|
All DataFrame operations use Polars (no pandas dependency).
|
|
5
5
|
|
|
6
|
+
A pipeline-agnostic data-catalog tool: receives the full raw DataFrame and
|
|
7
|
+
computes dataset-level stats over every column — no exclusion logic, no
|
|
8
|
+
config dependency.
|
|
9
|
+
|
|
6
10
|
Computes:
|
|
7
|
-
• row / column count (
|
|
11
|
+
• row / column count (full dataset)
|
|
8
12
|
• memory usage + per-column breakdown when threshold exceeded
|
|
9
|
-
• duplicate row count & ratio (
|
|
10
|
-
• overall sparsity (
|
|
11
|
-
• data-type detection (scoped to config.type_detection_columns;
|
|
12
|
-
skipped entirely when None)
|
|
13
|
+
• duplicate row count & ratio (all columns)
|
|
14
|
+
• overall sparsity (all columns)
|
|
13
15
|
|
|
14
16
|
Chunked processing is activated automatically when the DataFrame's
|
|
15
|
-
estimated memory exceeds
|
|
17
|
+
estimated memory exceeds _MEMORY_THRESHOLD_MB.
|
|
16
18
|
"""
|
|
17
19
|
|
|
18
20
|
from __future__ import annotations
|
|
@@ -24,31 +26,32 @@ import polars as pl
|
|
|
24
26
|
from ._base import ModalityProfiler
|
|
25
27
|
from .config import (
|
|
26
28
|
MemoryBreakdown,
|
|
27
|
-
ProfileConfig,
|
|
28
29
|
DatasetStats,
|
|
29
30
|
)
|
|
30
31
|
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Module-level constants (previously sourced from ProfileConfig)
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
_MEMORY_THRESHOLD_MB: float = 500.0
|
|
37
|
+
_CHUNK_SIZE: int = 100_000
|
|
38
|
+
|
|
31
39
|
|
|
32
40
|
class TabularProfiler(ModalityProfiler):
|
|
33
41
|
"""
|
|
34
42
|
Structural profiler for Polars DataFrames.
|
|
35
43
|
|
|
44
|
+
Pipeline-agnostic: accepts no constructor arguments and applies no column
|
|
45
|
+
filtering. Computes dataset-level stats (row count, column count, memory,
|
|
46
|
+
duplicate ratio, overall sparsity) over the complete DataFrame it receives.
|
|
47
|
+
|
|
36
48
|
Usage
|
|
37
49
|
-----
|
|
38
|
-
>>>
|
|
39
|
-
... duplicate_columns=["user_id", "event_time"],
|
|
40
|
-
... sparsity_columns=["age", "income", "postcode"],
|
|
41
|
-
... type_detection_columns=["age", "income", "postcode", "created_at"],
|
|
42
|
-
... memory_threshold_mb=200,
|
|
43
|
-
... )
|
|
44
|
-
>>> profiler = TabularProfiler(config=cfg)
|
|
50
|
+
>>> profiler = TabularProfiler()
|
|
45
51
|
>>> result = profiler.profile(df)
|
|
46
52
|
>>> print(result)
|
|
47
53
|
"""
|
|
48
54
|
|
|
49
|
-
def __init__(self, config: ProfileConfig | None = None):
|
|
50
|
-
super().__init__(config)
|
|
51
|
-
|
|
52
55
|
# ------------------------------------------------------------------
|
|
53
56
|
# Public API
|
|
54
57
|
# ------------------------------------------------------------------
|
|
@@ -77,17 +80,13 @@ class TabularProfiler(ModalityProfiler):
|
|
|
77
80
|
if result.row_count == 0:
|
|
78
81
|
return result
|
|
79
82
|
|
|
80
|
-
# 3.
|
|
83
|
+
# 3. Operate on all columns — no exclusion logic
|
|
81
84
|
all_cols: list[str] = df.columns
|
|
82
|
-
analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]
|
|
83
|
-
|
|
84
|
-
dup_cols = analysed_cols
|
|
85
|
-
missingness_cols = analysed_cols
|
|
86
85
|
|
|
87
86
|
if use_chunks:
|
|
88
|
-
self._chunked_metrics(df,
|
|
87
|
+
self._chunked_metrics(df, all_cols, all_cols, result)
|
|
89
88
|
else:
|
|
90
|
-
self._full_metrics(df,
|
|
89
|
+
self._full_metrics(df, all_cols, all_cols, result)
|
|
91
90
|
|
|
92
91
|
return result
|
|
93
92
|
|
|
@@ -136,7 +135,7 @@ class TabularProfiler(ModalityProfiler):
|
|
|
136
135
|
total_bytes = sum(col_bytes.values())
|
|
137
136
|
|
|
138
137
|
result.memory_bytes = total_bytes
|
|
139
|
-
threshold_bytes =
|
|
138
|
+
threshold_bytes = _MEMORY_THRESHOLD_MB * 1024 * 1024
|
|
140
139
|
|
|
141
140
|
if total_bytes > threshold_bytes:
|
|
142
141
|
result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)
|
|
@@ -189,7 +188,7 @@ class TabularProfiler(ModalityProfiler):
|
|
|
189
188
|
seen hashes — semantics match keep='first'.
|
|
190
189
|
Sparsity is accumulated as (missing_cells, total_cells).
|
|
191
190
|
"""
|
|
192
|
-
chunk_size =
|
|
191
|
+
chunk_size = _CHUNK_SIZE
|
|
193
192
|
n_chunks = math.ceil(result.row_count / chunk_size)
|
|
194
193
|
|
|
195
194
|
seen_hashes: set[int] = set()
|
|
@@ -36,7 +36,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
36
36
|
"""
|
|
37
37
|
|
|
38
38
|
def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
|
|
39
|
-
super().__init__(
|
|
39
|
+
super().__init__()
|
|
40
40
|
self.target_column = target_column
|
|
41
41
|
|
|
42
42
|
def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
|
|
@@ -129,7 +129,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
129
129
|
self, series: pl.Series, n_rows: int, result: TargetProfileResult
|
|
130
130
|
) -> None:
|
|
131
131
|
"""Generates categorical metrics and checks for class imbalance."""
|
|
132
|
-
cat_profiler = CategoricalProfiler(
|
|
132
|
+
cat_profiler = CategoricalProfiler()
|
|
133
133
|
|
|
134
134
|
# Internally compute cardinality, top values, and imbalance metrics
|
|
135
135
|
cat_profile = cat_profiler._profile_column(series, self.target_column, n_rows)
|
|
@@ -146,7 +146,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
146
146
|
self, series: pl.Series, n_rows: int, result: TargetProfileResult
|
|
147
147
|
) -> None:
|
|
148
148
|
"""Generates numeric metrics and checks for target skewness."""
|
|
149
|
-
num_profiler = NumericProfiler(
|
|
149
|
+
num_profiler = NumericProfiler()
|
|
150
150
|
|
|
151
151
|
col_name = series.name
|
|
152
152
|
num_result = num_profiler.profile(series.to_frame(), [col_name])
|
|
@@ -54,11 +54,7 @@ from __future__ import annotations
|
|
|
54
54
|
import polars as pl
|
|
55
55
|
|
|
56
56
|
from ._base import ColumnBatchProfiler
|
|
57
|
-
from .config import
|
|
58
|
-
ProfileConfig,
|
|
59
|
-
TextStats,
|
|
60
|
-
SemanticType,
|
|
61
|
-
)
|
|
57
|
+
from .config import TextStats
|
|
62
58
|
from ._text_config import TextProfileResult
|
|
63
59
|
|
|
64
60
|
# Regex that counts non-whitespace token runs — used with str.count_matches.
|
|
@@ -69,22 +65,10 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
69
65
|
"""
|
|
70
66
|
Free-text column profiler for Polars DataFrames.
|
|
71
67
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
``ProfileConfig.column_overrides``, OR
|
|
75
|
-
- Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.
|
|
76
|
-
|
|
77
|
-
Non-eligible columns are silently skipped.
|
|
78
|
-
|
|
79
|
-
Parameters
|
|
80
|
-
----------
|
|
81
|
-
config : ProfileConfig | None
|
|
82
|
-
Shared profiling configuration.
|
|
68
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
69
|
+
no internal eligibility gate.
|
|
83
70
|
"""
|
|
84
71
|
|
|
85
|
-
def __init__(self, config: ProfileConfig | None = None) -> None:
|
|
86
|
-
super().__init__(config)
|
|
87
|
-
|
|
88
72
|
# ------------------------------------------------------------------
|
|
89
73
|
# Public API
|
|
90
74
|
# ------------------------------------------------------------------
|
|
@@ -96,24 +80,6 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
96
80
|
) -> TextProfileResult:
|
|
97
81
|
return self._run(data, columns)
|
|
98
82
|
|
|
99
|
-
# ------------------------------------------------------------------
|
|
100
|
-
# Eligibility
|
|
101
|
-
# ------------------------------------------------------------------
|
|
102
|
-
|
|
103
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
104
|
-
override = self.config.column_overrides.get(series.name)
|
|
105
|
-
|
|
106
|
-
if override == SemanticType.Text:
|
|
107
|
-
return True
|
|
108
|
-
|
|
109
|
-
# Any other explicit override takes precedence
|
|
110
|
-
if override is not None:
|
|
111
|
-
return False
|
|
112
|
-
|
|
113
|
-
# Native string dtype (pl.Utf8 is the canonical name; pl.String is
|
|
114
|
-
# an alias in newer Polars — check both for cross-version safety)
|
|
115
|
-
return series.dtype in (pl.Utf8, pl.String)
|
|
116
|
-
|
|
117
83
|
# ------------------------------------------------------------------
|
|
118
84
|
# Orchestration
|
|
119
85
|
# ------------------------------------------------------------------
|
|
@@ -125,11 +91,7 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
125
91
|
) -> TextProfileResult:
|
|
126
92
|
result = TextProfileResult()
|
|
127
93
|
|
|
128
|
-
available =
|
|
129
|
-
c
|
|
130
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
131
|
-
if self._eligible(df[c])
|
|
132
|
-
]
|
|
94
|
+
available = self._resolve_columns(df.columns, columns)
|
|
133
95
|
result.analysed_columns = available
|
|
134
96
|
|
|
135
97
|
for col_name in available:
|
|
@@ -52,6 +52,15 @@ class Modality(StrEnum):
|
|
|
52
52
|
# TimeSeries = "time_series"
|
|
53
53
|
|
|
54
54
|
|
|
55
|
+
class PipelinePhase(StrEnum):
|
|
56
|
+
Profiling = "profiling"
|
|
57
|
+
Imputation = "imputation"
|
|
58
|
+
OutlierDetection = "outlier_detection"
|
|
59
|
+
Normalization = "normalization"
|
|
60
|
+
Encoding = "encoding"
|
|
61
|
+
Scaling = "scaling"
|
|
62
|
+
|
|
63
|
+
|
|
55
64
|
# ---------------------------------------------------------------------------
|
|
56
65
|
# Type-detection enums — kept for TypeDetector compatibility
|
|
57
66
|
# ---------------------------------------------------------------------------
|
|
@@ -71,6 +80,7 @@ class TypeFlag(StrEnum):
|
|
|
71
80
|
SequentialIndex = "sequential_index"
|
|
72
81
|
FloatSequentialIndex = "float_sequential_index"
|
|
73
82
|
FreeTextCandidate = "free_text_candidate"
|
|
83
|
+
UserOverride = "user_override"
|
|
74
84
|
|
|
75
85
|
|
|
76
86
|
# ---------------------------------------------------------------------------
|
|
@@ -240,6 +250,34 @@ class ProfileConfig:
|
|
|
240
250
|
memory_threshold_mb: float = 500.0
|
|
241
251
|
chunk_size: int = 100_000
|
|
242
252
|
|
|
253
|
+
def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
|
|
254
|
+
"""
|
|
255
|
+
Explicitly set the semantic type for a column, overriding auto-detection.
|
|
256
|
+
|
|
257
|
+
The override is the sole source of truth for that column's type — the
|
|
258
|
+
type detector's verdict is ignored during profiling. Calling this method
|
|
259
|
+
multiple times on the same column is valid; the last call wins.
|
|
260
|
+
|
|
261
|
+
Parameters
|
|
262
|
+
----------
|
|
263
|
+
column : str
|
|
264
|
+
Name of the column to override.
|
|
265
|
+
semantic_type : str | SemanticType
|
|
266
|
+
Target semantic type. Accepts a plain string (e.g. ``"numeric"``,
|
|
267
|
+
``"categorical"``) or a ``SemanticType`` enum value. Invalid strings
|
|
268
|
+
raise ``ValueError``.
|
|
269
|
+
"""
|
|
270
|
+
if isinstance(semantic_type, str):
|
|
271
|
+
try:
|
|
272
|
+
semantic_type = SemanticType(semantic_type)
|
|
273
|
+
except ValueError:
|
|
274
|
+
valid = [e.value for e in SemanticType]
|
|
275
|
+
raise ValueError(
|
|
276
|
+
f"Unknown semantic type {semantic_type!r}. "
|
|
277
|
+
f"Valid values: {valid}"
|
|
278
|
+
)
|
|
279
|
+
self.column_overrides[column] = semantic_type
|
|
280
|
+
|
|
243
281
|
def to_dict(self) -> dict:
|
|
244
282
|
return {
|
|
245
283
|
"modality": str(self.modality),
|
|
@@ -256,7 +294,7 @@ class ProfileConfig:
|
|
|
256
294
|
def from_dict(cls, data: dict) -> ProfileConfig:
|
|
257
295
|
return cls(
|
|
258
296
|
modality=Modality(data.get("modality", Modality.Tabular)),
|
|
259
|
-
|
|
297
|
+
target_columns=list(data.get("target_columns", [])),
|
|
260
298
|
column_overrides={
|
|
261
299
|
k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
|
|
262
300
|
},
|
|
@@ -275,6 +313,98 @@ class ProfileConfig:
|
|
|
275
313
|
return cls.from_dict(json.loads(json_str))
|
|
276
314
|
|
|
277
315
|
|
|
316
|
+
@dataclass
|
|
317
|
+
class PipelineConfig:
|
|
318
|
+
"""
|
|
319
|
+
Master configuration for the full 6-phase feature engineering pipeline.
|
|
320
|
+
|
|
321
|
+
Parameters
|
|
322
|
+
----------
|
|
323
|
+
exclude_columns : list[str]
|
|
324
|
+
Hard exclusions — columns dropped globally from every phase.
|
|
325
|
+
phase_exclusions : dict[PipelinePhase, list[str]]
|
|
326
|
+
Soft exclusions — columns bypassed for a specific phase but retained
|
|
327
|
+
in the dataset.
|
|
328
|
+
column_overrides : dict[str, SemanticType]
|
|
329
|
+
Explicit semantic type assignments respected by all downstream phases.
|
|
330
|
+
profiling : ProfileConfig
|
|
331
|
+
Phase 1-specific parameters (correlation, chunking, memory threshold).
|
|
332
|
+
"""
|
|
333
|
+
|
|
334
|
+
exclude_columns: list[str] = field(default_factory=list)
|
|
335
|
+
phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
|
|
336
|
+
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
337
|
+
profiling: ProfileConfig = field(default_factory=ProfileConfig)
|
|
338
|
+
|
|
339
|
+
def resolve_active_columns(
|
|
340
|
+
self, phase: PipelinePhase, available_columns: list[str]
|
|
341
|
+
) -> list[str]:
|
|
342
|
+
"""
|
|
343
|
+
Return the columns the given phase should operate on.
|
|
344
|
+
|
|
345
|
+
Hard exclusions are applied first, then phase-specific soft exclusions.
|
|
346
|
+
Columns absent from available_columns are silently ignored in both lists.
|
|
347
|
+
"""
|
|
348
|
+
hard_set = set(self.exclude_columns)
|
|
349
|
+
soft_set = set(self.phase_exclusions.get(phase, []))
|
|
350
|
+
excluded = hard_set | soft_set
|
|
351
|
+
return [c for c in available_columns if c not in excluded]
|
|
352
|
+
|
|
353
|
+
def set_column_type(
|
|
354
|
+
self, column: str, semantic_type: Union[str, "SemanticType"]
|
|
355
|
+
) -> None:
|
|
356
|
+
"""
|
|
357
|
+
Explicitly set the semantic type for a column, overriding auto-detection.
|
|
358
|
+
This override is respected by all downstream phases.
|
|
359
|
+
"""
|
|
360
|
+
if isinstance(semantic_type, str):
|
|
361
|
+
try:
|
|
362
|
+
semantic_type = SemanticType(semantic_type)
|
|
363
|
+
except ValueError:
|
|
364
|
+
valid = [e.value for e in SemanticType]
|
|
365
|
+
raise ValueError(
|
|
366
|
+
f"Unknown semantic type {semantic_type!r}. "
|
|
367
|
+
f"Valid values: {valid}"
|
|
368
|
+
)
|
|
369
|
+
self.column_overrides[column] = semantic_type
|
|
370
|
+
|
|
371
|
+
def to_dict(self) -> dict:
|
|
372
|
+
return {
|
|
373
|
+
"exclude_columns": list(self.exclude_columns),
|
|
374
|
+
"phase_exclusions": {
|
|
375
|
+
str(phase): list(cols)
|
|
376
|
+
for phase, cols in self.phase_exclusions.items()
|
|
377
|
+
},
|
|
378
|
+
"column_overrides": {
|
|
379
|
+
col: str(sem_type)
|
|
380
|
+
for col, sem_type in self.column_overrides.items()
|
|
381
|
+
},
|
|
382
|
+
"profiling": self.profiling.to_dict(),
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
@classmethod
|
|
386
|
+
def from_dict(cls, data: dict) -> "PipelineConfig":
|
|
387
|
+
return cls(
|
|
388
|
+
exclude_columns=list(data.get("exclude_columns", [])),
|
|
389
|
+
phase_exclusions={
|
|
390
|
+
PipelinePhase(phase_str): list(cols)
|
|
391
|
+
for phase_str, cols in data.get("phase_exclusions", {}).items()
|
|
392
|
+
},
|
|
393
|
+
column_overrides={
|
|
394
|
+
col: SemanticType(sem_str)
|
|
395
|
+
for col, sem_str in data.get("column_overrides", {}).items()
|
|
396
|
+
},
|
|
397
|
+
profiling=ProfileConfig.from_dict(data.get("profiling", {})),
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
def to_json(self, indent: int = 2) -> str:
|
|
401
|
+
return json.dumps(self.to_dict(), indent=indent)
|
|
402
|
+
|
|
403
|
+
@classmethod
|
|
404
|
+
def from_json(cls, json_str: str) -> "PipelineConfig":
|
|
405
|
+
return cls.from_dict(json.loads(json_str))
|
|
406
|
+
|
|
407
|
+
|
|
278
408
|
@dataclass
|
|
279
409
|
class ColumnTypeInfo:
|
|
280
410
|
column: str
|
|
@@ -35,11 +35,13 @@ from ._target_profiler import TargetProfiler
|
|
|
35
35
|
from ._correlation_profiler import CorrelationProfiler
|
|
36
36
|
from ._type_detector import TypeDetector
|
|
37
37
|
from .config import (
|
|
38
|
-
|
|
38
|
+
PipelineConfig,
|
|
39
|
+
PipelinePhase,
|
|
39
40
|
ColumnProfile,
|
|
40
41
|
StructuralProfileResult,
|
|
41
42
|
RowMissingnessDistribution,
|
|
42
43
|
SemanticType,
|
|
44
|
+
TypeFlag,
|
|
43
45
|
Modality,
|
|
44
46
|
)
|
|
45
47
|
|
|
@@ -63,14 +65,16 @@ _COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = { #
|
|
|
63
65
|
|
|
64
66
|
class StructuralProfiler:
|
|
65
67
|
|
|
66
|
-
def __init__(self, config:
|
|
67
|
-
self.config = config or
|
|
68
|
+
def __init__(self, config: PipelineConfig | None = None) -> None:
|
|
69
|
+
self.config: PipelineConfig = config or PipelineConfig()
|
|
70
|
+
# Keep sub-profilers aligned with the master column_overrides.
|
|
71
|
+
self.config.profiling.column_overrides = self.config.column_overrides
|
|
68
72
|
|
|
69
|
-
if self.config.modality == Modality.Tabular:
|
|
70
|
-
self.modality_profiler: ModalityProfiler = TabularProfiler(
|
|
73
|
+
if self.config.profiling.modality == Modality.Tabular:
|
|
74
|
+
self.modality_profiler: ModalityProfiler = TabularProfiler()
|
|
71
75
|
else:
|
|
72
76
|
raise NotImplementedError(
|
|
73
|
-
f"modality {self.config.modality} not supported yet"
|
|
77
|
+
f"modality {self.config.profiling.modality} not supported yet"
|
|
74
78
|
)
|
|
75
79
|
|
|
76
80
|
# ------------------------------------------------------------------
|
|
@@ -86,7 +90,17 @@ class StructuralProfiler:
|
|
|
86
90
|
|
|
87
91
|
result = StructuralProfileResult()
|
|
88
92
|
|
|
89
|
-
active_cols =
|
|
93
|
+
active_cols = self.config.resolve_active_columns(
|
|
94
|
+
PipelinePhase.Profiling, list(data.columns)
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Columns soft-excluded for Profiling: skipped but retained in the result.
|
|
98
|
+
hard_set = set(self.config.exclude_columns)
|
|
99
|
+
soft_retained = [
|
|
100
|
+
c for c in data.columns
|
|
101
|
+
if c in set(self.config.phase_exclusions.get(PipelinePhase.Profiling, []))
|
|
102
|
+
and c not in hard_set
|
|
103
|
+
]
|
|
90
104
|
|
|
91
105
|
# ── 1. Modality profiler ─────────────────────────────────────────
|
|
92
106
|
# Replaces default DatasetStats with the real one (row_count, memory,
|
|
@@ -96,7 +110,7 @@ class StructuralProfiler:
|
|
|
96
110
|
# ── 2. Missingness pre-pass ──────────────────────────────────────
|
|
97
111
|
# setdefault creates ColumnProfile entries; subsequent steps mutate
|
|
98
112
|
# the same objects via the same setdefault pattern.
|
|
99
|
-
missingness_result = MissingnessProfiler(
|
|
113
|
+
missingness_result = MissingnessProfiler().profile(
|
|
100
114
|
data, columns=active_cols
|
|
101
115
|
)
|
|
102
116
|
for col_name in missingness_result.analysed_columns:
|
|
@@ -111,7 +125,6 @@ class StructuralProfiler:
|
|
|
111
125
|
df=data,
|
|
112
126
|
cols=active_cols,
|
|
113
127
|
n_rows=data.height,
|
|
114
|
-
overrides=self.config.column_overrides,
|
|
115
128
|
)
|
|
116
129
|
|
|
117
130
|
# ── 4. Type detection ────────────────────────────────────────────
|
|
@@ -130,7 +143,10 @@ class StructuralProfiler:
|
|
|
130
143
|
# Overrides for excluded / non-existent columns are silently ignored.
|
|
131
144
|
for col_name, override_type in self.config.column_overrides.items():
|
|
132
145
|
if col_name in result.columns:
|
|
133
|
-
result.columns[col_name]
|
|
146
|
+
cp = result.columns[col_name]
|
|
147
|
+
cp.semantic_type = override_type
|
|
148
|
+
if TypeFlag.UserOverride not in cp.type_flags:
|
|
149
|
+
cp.type_flags.append(TypeFlag.UserOverride)
|
|
134
150
|
|
|
135
151
|
# ── 6. Per-column profiling routed by SemanticType ───────────────
|
|
136
152
|
# Batch all columns of the same SemanticType together and call each
|
|
@@ -149,7 +165,7 @@ class StructuralProfiler:
|
|
|
149
165
|
profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
|
|
150
166
|
if profiler_cls is None:
|
|
151
167
|
continue
|
|
152
|
-
profiler = profiler_cls(
|
|
168
|
+
profiler = profiler_cls()
|
|
153
169
|
try:
|
|
154
170
|
batch = profiler.profile(data, columns=cols)
|
|
155
171
|
for col_name in batch.analysed_columns:
|
|
@@ -161,13 +177,13 @@ class StructuralProfiler:
|
|
|
161
177
|
# ── 7. Target columns ────────────────────────────────────────────
|
|
162
178
|
# TargetProfiler produces target-specific analysis stored in
|
|
163
179
|
# result.targets. cp.stats is NOT overwritten — step 6 already set it.
|
|
164
|
-
if self.config.target_columns:
|
|
165
|
-
for target in self.config.target_columns:
|
|
180
|
+
if self.config.profiling.target_columns:
|
|
181
|
+
for target in self.config.profiling.target_columns:
|
|
166
182
|
if target not in data.columns:
|
|
167
183
|
continue
|
|
168
184
|
target_result = TargetProfiler(
|
|
169
185
|
target_column=target,
|
|
170
|
-
config=self.config,
|
|
186
|
+
config=self.config.profiling,
|
|
171
187
|
).profile(data)
|
|
172
188
|
result.targets[target] = target_result
|
|
173
189
|
|
|
@@ -176,7 +192,7 @@ class StructuralProfiler:
|
|
|
176
192
|
cp.is_target = True
|
|
177
193
|
|
|
178
194
|
# ── 8. Correlation ───────────────────────────────────────────────
|
|
179
|
-
if self.config.compute_correlation:
|
|
195
|
+
if self.config.profiling.compute_correlation:
|
|
180
196
|
# Resolve column lists by detected SemanticType (post-override).
|
|
181
197
|
numeric_cols = [
|
|
182
198
|
c
|
|
@@ -194,7 +210,7 @@ class StructuralProfiler:
|
|
|
194
210
|
corr_profiler = CorrelationProfiler(
|
|
195
211
|
numeric_columns=numeric_cols,
|
|
196
212
|
categorical_columns=categorical_cols,
|
|
197
|
-
config=self.config,
|
|
213
|
+
config=self.config.profiling,
|
|
198
214
|
)
|
|
199
215
|
|
|
200
216
|
# 8a. Feature-feature matrices — computed ONCE, target-independent.
|
|
@@ -205,7 +221,7 @@ class StructuralProfiler:
|
|
|
205
221
|
|
|
206
222
|
# 8b. Per-target analysis — matrices are NOT recomputed; each call
|
|
207
223
|
# shallow-copies feature_corr and appends target-specific fields.
|
|
208
|
-
for target in self.config.target_columns:
|
|
224
|
+
for target in self.config.profiling.target_columns:
|
|
209
225
|
if target not in data.columns:
|
|
210
226
|
continue
|
|
211
227
|
result.dataset.target_correlations[target] = (
|
|
@@ -214,6 +230,12 @@ class StructuralProfiler:
|
|
|
214
230
|
)
|
|
215
231
|
)
|
|
216
232
|
|
|
233
|
+
# ── Soft-excluded placeholders ───────────────────────────────────────
|
|
234
|
+
# Columns soft-excluded for Profiling are not profiled but must still
|
|
235
|
+
# appear in the result so downstream phases can reference them.
|
|
236
|
+
for col in soft_retained:
|
|
237
|
+
result.columns.setdefault(col, ColumnProfile(name=col))
|
|
238
|
+
|
|
217
239
|
return result
|
|
218
240
|
|
|
219
241
|
# ------------------------------------------------------------------
|
|
@@ -225,9 +247,8 @@ class StructuralProfiler:
|
|
|
225
247
|
df: pl.DataFrame,
|
|
226
248
|
cols: list[str],
|
|
227
249
|
n_rows: int,
|
|
228
|
-
overrides: dict[str, SemanticType],
|
|
229
250
|
) -> RowMissingnessDistribution:
|
|
230
|
-
from .
|
|
251
|
+
from ._null_detection import (
|
|
231
252
|
_sentinel_eligible,
|
|
232
253
|
_inf_eligible,
|
|
233
254
|
_SENTINEL_STRINGS,
|
|
@@ -242,10 +263,9 @@ class StructuralProfiler:
|
|
|
242
263
|
|
|
243
264
|
for col_name in cols:
|
|
244
265
|
dtype = df[col_name].dtype
|
|
245
|
-
override = overrides.get(col_name)
|
|
246
266
|
null_e = pl.col(col_name).is_null()
|
|
247
267
|
|
|
248
|
-
if _sentinel_eligible(dtype
|
|
268
|
+
if _sentinel_eligible(dtype):
|
|
249
269
|
eff = (
|
|
250
270
|
null_e
|
|
251
271
|
| (pl.col(col_name).str.strip_chars() == "")
|
|
@@ -82,8 +82,6 @@ _EXT_LOADERS: dict[str, callable] = {
|
|
|
82
82
|
|
|
83
83
|
|
|
84
84
|
class DataLoader:
|
|
85
|
-
def __init__(self, fmt: str | None = None) -> None:
|
|
86
|
-
self._fmt_override = fmt.lower() if fmt else None
|
|
87
85
|
|
|
88
86
|
def load(
|
|
89
87
|
self,
|
|
@@ -92,7 +90,7 @@ class DataLoader:
|
|
|
92
90
|
) -> pl.DataFrame:
|
|
93
91
|
raw, ext_from_path = _read_raw(source)
|
|
94
92
|
|
|
95
|
-
resolved_fmt = (
|
|
93
|
+
resolved_fmt = (ext_from_path or "").lower()
|
|
96
94
|
|
|
97
95
|
if resolved_fmt not in _EXT_LOADERS:
|
|
98
96
|
label = resolved_fmt if resolved_fmt else "<unknown>"
|
|
@@ -22,6 +22,7 @@ src/dataforge_ml/profiling/_datetime_config.py
|
|
|
22
22
|
src/dataforge_ml/profiling/_datetime_profiler.py
|
|
23
23
|
src/dataforge_ml/profiling/_missingness_config.py
|
|
24
24
|
src/dataforge_ml/profiling/_missingness_profiler.py
|
|
25
|
+
src/dataforge_ml/profiling/_null_detection.py
|
|
25
26
|
src/dataforge_ml/profiling/_numeric_config.py
|
|
26
27
|
src/dataforge_ml/profiling/_numeric_profiler.py
|
|
27
28
|
src/dataforge_ml/profiling/_tabular.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|