dataforge-ml 0.8.0__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/PKG-INFO +3 -3
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/README.md +2 -2
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/pyproject.toml +1 -1
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/__init__.py +4 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_base.py +11 -14
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_boolean_profiler.py +4 -41
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_categorical.py +3 -44
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_profiler.py +1 -1
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_datetime_profiler.py +7 -34
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_profiler.py +8 -69
- dataforge_ml-0.10.0/src/dataforge_ml/profiling/_null_detection.py +22 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_numeric_profiler.py +3 -32
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_tabular.py +25 -26
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_target_profiler.py +3 -3
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_text_profiler.py +4 -42
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/config.py +101 -39
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/structural.py +34 -20
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/PKG-INFO +3 -3
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/SOURCES.txt +1 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/LICENSE +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/setup.cfg +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.8.0
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -21,9 +21,9 @@ Provides-Extra: dev
|
|
|
21
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
22
22
|
Dynamic: license-file
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# DataForgeML
|
|
25
25
|
|
|
26
|
-
Automated feature engineering and data profiling pipeline library for
|
|
26
|
+
Automated feature engineering and data profiling pipeline library for datasets.
|
|
27
27
|
|
|
28
28
|
## Installation
|
|
29
29
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from .structural import StructuralProfiler
|
|
2
2
|
from .config import (
|
|
3
3
|
ProfileConfig,
|
|
4
|
+
PipelineConfig,
|
|
5
|
+
PipelinePhase,
|
|
4
6
|
SemanticType,
|
|
5
7
|
Modality,
|
|
6
8
|
TypeFlag,
|
|
@@ -19,6 +21,8 @@ from ._base import ModalityProfiler
|
|
|
19
21
|
__all__ = [
|
|
20
22
|
"StructuralProfiler",
|
|
21
23
|
"ProfileConfig",
|
|
24
|
+
"PipelineConfig",
|
|
25
|
+
"PipelinePhase",
|
|
22
26
|
"SemanticType",
|
|
23
27
|
"Modality",
|
|
24
28
|
"TypeFlag",
|
|
@@ -3,9 +3,9 @@ Abstract base classes for all structural profilers.
|
|
|
3
3
|
|
|
4
4
|
Hierarchy
|
|
5
5
|
---------
|
|
6
|
-
Profiling[R] — root:
|
|
7
|
-
├── ColumnBatchProfiler[R] — registry tier:
|
|
8
|
-
│ │
|
|
6
|
+
Profiling[R] — root: thin ABC, provides _resolve_columns
|
|
7
|
+
├── ColumnBatchProfiler[R] — registry tier: profile(df, columns) processes a
|
|
8
|
+
│ │ typed column batch; no config, no eligibility gates
|
|
9
9
|
│ ├── NumericProfiler
|
|
10
10
|
│ ├── CategoricalProfiler
|
|
11
11
|
│ ├── DatetimeProfiler
|
|
@@ -26,22 +26,19 @@ import polars as pl
|
|
|
26
26
|
from abc import abstractmethod, ABC
|
|
27
27
|
from typing import Generic, TypeVar
|
|
28
28
|
|
|
29
|
-
from .config import DatasetStats
|
|
29
|
+
from .config import DatasetStats
|
|
30
30
|
|
|
31
31
|
R = TypeVar("R")
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
class Profiling(ABC, Generic[R]):
|
|
35
35
|
"""
|
|
36
|
-
Root base for all profilers.
|
|
36
|
+
Root base for all profilers. Thin ABC — no config state.
|
|
37
37
|
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
Sub-processors are pure batch processors: given a DataFrame and a column
|
|
39
|
+
list, return a result. No routing, no scoping, no config.
|
|
40
40
|
"""
|
|
41
41
|
|
|
42
|
-
def __init__(self, config: ProfileConfig | None = None):
|
|
43
|
-
self.config = config or ProfileConfig()
|
|
44
|
-
|
|
45
42
|
@abstractmethod
|
|
46
43
|
def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
|
|
47
44
|
|
|
@@ -62,11 +59,11 @@ class ColumnBatchProfiler(Profiling[R]):
|
|
|
62
59
|
|
|
63
60
|
Contract
|
|
64
61
|
--------
|
|
65
|
-
- __init__
|
|
66
|
-
StructuralProfiler to instantiate any registered profiler uniformly via
|
|
67
|
-
profiler_cls(config=self.config)
|
|
62
|
+
- __init__ takes no arguments (instantiated as profiler_cls()).
|
|
68
63
|
- profile(df, columns) receives the full DataFrame and the list of same-type
|
|
69
|
-
column names to process.
|
|
64
|
+
column names to process. Profiles every column in the list without any
|
|
65
|
+
internal eligibility gate or config consultation.
|
|
66
|
+
- Returns a result with:
|
|
70
67
|
.columns: dict[str, <Stats>] — per-column stats
|
|
71
68
|
.analysed_columns: list[str] — columns actually profiled
|
|
72
69
|
"""
|
|
@@ -22,11 +22,7 @@ from __future__ import annotations
|
|
|
22
22
|
import polars as pl
|
|
23
23
|
|
|
24
24
|
from ._base import ColumnBatchProfiler
|
|
25
|
-
from .config import (
|
|
26
|
-
ProfileConfig,
|
|
27
|
-
BooleanStats,
|
|
28
|
-
SemanticType,
|
|
29
|
-
)
|
|
25
|
+
from .config import BooleanStats
|
|
30
26
|
from ._boolean_config import BooleanProfileResult
|
|
31
27
|
from ..models._data_types import _INT_DTYPES
|
|
32
28
|
|
|
@@ -42,22 +38,10 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
42
38
|
"""
|
|
43
39
|
Boolean column profiler for Polars DataFrames.
|
|
44
40
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
- Its dtype is an integer with values exclusively in {0, 1}, OR
|
|
48
|
-
- It has a SemanticType.Boolean override in ProfileConfig.column_overrides
|
|
49
|
-
|
|
50
|
-
Non-eligible columns in the provided list are silently skipped.
|
|
51
|
-
|
|
52
|
-
Parameters
|
|
53
|
-
----------
|
|
54
|
-
config : ProfileConfig | None
|
|
55
|
-
Shared profiling configuration.
|
|
41
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
42
|
+
no internal eligibility gate.
|
|
56
43
|
"""
|
|
57
44
|
|
|
58
|
-
def __init__(self, config: ProfileConfig | None = None) -> None:
|
|
59
|
-
super().__init__(config)
|
|
60
|
-
|
|
61
45
|
# ------------------------------------------------------------------
|
|
62
46
|
# Public API
|
|
63
47
|
# ------------------------------------------------------------------
|
|
@@ -69,23 +53,6 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
69
53
|
) -> BooleanProfileResult:
|
|
70
54
|
return self._run(data, columns)
|
|
71
55
|
|
|
72
|
-
# ------------------------------------------------------------------
|
|
73
|
-
# Eligibility
|
|
74
|
-
# ------------------------------------------------------------------
|
|
75
|
-
|
|
76
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
77
|
-
override = self.config.column_overrides.get(series.name)
|
|
78
|
-
|
|
79
|
-
# Explicit override — trust it
|
|
80
|
-
if override == SemanticType.Boolean:
|
|
81
|
-
return True
|
|
82
|
-
|
|
83
|
-
# Another override takes precedence over auto-detection
|
|
84
|
-
if override is not None:
|
|
85
|
-
return False
|
|
86
|
-
|
|
87
|
-
return True
|
|
88
|
-
|
|
89
56
|
# ------------------------------------------------------------------
|
|
90
57
|
# Orchestration
|
|
91
58
|
# ------------------------------------------------------------------
|
|
@@ -97,11 +64,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
97
64
|
) -> BooleanProfileResult:
|
|
98
65
|
result = BooleanProfileResult()
|
|
99
66
|
|
|
100
|
-
available = [
|
|
101
|
-
c
|
|
102
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
103
|
-
if self._eligible(df[c])
|
|
104
|
-
]
|
|
67
|
+
available = self._resolve_columns(df.columns, columns)
|
|
105
68
|
result.analysed_columns = available
|
|
106
69
|
|
|
107
70
|
for col_name in available:
|
|
@@ -45,10 +45,6 @@ from ._categorical_config import (
|
|
|
45
45
|
RareCategoryStats,
|
|
46
46
|
ImbalanceMetrics,
|
|
47
47
|
)
|
|
48
|
-
from .config import (
|
|
49
|
-
ProfileConfig,
|
|
50
|
-
SemanticType,
|
|
51
|
-
)
|
|
52
48
|
|
|
53
49
|
# ---------------------------------------------------------------------------
|
|
54
50
|
# Module-level thresholds (documented so callers can see what drives flags)
|
|
@@ -65,29 +61,10 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
65
61
|
"""
|
|
66
62
|
Categorical profiler for Polars DataFrames.
|
|
67
63
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
columns : list[str]
|
|
71
|
-
Columns to profile. The profiler intersects this list with
|
|
72
|
-
the DataFrame's actual columns at runtime.
|
|
73
|
-
config : ProfileConfig | None
|
|
74
|
-
Shared profiling configuration (used for chunk_size, etc.).
|
|
75
|
-
|
|
76
|
-
Usage
|
|
77
|
-
-----
|
|
78
|
-
>>> profiler = CategoricalProfiler(
|
|
79
|
-
... columns=["status", "country", "product_type"],
|
|
80
|
-
... )
|
|
81
|
-
>>> result = profiler.profile(df)
|
|
82
|
-
>>> print(result)
|
|
64
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
65
|
+
no internal eligibility gate.
|
|
83
66
|
"""
|
|
84
67
|
|
|
85
|
-
def __init__(
|
|
86
|
-
self,
|
|
87
|
-
config: ProfileConfig | None = None,
|
|
88
|
-
) -> None:
|
|
89
|
-
super().__init__(config)
|
|
90
|
-
|
|
91
68
|
# ------------------------------------------------------------------
|
|
92
69
|
# Public API
|
|
93
70
|
# ------------------------------------------------------------------
|
|
@@ -103,19 +80,6 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
103
80
|
# Orchestration
|
|
104
81
|
# ------------------------------------------------------------------
|
|
105
82
|
|
|
106
|
-
def _eligible(
|
|
107
|
-
self,
|
|
108
|
-
series: pl.Series,
|
|
109
|
-
) -> bool:
|
|
110
|
-
override = self.config.column_overrides.get(series.name)
|
|
111
|
-
if override == SemanticType.Categorical:
|
|
112
|
-
return True
|
|
113
|
-
|
|
114
|
-
if override is not None:
|
|
115
|
-
return False
|
|
116
|
-
|
|
117
|
-
return True
|
|
118
|
-
|
|
119
83
|
def _run(
|
|
120
84
|
self,
|
|
121
85
|
df: pl.DataFrame,
|
|
@@ -123,12 +87,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
123
87
|
) -> CategoricalProfileResult:
|
|
124
88
|
result = CategoricalProfileResult()
|
|
125
89
|
|
|
126
|
-
|
|
127
|
-
available = [
|
|
128
|
-
c
|
|
129
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
130
|
-
if self._eligible(df[c])
|
|
131
|
-
]
|
|
90
|
+
available = self._resolve_columns(df.columns, columns)
|
|
132
91
|
result.analysed_columns = available
|
|
133
92
|
|
|
134
93
|
n_rows = df.height
|
{dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
@@ -125,7 +125,7 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
125
125
|
near_redundant_threshold: float = _NEAR_REDUNDANT_THRESHOLD,
|
|
126
126
|
top_n_feature_target: int = _TOP_N_FEATURE_TARGET,
|
|
127
127
|
) -> None:
|
|
128
|
-
super().__init__(
|
|
128
|
+
super().__init__()
|
|
129
129
|
self._numeric_columns = numeric_columns
|
|
130
130
|
self._categorical_columns = categorical_columns or []
|
|
131
131
|
self._threshold = near_redundant_threshold
|
|
@@ -43,10 +43,6 @@ from datetime import datetime, timezone
|
|
|
43
43
|
import polars as pl
|
|
44
44
|
|
|
45
45
|
from ._base import ColumnBatchProfiler
|
|
46
|
-
from .config import (
|
|
47
|
-
ProfileConfig,
|
|
48
|
-
SemanticType,
|
|
49
|
-
)
|
|
50
46
|
from ._datetime_config import (
|
|
51
47
|
DatetimeProfileResult,
|
|
52
48
|
DatetimeStats,
|
|
@@ -90,20 +86,11 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
90
86
|
"""
|
|
91
87
|
Datetime distribution profiler for Polars DataFrames.
|
|
92
88
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
columns
|
|
96
|
-
Columns to profile. Non-datetime columns are skipped with a warning.
|
|
97
|
-
config : ProfileConfig | None
|
|
98
|
-
Shared profiling configuration.
|
|
89
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
90
|
+
no internal eligibility gate. String columns are coerced to Datetime;
|
|
91
|
+
columns that cannot be coerced are silently skipped.
|
|
99
92
|
"""
|
|
100
93
|
|
|
101
|
-
def __init__(
|
|
102
|
-
self,
|
|
103
|
-
config: ProfileConfig | None = None,
|
|
104
|
-
) -> None:
|
|
105
|
-
super().__init__(config)
|
|
106
|
-
|
|
107
94
|
# ------------------------------------------------------------------
|
|
108
95
|
# Public API
|
|
109
96
|
# ------------------------------------------------------------------
|
|
@@ -119,35 +106,21 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
119
106
|
# Orchestration
|
|
120
107
|
# ------------------------------------------------------------------
|
|
121
108
|
|
|
122
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
123
|
-
override = self.config.column_overrides.get(series.name)
|
|
124
|
-
|
|
125
|
-
if override == SemanticType.Datetime:
|
|
126
|
-
return True
|
|
127
|
-
if override is not None:
|
|
128
|
-
return False
|
|
129
|
-
|
|
130
|
-
return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
|
|
131
|
-
|
|
132
109
|
def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
|
|
133
110
|
if series.dtype in (pl.Utf8, pl.String):
|
|
134
111
|
coerced = series.str.to_datetime(strict=False)
|
|
135
112
|
return coerced if coerced.drop_nulls().len() > 0 else None
|
|
136
|
-
|
|
113
|
+
if _is_datetime_dtype(series.dtype):
|
|
114
|
+
return series
|
|
115
|
+
return None
|
|
137
116
|
|
|
138
117
|
def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
|
|
139
118
|
result = DatetimeProfileResult()
|
|
140
119
|
now = datetime.now(tz=timezone.utc)
|
|
141
120
|
|
|
142
|
-
candidates = [
|
|
143
|
-
c
|
|
144
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
145
|
-
if self._eligible(df[c])
|
|
146
|
-
]
|
|
147
|
-
|
|
148
121
|
available = []
|
|
149
122
|
coerced_cache = {}
|
|
150
|
-
for col_name in candidates:
|
|
123
|
+
for col_name in self._resolve_columns(df.columns, columns):
|
|
151
124
|
series = self._coerce_to_datetime(df[col_name])
|
|
152
125
|
if series is not None:
|
|
153
126
|
available.append(col_name)
|
{dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
@@ -3,19 +3,10 @@ MissingnessProfiler – Phase 1 extension: Missingness Profiling.
|
|
|
3
3
|
|
|
4
4
|
Eligibility model
|
|
5
5
|
-----------------
|
|
6
|
-
Effective-null detection is
|
|
7
|
-
overrides acting only as suppressors, not as enablers:
|
|
6
|
+
Effective-null detection is purely dtype-driven — no SemanticType overrides:
|
|
8
7
|
|
|
9
|
-
sentinel-string detection → runs
|
|
10
|
-
|
|
11
|
-
(those types cannot have meaningful sentinel strings)
|
|
12
|
-
|
|
13
|
-
Inf / NaN expansion → runs when dtype is Float32/Float64
|
|
14
|
-
never suppressed (Inf in a float column is always
|
|
15
|
-
effectively missing regardless of semantic label)
|
|
16
|
-
|
|
17
|
-
column_overrides is SPARSE — most columns will have no entry.
|
|
18
|
-
Absence of an override is not a signal; it means "trust the dtype".
|
|
8
|
+
sentinel-string detection → runs for every String/Utf8 column unconditionally
|
|
9
|
+
Inf / NaN expansion → runs for every Float32/Float64 column unconditionally
|
|
19
10
|
"""
|
|
20
11
|
|
|
21
12
|
from __future__ import annotations
|
|
@@ -24,13 +15,13 @@ from __future__ import annotations
|
|
|
24
15
|
import polars as pl
|
|
25
16
|
|
|
26
17
|
from ._base import DatasetLevelProfiler
|
|
27
|
-
from .config import ProfileConfig, SemanticType
|
|
28
18
|
from ._missingness_config import (
|
|
29
19
|
ColumnMissingnessProfile,
|
|
30
20
|
MissingnessFlag,
|
|
31
21
|
MissingnessProfileResult,
|
|
32
22
|
MissingSeverity,
|
|
33
23
|
)
|
|
24
|
+
from ._null_detection import _SENTINEL_STRINGS, _inf_eligible, _sentinel_eligible
|
|
34
25
|
|
|
35
26
|
# ---------------------------------------------------------------------------
|
|
36
27
|
# Thresholds
|
|
@@ -43,52 +34,12 @@ _SEVERITY_HIGH = 0.20
|
|
|
43
34
|
_MAR_CORRELATION_THRESHOLD = 0.60
|
|
44
35
|
_COL_DROP_THRESHOLD = 0.50
|
|
45
36
|
|
|
46
|
-
_SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
|
|
47
|
-
|
|
48
|
-
# Overrides that suppress sentinel-string detection on a String column.
|
|
49
|
-
# If a column is String but the user says "this is Numeric", treating
|
|
50
|
-
# "NA" as a sentinel is correct — but if they say Categorical or Text,
|
|
51
|
-
# sentinel detection still makes sense and should run.
|
|
52
|
-
_SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
|
|
53
|
-
{
|
|
54
|
-
SemanticType.Numeric,
|
|
55
|
-
SemanticType.Datetime,
|
|
56
|
-
SemanticType.Boolean,
|
|
57
|
-
SemanticType.Identifier,
|
|
58
|
-
}
|
|
59
|
-
)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
|
|
63
|
-
"""True when sentinel-string detection should run for this column."""
|
|
64
|
-
if dtype not in (pl.Utf8, pl.String):
|
|
65
|
-
return False
|
|
66
|
-
# Override present and it's a non-text semantic → suppress
|
|
67
|
-
if override is not None and override in _SENTINEL_SUPPRESSING_SEMANTICS:
|
|
68
|
-
return False
|
|
69
|
-
return True
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def _inf_eligible(dtype: pl.DataType) -> bool:
|
|
73
|
-
"""True when Inf/NaN expansion should run. Always dtype-driven, never suppressed."""
|
|
74
|
-
return dtype in (pl.Float32, pl.Float64)
|
|
75
|
-
|
|
76
37
|
|
|
77
38
|
class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
78
|
-
"""
|
|
79
|
-
Missingness profiler for Polars DataFrames.
|
|
80
|
-
|
|
81
|
-
Column scoping
|
|
82
|
-
--------------
|
|
83
|
-
Resolution priority (high → low):
|
|
84
|
-
1. Explicit ``columns`` argument to ``profile()``.
|
|
85
|
-
2. ``config.exclude_columns`` — always removed.
|
|
86
|
-
3. All remaining DataFrame columns.
|
|
87
|
-
"""
|
|
39
|
+
"""Missingness profiler for Polars DataFrames."""
|
|
88
40
|
|
|
89
|
-
def __init__(self
|
|
90
|
-
super().__init__(
|
|
91
|
-
self._config: ProfileConfig = config or ProfileConfig()
|
|
41
|
+
def __init__(self) -> None:
|
|
42
|
+
super().__init__()
|
|
92
43
|
|
|
93
44
|
# ------------------------------------------------------------------
|
|
94
45
|
# Public API
|
|
@@ -117,16 +68,13 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
117
68
|
if n_rows == 0 or not cols:
|
|
118
69
|
return result
|
|
119
70
|
|
|
120
|
-
overrides = self._config.column_overrides # sparse — most keys absent
|
|
121
71
|
indicator_cols: list[pl.Series] = []
|
|
122
72
|
|
|
123
73
|
for col_name in cols:
|
|
124
|
-
override = overrides.get(col_name) # None for most columns
|
|
125
74
|
col_profile, indicator = self._profile_column(
|
|
126
75
|
series=df[col_name],
|
|
127
76
|
col_name=col_name,
|
|
128
77
|
n_rows=n_rows,
|
|
129
|
-
override=override,
|
|
130
78
|
)
|
|
131
79
|
result.columns[col_name] = col_profile
|
|
132
80
|
indicator_cols.append(indicator)
|
|
@@ -173,21 +121,12 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
173
121
|
series: pl.Series,
|
|
174
122
|
col_name: str,
|
|
175
123
|
n_rows: int,
|
|
176
|
-
override: SemanticType | None = None, # sparse — None is the common case
|
|
177
124
|
) -> tuple[ColumnMissingnessProfile, pl.Series]:
|
|
178
|
-
"""
|
|
179
|
-
Compute standard + effective null counts for one column.
|
|
180
|
-
|
|
181
|
-
Eligibility is dtype-first:
|
|
182
|
-
- sentinel strings → String dtype, unless override suppresses it
|
|
183
|
-
- Inf/NaN → Float dtype, always (never suppressed)
|
|
184
|
-
- everything else → standard Polars null only
|
|
185
|
-
"""
|
|
186
125
|
profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
|
|
187
126
|
dtype = series.dtype
|
|
188
127
|
std_null = series.is_null()
|
|
189
128
|
|
|
190
|
-
if _sentinel_eligible(dtype, override):
|
|
129
|
+
if _sentinel_eligible(dtype):
|
|
191
130
|
eff_null = (
|
|
192
131
|
std_null
|
|
193
132
|
| (series.str.strip_chars() == "")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
_null_detection – shared dtype-driven null primitives for Phase 1.
|
|
3
|
+
|
|
4
|
+
Single authority for what counts as "effectively null" across the entire
|
|
5
|
+
Phase 1 implementation. No config, no SemanticType overrides, no state.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
|
|
12
|
+
_SENTINEL_STRINGS: frozenset[str] = frozenset({"NA", "NAN", "NULL", "NONE", "?"})
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _sentinel_eligible(dtype: pl.DataType) -> bool:
|
|
16
|
+
"""True when sentinel-string detection should run for this column (String/Utf8 only)."""
|
|
17
|
+
return dtype in (pl.Utf8, pl.String)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _inf_eligible(dtype: pl.DataType) -> bool:
|
|
21
|
+
"""True when Inf/NaN expansion should run (Float32/Float64 only)."""
|
|
22
|
+
return dtype in (pl.Float32, pl.Float64)
|
|
@@ -35,10 +35,6 @@ from __future__ import annotations
|
|
|
35
35
|
import polars as pl
|
|
36
36
|
|
|
37
37
|
from ._base import ColumnBatchProfiler
|
|
38
|
-
from .config import (
|
|
39
|
-
ProfileConfig,
|
|
40
|
-
SemanticType,
|
|
41
|
-
)
|
|
42
38
|
from ._correlation_profiler import _INT_DTYPES
|
|
43
39
|
from ._numeric_config import (
|
|
44
40
|
NumericProfileResult,
|
|
@@ -80,21 +76,10 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
80
76
|
"""
|
|
81
77
|
Numeric distribution profiler for Polars DataFrames.
|
|
82
78
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
columns : list[str]
|
|
86
|
-
Columns to profile. Non-numeric or absent columns are skipped
|
|
87
|
-
with a warning; they do not raise.
|
|
88
|
-
config : ProfileConfig | None
|
|
89
|
-
Shared profiling configuration.
|
|
79
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
80
|
+
no internal eligibility gate.
|
|
90
81
|
"""
|
|
91
82
|
|
|
92
|
-
def __init__(
|
|
93
|
-
self,
|
|
94
|
-
config: ProfileConfig | None = None,
|
|
95
|
-
) -> None:
|
|
96
|
-
super().__init__(config)
|
|
97
|
-
|
|
98
83
|
# ------------------------------------------------------------------
|
|
99
84
|
# Public API
|
|
100
85
|
# ------------------------------------------------------------------
|
|
@@ -110,16 +95,6 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
110
95
|
# Orchestration
|
|
111
96
|
# ------------------------------------------------------------------
|
|
112
97
|
|
|
113
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
114
|
-
override = self.config.column_overrides.get(series.name)
|
|
115
|
-
if override == SemanticType.Numeric:
|
|
116
|
-
return True
|
|
117
|
-
|
|
118
|
-
if override is not None:
|
|
119
|
-
return False
|
|
120
|
-
|
|
121
|
-
return True
|
|
122
|
-
|
|
123
98
|
def _run(
|
|
124
99
|
self,
|
|
125
100
|
df: pl.DataFrame,
|
|
@@ -128,11 +103,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
128
103
|
result = NumericProfileResult()
|
|
129
104
|
n_rows = df.height
|
|
130
105
|
|
|
131
|
-
available = [
|
|
132
|
-
c
|
|
133
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
134
|
-
if self._eligible(df[c])
|
|
135
|
-
]
|
|
106
|
+
available = self._resolve_columns(df.columns, columns)
|
|
136
107
|
result.analysed_columns = available
|
|
137
108
|
|
|
138
109
|
if not available:
|
|
@@ -3,16 +3,18 @@ TabularProfiler – Phase 1: Structural Profiling for tabular datasets.
|
|
|
3
3
|
|
|
4
4
|
All DataFrame operations use Polars (no pandas dependency).
|
|
5
5
|
|
|
6
|
+
A pipeline-agnostic data-catalog tool: receives the full raw DataFrame and
|
|
7
|
+
computes dataset-level stats over every column — no exclusion logic, no
|
|
8
|
+
config dependency.
|
|
9
|
+
|
|
6
10
|
Computes:
|
|
7
|
-
• row / column count (
|
|
11
|
+
• row / column count (full dataset)
|
|
8
12
|
• memory usage + per-column breakdown when threshold exceeded
|
|
9
|
-
• duplicate row count & ratio (
|
|
10
|
-
• overall sparsity (
|
|
11
|
-
• data-type detection (scoped to config.type_detection_columns;
|
|
12
|
-
skipped entirely when None)
|
|
13
|
+
• duplicate row count & ratio (all columns)
|
|
14
|
+
• overall sparsity (all columns)
|
|
13
15
|
|
|
14
16
|
Chunked processing is activated automatically when the DataFrame's
|
|
15
|
-
estimated memory exceeds
|
|
17
|
+
estimated memory exceeds _MEMORY_THRESHOLD_MB.
|
|
16
18
|
"""
|
|
17
19
|
|
|
18
20
|
from __future__ import annotations
|
|
@@ -24,31 +26,32 @@ import polars as pl
|
|
|
24
26
|
from ._base import ModalityProfiler
|
|
25
27
|
from .config import (
|
|
26
28
|
MemoryBreakdown,
|
|
27
|
-
ProfileConfig,
|
|
28
29
|
DatasetStats,
|
|
29
30
|
)
|
|
30
31
|
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Module-level constants (previously sourced from ProfileConfig)
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
_MEMORY_THRESHOLD_MB: float = 500.0
|
|
37
|
+
_CHUNK_SIZE: int = 100_000
|
|
38
|
+
|
|
31
39
|
|
|
32
40
|
class TabularProfiler(ModalityProfiler):
|
|
33
41
|
"""
|
|
34
42
|
Structural profiler for Polars DataFrames.
|
|
35
43
|
|
|
44
|
+
Pipeline-agnostic: accepts no constructor arguments and applies no column
|
|
45
|
+
filtering. Computes dataset-level stats (row count, column count, memory,
|
|
46
|
+
duplicate ratio, overall sparsity) over the complete DataFrame it receives.
|
|
47
|
+
|
|
36
48
|
Usage
|
|
37
49
|
-----
|
|
38
|
-
>>>
|
|
39
|
-
... duplicate_columns=["user_id", "event_time"],
|
|
40
|
-
... sparsity_columns=["age", "income", "postcode"],
|
|
41
|
-
... type_detection_columns=["age", "income", "postcode", "created_at"],
|
|
42
|
-
... memory_threshold_mb=200,
|
|
43
|
-
... )
|
|
44
|
-
>>> profiler = TabularProfiler(config=cfg)
|
|
50
|
+
>>> profiler = TabularProfiler()
|
|
45
51
|
>>> result = profiler.profile(df)
|
|
46
52
|
>>> print(result)
|
|
47
53
|
"""
|
|
48
54
|
|
|
49
|
-
def __init__(self, config: ProfileConfig | None = None):
|
|
50
|
-
super().__init__(config)
|
|
51
|
-
|
|
52
55
|
# ------------------------------------------------------------------
|
|
53
56
|
# Public API
|
|
54
57
|
# ------------------------------------------------------------------
|
|
@@ -77,17 +80,13 @@ class TabularProfiler(ModalityProfiler):
|
|
|
77
80
|
if result.row_count == 0:
|
|
78
81
|
return result
|
|
79
82
|
|
|
80
|
-
# 3.
|
|
83
|
+
# 3. Operate on all columns — no exclusion logic
|
|
81
84
|
all_cols: list[str] = df.columns
|
|
82
|
-
analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]
|
|
83
|
-
|
|
84
|
-
dup_cols = analysed_cols
|
|
85
|
-
missingness_cols = analysed_cols
|
|
86
85
|
|
|
87
86
|
if use_chunks:
|
|
88
|
-
self._chunked_metrics(df,
|
|
87
|
+
self._chunked_metrics(df, all_cols, all_cols, result)
|
|
89
88
|
else:
|
|
90
|
-
self._full_metrics(df,
|
|
89
|
+
self._full_metrics(df, all_cols, all_cols, result)
|
|
91
90
|
|
|
92
91
|
return result
|
|
93
92
|
|
|
@@ -136,7 +135,7 @@ class TabularProfiler(ModalityProfiler):
|
|
|
136
135
|
total_bytes = sum(col_bytes.values())
|
|
137
136
|
|
|
138
137
|
result.memory_bytes = total_bytes
|
|
139
|
-
threshold_bytes =
|
|
138
|
+
threshold_bytes = _MEMORY_THRESHOLD_MB * 1024 * 1024
|
|
140
139
|
|
|
141
140
|
if total_bytes > threshold_bytes:
|
|
142
141
|
result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)
|
|
@@ -189,7 +188,7 @@ class TabularProfiler(ModalityProfiler):
|
|
|
189
188
|
seen hashes — semantics match keep='first'.
|
|
190
189
|
Sparsity is accumulated as (missing_cells, total_cells).
|
|
191
190
|
"""
|
|
192
|
-
chunk_size =
|
|
191
|
+
chunk_size = _CHUNK_SIZE
|
|
193
192
|
n_chunks = math.ceil(result.row_count / chunk_size)
|
|
194
193
|
|
|
195
194
|
seen_hashes: set[int] = set()
|
|
@@ -36,7 +36,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
36
36
|
"""
|
|
37
37
|
|
|
38
38
|
def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
|
|
39
|
-
super().__init__(
|
|
39
|
+
super().__init__()
|
|
40
40
|
self.target_column = target_column
|
|
41
41
|
|
|
42
42
|
def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
|
|
@@ -129,7 +129,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
129
129
|
self, series: pl.Series, n_rows: int, result: TargetProfileResult
|
|
130
130
|
) -> None:
|
|
131
131
|
"""Generates categorical metrics and checks for class imbalance."""
|
|
132
|
-
cat_profiler = CategoricalProfiler(
|
|
132
|
+
cat_profiler = CategoricalProfiler()
|
|
133
133
|
|
|
134
134
|
# Internally compute cardinality, top values, and imbalance metrics
|
|
135
135
|
cat_profile = cat_profiler._profile_column(series, self.target_column, n_rows)
|
|
@@ -146,7 +146,7 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
146
146
|
self, series: pl.Series, n_rows: int, result: TargetProfileResult
|
|
147
147
|
) -> None:
|
|
148
148
|
"""Generates numeric metrics and checks for target skewness."""
|
|
149
|
-
num_profiler = NumericProfiler(
|
|
149
|
+
num_profiler = NumericProfiler()
|
|
150
150
|
|
|
151
151
|
col_name = series.name
|
|
152
152
|
num_result = num_profiler.profile(series.to_frame(), [col_name])
|
|
@@ -54,11 +54,7 @@ from __future__ import annotations
|
|
|
54
54
|
import polars as pl
|
|
55
55
|
|
|
56
56
|
from ._base import ColumnBatchProfiler
|
|
57
|
-
from .config import
|
|
58
|
-
ProfileConfig,
|
|
59
|
-
TextStats,
|
|
60
|
-
SemanticType,
|
|
61
|
-
)
|
|
57
|
+
from .config import TextStats
|
|
62
58
|
from ._text_config import TextProfileResult
|
|
63
59
|
|
|
64
60
|
# Regex that counts non-whitespace token runs — used with str.count_matches.
|
|
@@ -69,22 +65,10 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
69
65
|
"""
|
|
70
66
|
Free-text column profiler for Polars DataFrames.
|
|
71
67
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
``ProfileConfig.column_overrides``, OR
|
|
75
|
-
- Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.
|
|
76
|
-
|
|
77
|
-
Non-eligible columns are silently skipped.
|
|
78
|
-
|
|
79
|
-
Parameters
|
|
80
|
-
----------
|
|
81
|
-
config : ProfileConfig | None
|
|
82
|
-
Shared profiling configuration.
|
|
68
|
+
Profiles every column passed to profile(df, columns) — no config,
|
|
69
|
+
no internal eligibility gate.
|
|
83
70
|
"""
|
|
84
71
|
|
|
85
|
-
def __init__(self, config: ProfileConfig | None = None) -> None:
|
|
86
|
-
super().__init__(config)
|
|
87
|
-
|
|
88
72
|
# ------------------------------------------------------------------
|
|
89
73
|
# Public API
|
|
90
74
|
# ------------------------------------------------------------------
|
|
@@ -96,24 +80,6 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
96
80
|
) -> TextProfileResult:
|
|
97
81
|
return self._run(data, columns)
|
|
98
82
|
|
|
99
|
-
# ------------------------------------------------------------------
|
|
100
|
-
# Eligibility
|
|
101
|
-
# ------------------------------------------------------------------
|
|
102
|
-
|
|
103
|
-
def _eligible(self, series: pl.Series) -> bool:
|
|
104
|
-
override = self.config.column_overrides.get(series.name)
|
|
105
|
-
|
|
106
|
-
if override == SemanticType.Text:
|
|
107
|
-
return True
|
|
108
|
-
|
|
109
|
-
# Any other explicit override takes precedence
|
|
110
|
-
if override is not None:
|
|
111
|
-
return False
|
|
112
|
-
|
|
113
|
-
# Native string dtype (pl.Utf8 is the canonical name; pl.String is
|
|
114
|
-
# an alias in newer Polars — check both for cross-version safety)
|
|
115
|
-
return series.dtype in (pl.Utf8, pl.String)
|
|
116
|
-
|
|
117
83
|
# ------------------------------------------------------------------
|
|
118
84
|
# Orchestration
|
|
119
85
|
# ------------------------------------------------------------------
|
|
@@ -125,11 +91,7 @@ class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
|
|
|
125
91
|
) -> TextProfileResult:
|
|
126
92
|
result = TextProfileResult()
|
|
127
93
|
|
|
128
|
-
available =
|
|
129
|
-
c
|
|
130
|
-
for c in self._resolve_columns(df.columns, columns)
|
|
131
|
-
if self._eligible(df[c])
|
|
132
|
-
]
|
|
94
|
+
available = self._resolve_columns(df.columns, columns)
|
|
133
95
|
result.analysed_columns = available
|
|
134
96
|
|
|
135
97
|
for col_name in available:
|
|
@@ -52,6 +52,15 @@ class Modality(StrEnum):
|
|
|
52
52
|
# TimeSeries = "time_series"
|
|
53
53
|
|
|
54
54
|
|
|
55
|
+
class PipelinePhase(StrEnum):
|
|
56
|
+
Profiling = "profiling"
|
|
57
|
+
Imputation = "imputation"
|
|
58
|
+
OutlierDetection = "outlier_detection"
|
|
59
|
+
Normalization = "normalization"
|
|
60
|
+
Encoding = "encoding"
|
|
61
|
+
Scaling = "scaling"
|
|
62
|
+
|
|
63
|
+
|
|
55
64
|
# ---------------------------------------------------------------------------
|
|
56
65
|
# Type-detection enums — kept for TypeDetector compatibility
|
|
57
66
|
# ---------------------------------------------------------------------------
|
|
@@ -218,10 +227,6 @@ class ProfileConfig:
|
|
|
218
227
|
Data modality. Currently only Tabular is implemented.
|
|
219
228
|
target_column : Optional[str]
|
|
220
229
|
Name of the label/target column, if any.
|
|
221
|
-
column_overrides : dict[str, SemanticType]
|
|
222
|
-
Explicit semantic type assignments that override auto-detection.
|
|
223
|
-
exclude_columns : list[str]
|
|
224
|
-
Columns to skip entirely during profiling.
|
|
225
230
|
compute_correlation : bool
|
|
226
231
|
Whether to compute the feature-feature correlation matrix.
|
|
227
232
|
correlation_target_column : Optional[str]
|
|
@@ -234,29 +239,84 @@ class ProfileConfig:
|
|
|
234
239
|
|
|
235
240
|
modality: Modality = Modality.Tabular
|
|
236
241
|
target_columns: list[str] = field(default_factory=list)
|
|
237
|
-
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
238
|
-
exclude_columns: list[str] = field(default_factory=list)
|
|
239
242
|
compute_correlation: bool = False
|
|
240
243
|
correlation_target_column: Optional[str] = None
|
|
241
244
|
memory_threshold_mb: float = 500.0
|
|
242
245
|
chunk_size: int = 100_000
|
|
243
246
|
|
|
244
|
-
|
|
247
|
+
|
|
248
|
+
def to_dict(self) -> dict:
|
|
249
|
+
return {
|
|
250
|
+
"modality": str(self.modality),
|
|
251
|
+
"target_columns": list(self.target_columns),
|
|
252
|
+
"compute_correlation": self.compute_correlation,
|
|
253
|
+
"correlation_target_column": self.correlation_target_column,
|
|
254
|
+
"memory_threshold_mb": self.memory_threshold_mb,
|
|
255
|
+
"chunk_size": self.chunk_size,
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
@classmethod
|
|
259
|
+
def from_dict(cls, data: dict) -> ProfileConfig:
|
|
260
|
+
return cls(
|
|
261
|
+
modality=Modality(data.get("modality", Modality.Tabular)),
|
|
262
|
+
target_columns=list(data.get("target_columns", [])),
|
|
263
|
+
compute_correlation=bool(data.get("compute_correlation", False)),
|
|
264
|
+
correlation_target_column=data.get("correlation_target_column"),
|
|
265
|
+
memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
|
|
266
|
+
chunk_size=int(data.get("chunk_size", 100_000)),
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
def to_json(self) -> str:
|
|
270
|
+
return json.dumps(self.to_dict())
|
|
271
|
+
|
|
272
|
+
@classmethod
|
|
273
|
+
def from_json(cls, json_str: str) -> ProfileConfig:
|
|
274
|
+
return cls.from_dict(json.loads(json_str))
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
@dataclass
|
|
278
|
+
class PipelineConfig:
|
|
279
|
+
"""
|
|
280
|
+
Master configuration for the full 6-phase feature engineering pipeline.
|
|
281
|
+
|
|
282
|
+
Parameters
|
|
283
|
+
----------
|
|
284
|
+
exclude_columns : list[str]
|
|
285
|
+
Hard exclusions — columns dropped globally from every phase.
|
|
286
|
+
phase_exclusions : dict[PipelinePhase, list[str]]
|
|
287
|
+
Soft exclusions — columns bypassed for a specific phase but retained
|
|
288
|
+
in the dataset.
|
|
289
|
+
column_overrides : dict[str, SemanticType]
|
|
290
|
+
Explicit semantic type assignments respected by all downstream phases.
|
|
291
|
+
profiling : ProfileConfig
|
|
292
|
+
Phase 1-specific parameters (correlation, chunking, memory threshold).
|
|
293
|
+
"""
|
|
294
|
+
|
|
295
|
+
exclude_columns: list[str] = field(default_factory=list)
|
|
296
|
+
phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
|
|
297
|
+
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
298
|
+
profiling: ProfileConfig = field(default_factory=ProfileConfig)
|
|
299
|
+
|
|
300
|
+
def resolve_active_columns(
|
|
301
|
+
self, phase: PipelinePhase, available_columns: list[str]
|
|
302
|
+
) -> list[str]:
|
|
245
303
|
"""
|
|
246
|
-
|
|
304
|
+
Return the columns the given phase should operate on.
|
|
247
305
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
306
|
+
Hard exclusions are applied first, then phase-specific soft exclusions.
|
|
307
|
+
Columns absent from available_columns are silently ignored in both lists.
|
|
308
|
+
"""
|
|
309
|
+
hard_set = set(self.exclude_columns)
|
|
310
|
+
soft_set = set(self.phase_exclusions.get(phase, []))
|
|
311
|
+
excluded = hard_set | soft_set
|
|
312
|
+
return [c for c in available_columns if c not in excluded]
|
|
313
|
+
|
|
314
|
+
def set_column_type(
|
|
315
|
+
self, column: str, semantic_type: Union[str, "SemanticType"]
|
|
316
|
+
) -> None:
|
|
317
|
+
"""
|
|
318
|
+
Explicitly set the semantic type for a column, overriding auto-detection.
|
|
319
|
+
This override is respected by all downstream phases.
|
|
260
320
|
"""
|
|
261
321
|
if isinstance(semantic_type, str):
|
|
262
322
|
try:
|
|
@@ -271,36 +331,38 @@ class ProfileConfig:
|
|
|
271
331
|
|
|
272
332
|
def to_dict(self) -> dict:
|
|
273
333
|
return {
|
|
274
|
-
"modality": str(self.modality),
|
|
275
|
-
"target_columns": list(self.target_columns),
|
|
276
|
-
"column_overrides": {k: str(v) for k, v in self.column_overrides.items()},
|
|
277
334
|
"exclude_columns": list(self.exclude_columns),
|
|
278
|
-
"
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
335
|
+
"phase_exclusions": {
|
|
336
|
+
str(phase): list(cols)
|
|
337
|
+
for phase, cols in self.phase_exclusions.items()
|
|
338
|
+
},
|
|
339
|
+
"column_overrides": {
|
|
340
|
+
col: str(sem_type)
|
|
341
|
+
for col, sem_type in self.column_overrides.items()
|
|
342
|
+
},
|
|
343
|
+
"profiling": self.profiling.to_dict(),
|
|
282
344
|
}
|
|
283
345
|
|
|
284
346
|
@classmethod
|
|
285
|
-
def from_dict(cls, data: dict) ->
|
|
347
|
+
def from_dict(cls, data: dict) -> "PipelineConfig":
|
|
286
348
|
return cls(
|
|
287
|
-
|
|
288
|
-
|
|
349
|
+
exclude_columns=list(data.get("exclude_columns", [])),
|
|
350
|
+
phase_exclusions={
|
|
351
|
+
PipelinePhase(phase_str): list(cols)
|
|
352
|
+
for phase_str, cols in data.get("phase_exclusions", {}).items()
|
|
353
|
+
},
|
|
289
354
|
column_overrides={
|
|
290
|
-
|
|
355
|
+
col: SemanticType(sem_str)
|
|
356
|
+
for col, sem_str in data.get("column_overrides", {}).items()
|
|
291
357
|
},
|
|
292
|
-
|
|
293
|
-
compute_correlation=bool(data.get("compute_correlation", False)),
|
|
294
|
-
correlation_target_column=data.get("correlation_target_column"),
|
|
295
|
-
memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
|
|
296
|
-
chunk_size=int(data.get("chunk_size", 100_000)),
|
|
358
|
+
profiling=ProfileConfig.from_dict(data.get("profiling", {})),
|
|
297
359
|
)
|
|
298
360
|
|
|
299
|
-
def to_json(self) -> str:
|
|
300
|
-
return json.dumps(self.to_dict())
|
|
361
|
+
def to_json(self, indent: int = 2) -> str:
|
|
362
|
+
return json.dumps(self.to_dict(), indent=indent)
|
|
301
363
|
|
|
302
364
|
@classmethod
|
|
303
|
-
def from_json(cls, json_str: str) ->
|
|
365
|
+
def from_json(cls, json_str: str) -> "PipelineConfig":
|
|
304
366
|
return cls.from_dict(json.loads(json_str))
|
|
305
367
|
|
|
306
368
|
|
|
@@ -35,7 +35,8 @@ from ._target_profiler import TargetProfiler
|
|
|
35
35
|
from ._correlation_profiler import CorrelationProfiler
|
|
36
36
|
from ._type_detector import TypeDetector
|
|
37
37
|
from .config import (
|
|
38
|
-
|
|
38
|
+
PipelineConfig,
|
|
39
|
+
PipelinePhase,
|
|
39
40
|
ColumnProfile,
|
|
40
41
|
StructuralProfileResult,
|
|
41
42
|
RowMissingnessDistribution,
|
|
@@ -64,14 +65,14 @@ _COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = { #
|
|
|
64
65
|
|
|
65
66
|
class StructuralProfiler:
|
|
66
67
|
|
|
67
|
-
def __init__(self, config:
|
|
68
|
-
self.config = config or
|
|
68
|
+
def __init__(self, config: PipelineConfig | None = None) -> None:
|
|
69
|
+
self.config: PipelineConfig = config or PipelineConfig()
|
|
69
70
|
|
|
70
|
-
if self.config.modality == Modality.Tabular:
|
|
71
|
-
self.modality_profiler: ModalityProfiler = TabularProfiler(
|
|
71
|
+
if self.config.profiling.modality == Modality.Tabular:
|
|
72
|
+
self.modality_profiler: ModalityProfiler = TabularProfiler()
|
|
72
73
|
else:
|
|
73
74
|
raise NotImplementedError(
|
|
74
|
-
f"modality {self.config.modality} not supported yet"
|
|
75
|
+
f"modality {self.config.profiling.modality} not supported yet"
|
|
75
76
|
)
|
|
76
77
|
|
|
77
78
|
# ------------------------------------------------------------------
|
|
@@ -87,7 +88,17 @@ class StructuralProfiler:
|
|
|
87
88
|
|
|
88
89
|
result = StructuralProfileResult()
|
|
89
90
|
|
|
90
|
-
active_cols =
|
|
91
|
+
active_cols = self.config.resolve_active_columns(
|
|
92
|
+
PipelinePhase.Profiling, list(data.columns)
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# Columns soft-excluded for Profiling: skipped but retained in the result.
|
|
96
|
+
hard_set = set(self.config.exclude_columns)
|
|
97
|
+
soft_retained = [
|
|
98
|
+
c for c in data.columns
|
|
99
|
+
if c in set(self.config.phase_exclusions.get(PipelinePhase.Profiling, []))
|
|
100
|
+
and c not in hard_set
|
|
101
|
+
]
|
|
91
102
|
|
|
92
103
|
# ── 1. Modality profiler ─────────────────────────────────────────
|
|
93
104
|
# Replaces default DatasetStats with the real one (row_count, memory,
|
|
@@ -97,7 +108,7 @@ class StructuralProfiler:
|
|
|
97
108
|
# ── 2. Missingness pre-pass ──────────────────────────────────────
|
|
98
109
|
# setdefault creates ColumnProfile entries; subsequent steps mutate
|
|
99
110
|
# the same objects via the same setdefault pattern.
|
|
100
|
-
missingness_result = MissingnessProfiler(
|
|
111
|
+
missingness_result = MissingnessProfiler().profile(
|
|
101
112
|
data, columns=active_cols
|
|
102
113
|
)
|
|
103
114
|
for col_name in missingness_result.analysed_columns:
|
|
@@ -112,7 +123,6 @@ class StructuralProfiler:
|
|
|
112
123
|
df=data,
|
|
113
124
|
cols=active_cols,
|
|
114
125
|
n_rows=data.height,
|
|
115
|
-
overrides=self.config.column_overrides,
|
|
116
126
|
)
|
|
117
127
|
|
|
118
128
|
# ── 4. Type detection ────────────────────────────────────────────
|
|
@@ -153,7 +163,7 @@ class StructuralProfiler:
|
|
|
153
163
|
profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type) # type: ignore[arg-type]
|
|
154
164
|
if profiler_cls is None:
|
|
155
165
|
continue
|
|
156
|
-
profiler = profiler_cls(
|
|
166
|
+
profiler = profiler_cls()
|
|
157
167
|
try:
|
|
158
168
|
batch = profiler.profile(data, columns=cols)
|
|
159
169
|
for col_name in batch.analysed_columns:
|
|
@@ -165,13 +175,13 @@ class StructuralProfiler:
|
|
|
165
175
|
# ── 7. Target columns ────────────────────────────────────────────
|
|
166
176
|
# TargetProfiler produces target-specific analysis stored in
|
|
167
177
|
# result.targets. cp.stats is NOT overwritten — step 6 already set it.
|
|
168
|
-
if self.config.target_columns:
|
|
169
|
-
for target in self.config.target_columns:
|
|
178
|
+
if self.config.profiling.target_columns:
|
|
179
|
+
for target in self.config.profiling.target_columns:
|
|
170
180
|
if target not in data.columns:
|
|
171
181
|
continue
|
|
172
182
|
target_result = TargetProfiler(
|
|
173
183
|
target_column=target,
|
|
174
|
-
config=self.config,
|
|
184
|
+
config=self.config.profiling,
|
|
175
185
|
).profile(data)
|
|
176
186
|
result.targets[target] = target_result
|
|
177
187
|
|
|
@@ -180,7 +190,7 @@ class StructuralProfiler:
|
|
|
180
190
|
cp.is_target = True
|
|
181
191
|
|
|
182
192
|
# ── 8. Correlation ───────────────────────────────────────────────
|
|
183
|
-
if self.config.compute_correlation:
|
|
193
|
+
if self.config.profiling.compute_correlation:
|
|
184
194
|
# Resolve column lists by detected SemanticType (post-override).
|
|
185
195
|
numeric_cols = [
|
|
186
196
|
c
|
|
@@ -198,7 +208,7 @@ class StructuralProfiler:
|
|
|
198
208
|
corr_profiler = CorrelationProfiler(
|
|
199
209
|
numeric_columns=numeric_cols,
|
|
200
210
|
categorical_columns=categorical_cols,
|
|
201
|
-
config=self.config,
|
|
211
|
+
config=self.config.profiling,
|
|
202
212
|
)
|
|
203
213
|
|
|
204
214
|
# 8a. Feature-feature matrices — computed ONCE, target-independent.
|
|
@@ -209,7 +219,7 @@ class StructuralProfiler:
|
|
|
209
219
|
|
|
210
220
|
# 8b. Per-target analysis — matrices are NOT recomputed; each call
|
|
211
221
|
# shallow-copies feature_corr and appends target-specific fields.
|
|
212
|
-
for target in self.config.target_columns:
|
|
222
|
+
for target in self.config.profiling.target_columns:
|
|
213
223
|
if target not in data.columns:
|
|
214
224
|
continue
|
|
215
225
|
result.dataset.target_correlations[target] = (
|
|
@@ -218,6 +228,12 @@ class StructuralProfiler:
|
|
|
218
228
|
)
|
|
219
229
|
)
|
|
220
230
|
|
|
231
|
+
# ── Soft-excluded placeholders ───────────────────────────────────────
|
|
232
|
+
# Columns soft-excluded for Profiling are not profiled but must still
|
|
233
|
+
# appear in the result so downstream phases can reference them.
|
|
234
|
+
for col in soft_retained:
|
|
235
|
+
result.columns.setdefault(col, ColumnProfile(name=col))
|
|
236
|
+
|
|
221
237
|
return result
|
|
222
238
|
|
|
223
239
|
# ------------------------------------------------------------------
|
|
@@ -229,9 +245,8 @@ class StructuralProfiler:
|
|
|
229
245
|
df: pl.DataFrame,
|
|
230
246
|
cols: list[str],
|
|
231
247
|
n_rows: int,
|
|
232
|
-
overrides: dict[str, SemanticType],
|
|
233
248
|
) -> RowMissingnessDistribution:
|
|
234
|
-
from .
|
|
249
|
+
from ._null_detection import (
|
|
235
250
|
_sentinel_eligible,
|
|
236
251
|
_inf_eligible,
|
|
237
252
|
_SENTINEL_STRINGS,
|
|
@@ -246,10 +261,9 @@ class StructuralProfiler:
|
|
|
246
261
|
|
|
247
262
|
for col_name in cols:
|
|
248
263
|
dtype = df[col_name].dtype
|
|
249
|
-
override = overrides.get(col_name)
|
|
250
264
|
null_e = pl.col(col_name).is_null()
|
|
251
265
|
|
|
252
|
-
if _sentinel_eligible(dtype
|
|
266
|
+
if _sentinel_eligible(dtype):
|
|
253
267
|
eff = (
|
|
254
268
|
null_e
|
|
255
269
|
| (pl.col(col_name).str.strip_chars() == "")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -21,9 +21,9 @@ Provides-Extra: dev
|
|
|
21
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
22
22
|
Dynamic: license-file
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# DataForgeML
|
|
25
25
|
|
|
26
|
-
Automated feature engineering and data profiling pipeline library for
|
|
26
|
+
Automated feature engineering and data profiling pipeline library for datasets.
|
|
27
27
|
|
|
28
28
|
## Installation
|
|
29
29
|
|
|
@@ -22,6 +22,7 @@ src/dataforge_ml/profiling/_datetime_config.py
|
|
|
22
22
|
src/dataforge_ml/profiling/_datetime_profiler.py
|
|
23
23
|
src/dataforge_ml/profiling/_missingness_config.py
|
|
24
24
|
src/dataforge_ml/profiling/_missingness_profiler.py
|
|
25
|
+
src/dataforge_ml/profiling/_null_detection.py
|
|
25
26
|
src/dataforge_ml/profiling/_numeric_config.py
|
|
26
27
|
src/dataforge_ml/profiling/_numeric_profiler.py
|
|
27
28
|
src/dataforge_ml/profiling/_tabular.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_categorical_config.py
RENAMED
|
File without changes
|
{dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_config.py
RENAMED
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.8.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|