dataforge-ml 0.5.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/PKG-INFO +1 -1
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/pyproject.toml +1 -1
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_profiler.py +1 -13
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical.py +1 -2
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_config.py +134 -35
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_profiler.py +174 -3
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_profiler.py +2 -1
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_profiler.py +73 -118
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_profiler.py +4 -2
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_type_detector.py +78 -102
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/structural.py +1 -1
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/LICENSE +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/README.md +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/setup.cfg +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -84,19 +84,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
84
84
|
if override is not None:
|
|
85
85
|
return False
|
|
86
86
|
|
|
87
|
-
|
|
88
|
-
if series.dtype == pl.Boolean:
|
|
89
|
-
return True
|
|
90
|
-
|
|
91
|
-
# Integer {0, 1} column — check after dropping nulls
|
|
92
|
-
if series.dtype in _INT_DTYPES:
|
|
93
|
-
clean = series.drop_nulls()
|
|
94
|
-
if clean.len() == 0:
|
|
95
|
-
return False
|
|
96
|
-
unique_vals = set(clean.unique().to_list())
|
|
97
|
-
return unique_vals <= {0, 1}
|
|
98
|
-
|
|
99
|
-
return False
|
|
87
|
+
return True
|
|
100
88
|
|
|
101
89
|
# ------------------------------------------------------------------
|
|
102
90
|
# Orchestration
|
|
@@ -49,7 +49,6 @@ from .config import (
|
|
|
49
49
|
ProfileConfig,
|
|
50
50
|
SemanticType,
|
|
51
51
|
)
|
|
52
|
-
from ..models._data_types import _CAT_DTYPES
|
|
53
52
|
|
|
54
53
|
# ---------------------------------------------------------------------------
|
|
55
54
|
# Module-level thresholds (documented so callers can see what drives flags)
|
|
@@ -115,7 +114,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
115
114
|
if override is not None:
|
|
116
115
|
return False
|
|
117
116
|
|
|
118
|
-
return
|
|
117
|
+
return True
|
|
119
118
|
|
|
120
119
|
def _run(
|
|
121
120
|
self,
|
|
@@ -7,41 +7,41 @@ numeric/categorical column lists that are already resolved upstream).
|
|
|
7
7
|
|
|
8
8
|
Design notes
|
|
9
9
|
------------
|
|
10
|
-
- Pearson
|
|
11
|
-
-
|
|
12
|
-
|
|
13
|
-
- Near-redundancy
|
|
14
|
-
|
|
15
|
-
- Feature–target
|
|
16
|
-
|
|
17
|
-
- Mutual information: MI for all features vs target (classif or regression).
|
|
18
|
-
Captures non-linear dependencies correlation misses.
|
|
10
|
+
- Pearson / Spearman : linear / monotonic relationships between numeric columns.
|
|
11
|
+
- Cramér's V : association between categorical column pairs [0, 1].
|
|
12
|
+
- Eta-squared : numeric-categorical association via ANOVA [0, 1].
|
|
13
|
+
- Near-redundancy : Pearson/Spearman |r| > 0.95, Cramér's V > 0.80,
|
|
14
|
+
or eta² > 0.50 flagged — near-identical signal.
|
|
15
|
+
- Feature–target : Pearson (numeric target), ANOVA/eta² (categorical target).
|
|
16
|
+
- Mutual information : MI for all features vs target (classif or regression).
|
|
19
17
|
"""
|
|
18
|
+
|
|
20
19
|
from __future__ import annotations
|
|
21
20
|
|
|
22
21
|
from dataclasses import dataclass, field
|
|
23
22
|
from enum import StrEnum
|
|
24
23
|
from typing import Optional
|
|
25
24
|
|
|
26
|
-
|
|
27
25
|
# ---------------------------------------------------------------------------
|
|
28
26
|
# Enums
|
|
29
27
|
# ---------------------------------------------------------------------------
|
|
30
28
|
|
|
29
|
+
|
|
31
30
|
class CorrelationMethod(StrEnum):
|
|
32
|
-
Pearson
|
|
31
|
+
Pearson = "pearson"
|
|
33
32
|
Spearman = "spearman"
|
|
34
33
|
|
|
35
34
|
|
|
36
35
|
class TargetType(StrEnum):
|
|
37
|
-
Numeric
|
|
38
|
-
Categorical
|
|
36
|
+
Numeric = "numeric" # numeric target → Pearson + MI regression
|
|
37
|
+
Categorical = "categorical" # categorical target → ANOVA/eta² + MI classif
|
|
39
38
|
|
|
40
39
|
|
|
41
40
|
# ---------------------------------------------------------------------------
|
|
42
41
|
# Pairwise correlation result
|
|
43
42
|
# ---------------------------------------------------------------------------
|
|
44
43
|
|
|
44
|
+
|
|
45
45
|
@dataclass
|
|
46
46
|
class CorrelationPair:
|
|
47
47
|
"""
|
|
@@ -62,14 +62,74 @@ class CorrelationPair:
|
|
|
62
62
|
|
|
63
63
|
col_a: str
|
|
64
64
|
col_b: str
|
|
65
|
-
pearson_r:
|
|
65
|
+
pearson_r: Optional[float] = None
|
|
66
66
|
spearman_r: Optional[float] = None
|
|
67
67
|
near_redundant: bool = False
|
|
68
68
|
|
|
69
69
|
def to_dict(self) -> dict:
|
|
70
70
|
return {
|
|
71
|
-
"col_a": self.col_a,
|
|
72
|
-
"
|
|
71
|
+
"col_a": self.col_a,
|
|
72
|
+
"col_b": self.col_b,
|
|
73
|
+
"pearson_r": self.pearson_r,
|
|
74
|
+
"spearman_r": self.spearman_r,
|
|
75
|
+
"near_redundant": self.near_redundant,
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
|
|
80
|
+
class CramerVPair:
|
|
81
|
+
"""
|
|
82
|
+
Cramér's V association between two categorical columns.
|
|
83
|
+
|
|
84
|
+
Attributes
|
|
85
|
+
----------
|
|
86
|
+
col_a, col_b : str
|
|
87
|
+
cramer_v : float | None
|
|
88
|
+
Cramér's V in [0, 1]. None when computation fails or sample too small.
|
|
89
|
+
near_redundant : bool
|
|
90
|
+
True when cramer_v exceeds the near-redundancy threshold (default 0.80).
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
col_a: str = ""
|
|
94
|
+
col_b: str = ""
|
|
95
|
+
cramer_v: Optional[float] = None
|
|
96
|
+
near_redundant: bool = False
|
|
97
|
+
|
|
98
|
+
def to_dict(self) -> dict:
|
|
99
|
+
return {
|
|
100
|
+
"col_a": self.col_a,
|
|
101
|
+
"col_b": self.col_b,
|
|
102
|
+
"cramer_v": self.cramer_v,
|
|
103
|
+
"near_redundant": self.near_redundant,
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass
|
|
108
|
+
class EtaSquaredPair:
|
|
109
|
+
"""
|
|
110
|
+
Eta-squared (η²) association between a numeric and a categorical column.
|
|
111
|
+
|
|
112
|
+
Attributes
|
|
113
|
+
----------
|
|
114
|
+
numeric_col : str
|
|
115
|
+
categorical_col : str
|
|
116
|
+
eta_squared : float | None
|
|
117
|
+
Effect size in [0, 1]. None when computation fails.
|
|
118
|
+
Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
|
|
119
|
+
near_redundant : bool
|
|
120
|
+
True when eta_squared exceeds the near-redundancy threshold (default 0.50).
|
|
121
|
+
"""
|
|
122
|
+
|
|
123
|
+
numeric_col: str = ""
|
|
124
|
+
categorical_col: str = ""
|
|
125
|
+
eta_squared: Optional[float] = None
|
|
126
|
+
near_redundant: bool = False
|
|
127
|
+
|
|
128
|
+
def to_dict(self) -> dict:
|
|
129
|
+
return {
|
|
130
|
+
"numeric_col": self.numeric_col,
|
|
131
|
+
"categorical_col": self.categorical_col,
|
|
132
|
+
"eta_squared": self.eta_squared,
|
|
73
133
|
"near_redundant": self.near_redundant,
|
|
74
134
|
}
|
|
75
135
|
|
|
@@ -78,6 +138,7 @@ class CorrelationPair:
|
|
|
78
138
|
# Feature–target entries
|
|
79
139
|
# ---------------------------------------------------------------------------
|
|
80
140
|
|
|
141
|
+
|
|
81
142
|
@dataclass
|
|
82
143
|
class NumericTargetCorrelation:
|
|
83
144
|
"""
|
|
@@ -88,7 +149,8 @@ class NumericTargetCorrelation:
|
|
|
88
149
|
feature : str
|
|
89
150
|
pearson_r : float | None
|
|
90
151
|
"""
|
|
91
|
-
|
|
152
|
+
|
|
153
|
+
feature: str
|
|
92
154
|
pearson_r: Optional[float] = None
|
|
93
155
|
|
|
94
156
|
def to_dict(self) -> dict:
|
|
@@ -113,15 +175,18 @@ class CategoricalTargetCorrelation:
|
|
|
113
175
|
Effect size: SS_between / SS_total. Ranges [0, 1].
|
|
114
176
|
Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
|
|
115
177
|
"""
|
|
116
|
-
|
|
178
|
+
|
|
179
|
+
feature: str
|
|
117
180
|
f_statistic: Optional[float] = None
|
|
118
|
-
p_value:
|
|
181
|
+
p_value: Optional[float] = None
|
|
119
182
|
eta_squared: Optional[float] = None
|
|
120
183
|
|
|
121
184
|
def to_dict(self) -> dict:
|
|
122
185
|
return {
|
|
123
|
-
"feature": self.feature,
|
|
124
|
-
"
|
|
186
|
+
"feature": self.feature,
|
|
187
|
+
"f_statistic": self.f_statistic,
|
|
188
|
+
"p_value": self.p_value,
|
|
189
|
+
"eta_squared": self.eta_squared,
|
|
125
190
|
}
|
|
126
191
|
|
|
127
192
|
|
|
@@ -129,6 +194,7 @@ class CategoricalTargetCorrelation:
|
|
|
129
194
|
# Mutual information
|
|
130
195
|
# ---------------------------------------------------------------------------
|
|
131
196
|
|
|
197
|
+
|
|
132
198
|
@dataclass
|
|
133
199
|
class MutualInformationEntry:
|
|
134
200
|
"""
|
|
@@ -143,9 +209,10 @@ class MutualInformationEntry:
|
|
|
143
209
|
rank : int
|
|
144
210
|
1 = highest MI (most informative).
|
|
145
211
|
"""
|
|
146
|
-
|
|
212
|
+
|
|
213
|
+
feature: str
|
|
147
214
|
mi_score: float = 0.0
|
|
148
|
-
rank:
|
|
215
|
+
rank: int = 0
|
|
149
216
|
|
|
150
217
|
def to_dict(self) -> dict:
|
|
151
218
|
return {"feature": self.feature, "mi_score": self.mi_score, "rank": self.rank}
|
|
@@ -155,6 +222,7 @@ class MutualInformationEntry:
|
|
|
155
222
|
# Near-redundancy summary
|
|
156
223
|
# ---------------------------------------------------------------------------
|
|
157
224
|
|
|
225
|
+
|
|
158
226
|
@dataclass
|
|
159
227
|
class NearRedundancyGroup:
|
|
160
228
|
"""
|
|
@@ -164,17 +232,22 @@ class NearRedundancyGroup:
|
|
|
164
232
|
The suggested_drop list contains every column except the first
|
|
165
233
|
alphabetically — a simple, deterministic heuristic.
|
|
166
234
|
"""
|
|
167
|
-
|
|
235
|
+
|
|
236
|
+
columns: list[str] = field(default_factory=list)
|
|
168
237
|
suggested_drop: list[str] = field(default_factory=list)
|
|
169
238
|
|
|
170
239
|
def to_dict(self) -> dict:
|
|
171
|
-
return {
|
|
240
|
+
return {
|
|
241
|
+
"columns": list(self.columns),
|
|
242
|
+
"suggested_drop": list(self.suggested_drop),
|
|
243
|
+
}
|
|
172
244
|
|
|
173
245
|
|
|
174
246
|
# ---------------------------------------------------------------------------
|
|
175
247
|
# Top-level result
|
|
176
248
|
# ---------------------------------------------------------------------------
|
|
177
249
|
|
|
250
|
+
|
|
178
251
|
@dataclass
|
|
179
252
|
class CorrelationProfileResult:
|
|
180
253
|
"""
|
|
@@ -211,23 +284,34 @@ class CorrelationProfileResult:
|
|
|
211
284
|
|
|
212
285
|
# Column scope
|
|
213
286
|
analysed_numeric_columns: list[str] = field(default_factory=list)
|
|
287
|
+
analysed_categorical_columns: list[str] = field(default_factory=list)
|
|
214
288
|
|
|
215
289
|
# Pairwise matrices
|
|
216
|
-
pearson_matrix:
|
|
290
|
+
pearson_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
|
|
217
291
|
spearman_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
|
|
218
292
|
|
|
219
|
-
# Pairwise summaries
|
|
220
|
-
pairwise:
|
|
293
|
+
# Pairwise summaries — numeric ↔ numeric
|
|
294
|
+
pairwise: list[CorrelationPair] = field(default_factory=list)
|
|
221
295
|
near_redundant_pairs: list[CorrelationPair] = field(default_factory=list)
|
|
222
296
|
near_redundancy_groups: list[NearRedundancyGroup] = field(default_factory=list)
|
|
223
297
|
|
|
298
|
+
# Pairwise summaries — categorical ↔ categorical (Cramér's V)
|
|
299
|
+
cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
|
|
300
|
+
near_redundant_cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
|
|
301
|
+
|
|
302
|
+
# Pairwise summaries — numeric ↔ categorical (eta-squared)
|
|
303
|
+
eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
|
|
304
|
+
near_redundant_eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
|
|
305
|
+
|
|
224
306
|
# Target info
|
|
225
|
-
target_column: Optional[str]
|
|
226
|
-
target_type:
|
|
307
|
+
target_column: Optional[str] = None
|
|
308
|
+
target_type: Optional[TargetType] = None
|
|
227
309
|
|
|
228
310
|
# Feature–target correlations (top-10 each)
|
|
229
|
-
feature_target_numeric:
|
|
230
|
-
feature_target_categorical:
|
|
311
|
+
feature_target_numeric: list[NumericTargetCorrelation] = field(default_factory=list)
|
|
312
|
+
feature_target_categorical: list[CategoricalTargetCorrelation] = field(
|
|
313
|
+
default_factory=list
|
|
314
|
+
)
|
|
231
315
|
|
|
232
316
|
# Mutual information (all features, ranked)
|
|
233
317
|
mutual_information: list[MutualInformationEntry] = field(default_factory=list)
|
|
@@ -249,14 +333,29 @@ class CorrelationProfileResult:
|
|
|
249
333
|
def to_dict(self) -> dict:
|
|
250
334
|
return {
|
|
251
335
|
"analysed_numeric_columns": list(self.analysed_numeric_columns),
|
|
336
|
+
"analysed_categorical_columns": list(self.analysed_categorical_columns),
|
|
252
337
|
"pearson_matrix": {k: dict(v) for k, v in self.pearson_matrix.items()},
|
|
253
338
|
"spearman_matrix": {k: dict(v) for k, v in self.spearman_matrix.items()},
|
|
254
339
|
"pairwise": [p.to_dict() for p in self.pairwise],
|
|
255
340
|
"near_redundant_pairs": [p.to_dict() for p in self.near_redundant_pairs],
|
|
256
|
-
"near_redundancy_groups": [
|
|
341
|
+
"near_redundancy_groups": [
|
|
342
|
+
g.to_dict() for g in self.near_redundancy_groups
|
|
343
|
+
],
|
|
344
|
+
"cramer_v_pairs": [p.to_dict() for p in self.cramer_v_pairs],
|
|
345
|
+
"near_redundant_cramer_v_pairs": [
|
|
346
|
+
p.to_dict() for p in self.near_redundant_cramer_v_pairs
|
|
347
|
+
],
|
|
348
|
+
"eta_squared_pairs": [p.to_dict() for p in self.eta_squared_pairs],
|
|
349
|
+
"near_redundant_eta_squared_pairs": [
|
|
350
|
+
p.to_dict() for p in self.near_redundant_eta_squared_pairs
|
|
351
|
+
],
|
|
257
352
|
"target_column": self.target_column,
|
|
258
353
|
"target_type": str(self.target_type) if self.target_type else None,
|
|
259
|
-
"feature_target_numeric": [
|
|
260
|
-
|
|
354
|
+
"feature_target_numeric": [
|
|
355
|
+
f.to_dict() for f in self.feature_target_numeric
|
|
356
|
+
],
|
|
357
|
+
"feature_target_categorical": [
|
|
358
|
+
f.to_dict() for f in self.feature_target_categorical
|
|
359
|
+
],
|
|
261
360
|
"mutual_information": [m.to_dict() for m in self.mutual_information],
|
|
262
361
|
}
|
{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
@@ -47,6 +47,8 @@ from ._correlation_config import (
|
|
|
47
47
|
CategoricalTargetCorrelation,
|
|
48
48
|
CorrelationPair,
|
|
49
49
|
CorrelationProfileResult,
|
|
50
|
+
CramerVPair,
|
|
51
|
+
EtaSquaredPair,
|
|
50
52
|
MutualInformationEntry,
|
|
51
53
|
NearRedundancyGroup,
|
|
52
54
|
NumericTargetCorrelation,
|
|
@@ -55,6 +57,8 @@ from ._correlation_config import (
|
|
|
55
57
|
from ..models._data_types import _NUMERIC_DTYPES, _INT_DTYPES
|
|
56
58
|
|
|
57
59
|
_NEAR_REDUNDANT_THRESHOLD: float = 0.95
|
|
60
|
+
_NEAR_REDUNDANT_CRAMER_V_THRESHOLD: float = 0.80
|
|
61
|
+
_NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD: float = 0.50
|
|
58
62
|
_TOP_N_FEATURE_TARGET: int = 10
|
|
59
63
|
_MI_N_NEIGHBORS: int = 3
|
|
60
64
|
_MI_MIN_ROWS: int = 10 # min complete-case rows for a meaningful k-NN MI estimate
|
|
@@ -142,13 +146,14 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
142
146
|
self,
|
|
143
147
|
df: pl.DataFrame,
|
|
144
148
|
numeric_cols: list[str],
|
|
149
|
+
categorical_cols: Optional[list[str]] = None,
|
|
145
150
|
) -> CorrelationProfileResult:
|
|
146
151
|
"""
|
|
147
152
|
Compute pairwise feature-feature correlation matrices.
|
|
148
153
|
|
|
149
|
-
Pearson + Spearman
|
|
150
|
-
All target-specific fields
|
|
151
|
-
|
|
154
|
+
Pearson + Spearman for numeric pairs, Cramér's V for categorical pairs,
|
|
155
|
+
eta-squared for numeric-categorical pairs. All target-specific fields
|
|
156
|
+
are left at their defaults. Call profile_target() for target analysis.
|
|
152
157
|
"""
|
|
153
158
|
result = CorrelationProfileResult()
|
|
154
159
|
|
|
@@ -159,6 +164,9 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
159
164
|
]
|
|
160
165
|
result.analysed_numeric_columns = resolved_numeric
|
|
161
166
|
|
|
167
|
+
resolved_categorical = [c for c in (categorical_cols or []) if c in df.columns]
|
|
168
|
+
result.analysed_categorical_columns = resolved_categorical
|
|
169
|
+
|
|
162
170
|
if len(resolved_numeric) >= 2:
|
|
163
171
|
pearson_mat, spearman_mat = self._compute_matrices(df, resolved_numeric)
|
|
164
172
|
result.pearson_matrix = pearson_mat
|
|
@@ -171,6 +179,22 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
171
179
|
result.near_redundant_pairs
|
|
172
180
|
)
|
|
173
181
|
|
|
182
|
+
if len(resolved_categorical) >= 2:
|
|
183
|
+
result.cramer_v_pairs = self._compute_cramer_v_pairs(
|
|
184
|
+
df, resolved_categorical, _NEAR_REDUNDANT_CRAMER_V_THRESHOLD
|
|
185
|
+
)
|
|
186
|
+
result.near_redundant_cramer_v_pairs = [
|
|
187
|
+
p for p in result.cramer_v_pairs if p.near_redundant
|
|
188
|
+
]
|
|
189
|
+
|
|
190
|
+
if resolved_numeric and resolved_categorical:
|
|
191
|
+
result.eta_squared_pairs = self._compute_eta_squared_pairs(
|
|
192
|
+
df, resolved_numeric, resolved_categorical, _NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD
|
|
193
|
+
)
|
|
194
|
+
result.near_redundant_eta_squared_pairs = [
|
|
195
|
+
p for p in result.eta_squared_pairs if p.near_redundant
|
|
196
|
+
]
|
|
197
|
+
|
|
174
198
|
return result
|
|
175
199
|
|
|
176
200
|
def profile_target(
|
|
@@ -316,6 +340,153 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
316
340
|
for members in uf.groups()
|
|
317
341
|
]
|
|
318
342
|
|
|
343
|
+
# ------------------------------------------------------------------
|
|
344
|
+
# Step 3b: Cramér's V — categorical ↔ categorical
|
|
345
|
+
# ------------------------------------------------------------------
|
|
346
|
+
|
|
347
|
+
@staticmethod
|
|
348
|
+
def _compute_cramer_v_pairs(
|
|
349
|
+
df: pl.DataFrame,
|
|
350
|
+
cat_cols: list[str],
|
|
351
|
+
threshold: float,
|
|
352
|
+
) -> list[CramerVPair]:
|
|
353
|
+
try:
|
|
354
|
+
from scipy.stats import chi2_contingency
|
|
355
|
+
except ImportError:
|
|
356
|
+
warnings.warn(
|
|
357
|
+
"scipy is required for Cramér's V. Install: pip install scipy",
|
|
358
|
+
stacklevel=3,
|
|
359
|
+
)
|
|
360
|
+
return []
|
|
361
|
+
|
|
362
|
+
import numpy as np
|
|
363
|
+
|
|
364
|
+
pairs: list[CramerVPair] = []
|
|
365
|
+
for col_a, col_b in itertools.combinations(cat_cols, 2):
|
|
366
|
+
pair_df = (
|
|
367
|
+
df.select([
|
|
368
|
+
pl.col(col_a).cast(pl.Utf8, strict=False),
|
|
369
|
+
pl.col(col_b).cast(pl.Utf8, strict=False),
|
|
370
|
+
])
|
|
371
|
+
.drop_nulls()
|
|
372
|
+
)
|
|
373
|
+
n = pair_df.height
|
|
374
|
+
if n < 5:
|
|
375
|
+
pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
|
|
376
|
+
continue
|
|
377
|
+
|
|
378
|
+
counts = pair_df.group_by([col_a, col_b]).agg(pl.len().alias("count"))
|
|
379
|
+
a_unique = sorted(counts[col_a].unique().to_list())
|
|
380
|
+
b_unique = sorted(counts[col_b].unique().to_list())
|
|
381
|
+
if len(a_unique) < 2 or len(b_unique) < 2:
|
|
382
|
+
pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
|
|
383
|
+
continue
|
|
384
|
+
|
|
385
|
+
a_idx = {v: i for i, v in enumerate(a_unique)}
|
|
386
|
+
b_idx = {v: i for i, v in enumerate(b_unique)}
|
|
387
|
+
ct = np.zeros((len(a_unique), len(b_unique)), dtype=int)
|
|
388
|
+
for a_val, b_val, cnt in zip(
|
|
389
|
+
counts[col_a].to_list(),
|
|
390
|
+
counts[col_b].to_list(),
|
|
391
|
+
counts["count"].to_list(),
|
|
392
|
+
):
|
|
393
|
+
ct[a_idx[a_val], b_idx[b_val]] = cnt
|
|
394
|
+
|
|
395
|
+
try:
|
|
396
|
+
chi2, _, _, _ = chi2_contingency(ct)
|
|
397
|
+
r, c = ct.shape
|
|
398
|
+
phi2 = chi2 / n
|
|
399
|
+
# Bergsma & Wicher (2013) bias correction
|
|
400
|
+
phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
|
|
401
|
+
r_corr = r - (r - 1) ** 2 / (n - 1)
|
|
402
|
+
c_corr = c - (c - 1) ** 2 / (n - 1)
|
|
403
|
+
denom = min(r_corr - 1, c_corr - 1)
|
|
404
|
+
if denom <= 0:
|
|
405
|
+
# Near-saturated contingency table (n_unique ≈ n_rows):
|
|
406
|
+
# bias correction collapses denominator; skip the pair.
|
|
407
|
+
pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
|
|
408
|
+
continue
|
|
409
|
+
v = float(np.sqrt(phi2_corr / denom))
|
|
410
|
+
v = max(0.0, min(1.0, v))
|
|
411
|
+
except Exception as exc:
|
|
412
|
+
warnings.warn(
|
|
413
|
+
f"Cramér's V failed for ({col_a}, {col_b}): {exc}", stacklevel=3
|
|
414
|
+
)
|
|
415
|
+
pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
|
|
416
|
+
continue
|
|
417
|
+
|
|
418
|
+
pairs.append(CramerVPair(
|
|
419
|
+
col_a=col_a, col_b=col_b,
|
|
420
|
+
cramer_v=v,
|
|
421
|
+
near_redundant=v > threshold,
|
|
422
|
+
))
|
|
423
|
+
|
|
424
|
+
return pairs
|
|
425
|
+
|
|
426
|
+
# ------------------------------------------------------------------
|
|
427
|
+
# Step 3c: Eta-squared — numeric ↔ categorical
|
|
428
|
+
# ------------------------------------------------------------------
|
|
429
|
+
|
|
430
|
+
@staticmethod
|
|
431
|
+
def _compute_eta_squared_pairs(
|
|
432
|
+
df: pl.DataFrame,
|
|
433
|
+
numeric_cols: list[str],
|
|
434
|
+
cat_cols: list[str],
|
|
435
|
+
threshold: float,
|
|
436
|
+
) -> list[EtaSquaredPair]:
|
|
437
|
+
try:
|
|
438
|
+
from scipy.stats import f_oneway
|
|
439
|
+
except ImportError:
|
|
440
|
+
warnings.warn(
|
|
441
|
+
"scipy is required for eta-squared. Install: pip install scipy",
|
|
442
|
+
stacklevel=3,
|
|
443
|
+
)
|
|
444
|
+
return []
|
|
445
|
+
|
|
446
|
+
pairs: list[EtaSquaredPair] = []
|
|
447
|
+
for num_col in numeric_cols:
|
|
448
|
+
feat = df[num_col].cast(pl.Float64)
|
|
449
|
+
valid_feat = feat.drop_nulls()
|
|
450
|
+
if valid_feat.len() == 0:
|
|
451
|
+
continue
|
|
452
|
+
grand_mean = float(valid_feat.mean()) # type: ignore[arg-type]
|
|
453
|
+
ss_total = float(((valid_feat - grand_mean) ** 2).sum() or 0.0)
|
|
454
|
+
|
|
455
|
+
for cat_col in cat_cols:
|
|
456
|
+
target = df[cat_col]
|
|
457
|
+
categories = target.drop_nulls().unique().to_list()
|
|
458
|
+
groups = [
|
|
459
|
+
feat.filter(target == cat).drop_nulls().to_numpy()
|
|
460
|
+
for cat in categories
|
|
461
|
+
]
|
|
462
|
+
non_empty = [g for g in groups if len(g) > 0]
|
|
463
|
+
if len(non_empty) < 2:
|
|
464
|
+
pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
|
|
465
|
+
continue
|
|
466
|
+
try:
|
|
467
|
+
f_oneway(*non_empty)
|
|
468
|
+
ss_between = sum(
|
|
469
|
+
len(g) * (float(g.mean()) - grand_mean) ** 2
|
|
470
|
+
for g in non_empty
|
|
471
|
+
)
|
|
472
|
+
eta_sq = ss_between / ss_total if ss_total > 0 else 0.0
|
|
473
|
+
eta_sq = max(0.0, min(1.0, eta_sq))
|
|
474
|
+
except Exception as exc:
|
|
475
|
+
warnings.warn(
|
|
476
|
+
f"Eta-squared failed for ({num_col}, {cat_col}): {exc}",
|
|
477
|
+
stacklevel=3,
|
|
478
|
+
)
|
|
479
|
+
pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
|
|
480
|
+
continue
|
|
481
|
+
|
|
482
|
+
pairs.append(EtaSquaredPair(
|
|
483
|
+
numeric_col=num_col, categorical_col=cat_col,
|
|
484
|
+
eta_squared=eta_sq,
|
|
485
|
+
near_redundant=eta_sq > threshold,
|
|
486
|
+
))
|
|
487
|
+
|
|
488
|
+
return pairs
|
|
489
|
+
|
|
319
490
|
# ------------------------------------------------------------------
|
|
320
491
|
# Step 5a: Feature–target Pearson (unchanged)
|
|
321
492
|
# ------------------------------------------------------------------
|
{dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
@@ -207,7 +207,8 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
207
207
|
profile.effective_null_ratio = eff_count / n_rows if n_rows else 0.0
|
|
208
208
|
|
|
209
209
|
r = profile.effective_null_ratio
|
|
210
|
-
|
|
210
|
+
|
|
211
|
+
if r < _SEVERITY_MINOR and r != 0:
|
|
211
212
|
profile.severity = MissingSeverity.Minor
|
|
212
213
|
elif r < _SEVERITY_MODERATE:
|
|
213
214
|
profile.severity = MissingSeverity.Moderate
|
|
@@ -50,7 +50,6 @@ from ._numeric_config import (
|
|
|
50
50
|
NumericTopValueEntry,
|
|
51
51
|
HistogramBin,
|
|
52
52
|
)
|
|
53
|
-
from ..models._data_types import _NUMERIC_DTYPES
|
|
54
53
|
|
|
55
54
|
# ---------------------------------------------------------------------------
|
|
56
55
|
# Thresholds (documented so callers can see what drives labels / flags)
|
|
@@ -119,7 +118,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
119
118
|
if override is not None:
|
|
120
119
|
return False
|
|
121
120
|
|
|
122
|
-
return
|
|
121
|
+
return True
|
|
123
122
|
|
|
124
123
|
def _run(
|
|
125
124
|
self,
|
|
@@ -127,9 +126,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
127
126
|
columns: list[str],
|
|
128
127
|
) -> NumericProfileResult:
|
|
129
128
|
result = NumericProfileResult()
|
|
130
|
-
|
|
131
129
|
n_rows = df.height
|
|
132
|
-
|
|
130
|
+
|
|
133
131
|
available = [
|
|
134
132
|
c
|
|
135
133
|
for c in self._resolve_columns(df.columns, columns)
|
|
@@ -137,15 +135,78 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
137
135
|
]
|
|
138
136
|
result.analysed_columns = available
|
|
139
137
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
138
|
+
if not available:
|
|
139
|
+
return result
|
|
140
|
+
|
|
141
|
+
# One df.select([...]) for all scalar stats across all columns so
|
|
142
|
+
# Polars can parallelise expression evaluation rather than running
|
|
143
|
+
# independent query plans per column.
|
|
144
|
+
exprs: list[pl.Expr] = []
|
|
145
|
+
for col in available:
|
|
146
|
+
c = pl.col(col).cast(pl.Float64, strict=False)
|
|
147
|
+
exprs.append(c.mean().alias(f"{col}__mean"))
|
|
148
|
+
exprs.append(c.median().alias(f"{col}__median"))
|
|
149
|
+
exprs.append(c.min().alias(f"{col}__min"))
|
|
150
|
+
exprs.append(c.max().alias(f"{col}__max"))
|
|
151
|
+
exprs.append(c.std(ddof=1).alias(f"{col}__std"))
|
|
152
|
+
for q in _QUANTILE_LEVELS:
|
|
153
|
+
exprs.append(
|
|
154
|
+
c.quantile(q, interpolation="linear").alias(f"{col}__q{q}")
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
batch = df.select(exprs).row(0, named=True)
|
|
158
|
+
|
|
159
|
+
for col in available:
|
|
160
|
+
series = df[col]
|
|
161
|
+
f64 = series.cast(pl.Float64, strict=False)
|
|
162
|
+
clean = f64.drop_nulls()
|
|
163
|
+
profile = NumericStats()
|
|
164
|
+
|
|
165
|
+
if clean.len() == 0:
|
|
166
|
+
result.columns[col] = profile
|
|
167
|
+
continue
|
|
168
|
+
|
|
169
|
+
# Central tendency
|
|
170
|
+
mean = float(batch[f"{col}__mean"])
|
|
171
|
+
median = float(batch[f"{col}__median"])
|
|
172
|
+
profile.mean = mean
|
|
173
|
+
profile.median = median
|
|
174
|
+
if median == 0.0:
|
|
175
|
+
profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
|
|
176
|
+
else:
|
|
177
|
+
profile.mean_median_ratio = mean / median
|
|
178
|
+
|
|
179
|
+
# Range
|
|
180
|
+
profile.min = float(batch[f"{col}__min"])
|
|
181
|
+
profile.max = float(batch[f"{col}__max"])
|
|
182
|
+
|
|
183
|
+
# Spread — Polars returns null for std with ddof=1 on a single row
|
|
184
|
+
std_val = batch[f"{col}__std"]
|
|
185
|
+
profile.std = float(std_val) if std_val is not None else 0.0
|
|
186
|
+
profile.variance = profile.std ** 2
|
|
187
|
+
|
|
188
|
+
# Percentiles
|
|
189
|
+
q_vals = [batch[f"{col}__q{q}"] for q in _QUANTILE_LEVELS]
|
|
190
|
+
profile.percentiles = PercentileSnapshot(
|
|
191
|
+
p1=q_vals[0], p5=q_vals[1], p25=q_vals[2], p50=q_vals[3],
|
|
192
|
+
p75=q_vals[4], p95=q_vals[5], p99=q_vals[6],
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# Frequency / distribution stays per-column (returns a frame, not a scalar)
|
|
196
|
+
self._compute_frequency_and_distribution(series, clean, profile, n_rows)
|
|
197
|
+
|
|
198
|
+
# Shape stays per-column (delegates to scipy on a numpy array)
|
|
199
|
+
self._compute_shape(clean, profile)
|
|
200
|
+
|
|
201
|
+
self._check_scale_anomaly(profile)
|
|
202
|
+
|
|
203
|
+
result.columns[col] = profile
|
|
144
204
|
|
|
145
205
|
return result
|
|
146
206
|
|
|
147
207
|
# ------------------------------------------------------------------
|
|
148
|
-
# Per-column
|
|
208
|
+
# Per-column helpers (frequency/distribution and shape only —
|
|
209
|
+
# scalar stats are now batched in _run above)
|
|
149
210
|
# ------------------------------------------------------------------
|
|
150
211
|
|
|
151
212
|
@staticmethod
|
|
@@ -196,7 +257,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
196
257
|
# --- 20-Bin Histogram Distribution (Continuous) ---
|
|
197
258
|
import numpy as np
|
|
198
259
|
|
|
199
|
-
counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins=
|
|
260
|
+
counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
|
|
200
261
|
profile.histogram = [
|
|
201
262
|
HistogramBin(
|
|
202
263
|
lower_bound=float(bin_edges[i]),
|
|
@@ -207,73 +268,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
207
268
|
for i in range(len(counts))
|
|
208
269
|
]
|
|
209
270
|
|
|
210
|
-
def _profile_column(
|
|
211
|
-
self,
|
|
212
|
-
series: pl.Series,
|
|
213
|
-
n_rows: int,
|
|
214
|
-
) -> NumericStats:
|
|
215
|
-
profile = NumericStats()
|
|
216
|
-
|
|
217
|
-
f64 = series.cast(pl.Float64)
|
|
218
|
-
clean = f64.drop_nulls()
|
|
219
|
-
|
|
220
|
-
if clean.len() == 0:
|
|
221
|
-
return profile
|
|
222
|
-
|
|
223
|
-
self._compute_central_tendency(clean, profile)
|
|
224
|
-
self._compute_range(clean, profile)
|
|
225
|
-
self._compute_frequency_and_distribution(series, clean, profile, n_rows)
|
|
226
|
-
self._compute_percentiles(clean, profile)
|
|
227
|
-
self._compute_spread(clean, profile)
|
|
228
|
-
self._compute_shape(clean, profile)
|
|
229
|
-
self._check_scale_anomaly(profile)
|
|
230
|
-
|
|
231
|
-
return profile
|
|
232
|
-
|
|
233
271
|
# ------------------------------------------------------------------
|
|
234
|
-
# Step
|
|
235
|
-
# ------------------------------------------------------------------
|
|
236
|
-
|
|
237
|
-
@staticmethod
|
|
238
|
-
def _compute_central_tendency(
|
|
239
|
-
clean: pl.Series,
|
|
240
|
-
profile: NumericStats,
|
|
241
|
-
) -> None:
|
|
242
|
-
mean = float(clean.mean()) # type: ignore[arg-type]
|
|
243
|
-
median = float(clean.median()) # type: ignore[arg-type]
|
|
244
|
-
|
|
245
|
-
profile.mean = mean
|
|
246
|
-
profile.median = median
|
|
247
|
-
|
|
248
|
-
# Mean/median ratio: primary skew indicator at a glance.
|
|
249
|
-
# Guard against division by zero (e.g. a column of all zeros).
|
|
250
|
-
if median == 0.0:
|
|
251
|
-
profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
|
|
252
|
-
else:
|
|
253
|
-
profile.mean_median_ratio = mean / median
|
|
254
|
-
|
|
255
|
-
# ------------------------------------------------------------------
|
|
256
|
-
# Step 2: Spread
|
|
257
|
-
# ------------------------------------------------------------------
|
|
258
|
-
|
|
259
|
-
@staticmethod
|
|
260
|
-
def _compute_spread(
|
|
261
|
-
clean: pl.Series,
|
|
262
|
-
profile: NumericStats,
|
|
263
|
-
) -> None:
|
|
264
|
-
n = clean.len()
|
|
265
|
-
if n < 2:
|
|
266
|
-
# Std / variance undefined for a single observation
|
|
267
|
-
profile.std = 0.0
|
|
268
|
-
profile.variance = 0.0
|
|
269
|
-
return
|
|
270
|
-
|
|
271
|
-
std = float(clean.std(ddof=1)) # type: ignore[arg-type]
|
|
272
|
-
profile.std = std
|
|
273
|
-
profile.variance = std**2
|
|
274
|
-
|
|
275
|
-
# ------------------------------------------------------------------
|
|
276
|
-
# Step 3: Shape — skewness and kurtosis
|
|
272
|
+
# Step 2: Shape — skewness and kurtosis
|
|
277
273
|
# ------------------------------------------------------------------
|
|
278
274
|
|
|
279
275
|
@staticmethod
|
|
@@ -315,48 +311,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
315
311
|
profile.kurtosis_tag = KurtosisTag.Mesokurtic
|
|
316
312
|
|
|
317
313
|
# ------------------------------------------------------------------
|
|
318
|
-
# Step
|
|
319
|
-
# ------------------------------------------------------------------
|
|
320
|
-
|
|
321
|
-
@staticmethod
|
|
322
|
-
def _compute_range(
|
|
323
|
-
clean: pl.Series,
|
|
324
|
-
profile: NumericStats,
|
|
325
|
-
) -> None:
|
|
326
|
-
profile.min = float(clean.min()) # type: ignore[arg-type]
|
|
327
|
-
profile.max = float(clean.max()) # type: ignore[arg-type]
|
|
328
|
-
|
|
329
|
-
# ------------------------------------------------------------------
|
|
330
|
-
# Step 5: Percentiles
|
|
331
|
-
# ------------------------------------------------------------------
|
|
332
|
-
|
|
333
|
-
@staticmethod
|
|
334
|
-
def _compute_percentiles(
|
|
335
|
-
clean: pl.Series,
|
|
336
|
-
profile: NumericStats,
|
|
337
|
-
) -> None:
|
|
338
|
-
# Polars quantile() is O(n log n) once; compute all at once via select
|
|
339
|
-
# to avoid repeated passes.
|
|
340
|
-
quantile_frame = pl.DataFrame({"v": clean}).select(
|
|
341
|
-
[
|
|
342
|
-
pl.col("v").quantile(q, interpolation="linear").alias(f"q{i}")
|
|
343
|
-
for i, q in enumerate(_QUANTILE_LEVELS)
|
|
344
|
-
]
|
|
345
|
-
)
|
|
346
|
-
row = quantile_frame.row(0)
|
|
347
|
-
# row order: p1, p5, p25, p50, p75, p95, p99
|
|
348
|
-
profile.percentiles = PercentileSnapshot(
|
|
349
|
-
p1=row[0],
|
|
350
|
-
p5=row[1],
|
|
351
|
-
p25=row[2],
|
|
352
|
-
p50=row[3],
|
|
353
|
-
p75=row[4],
|
|
354
|
-
p95=row[5],
|
|
355
|
-
p99=row[6],
|
|
356
|
-
)
|
|
357
|
-
|
|
358
|
-
# ------------------------------------------------------------------
|
|
359
|
-
# Step 6: Scale-anomaly flag
|
|
314
|
+
# Step 3: Scale-anomaly flag
|
|
360
315
|
# ------------------------------------------------------------------
|
|
361
316
|
|
|
362
317
|
@staticmethod
|
|
@@ -148,9 +148,11 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
148
148
|
"""Generates numeric metrics and checks for target skewness."""
|
|
149
149
|
num_profiler = NumericProfiler(config=self.config)
|
|
150
150
|
|
|
151
|
-
|
|
151
|
+
col_name = series.name
|
|
152
|
+
num_result = num_profiler.profile(series.to_frame(), [col_name])
|
|
153
|
+
num_profile = num_result.columns.get(col_name)
|
|
152
154
|
result.numeric_profile = num_profile
|
|
153
155
|
|
|
154
156
|
# Flag Skewness (Highly skewed targets often require Log/Yeo-Johnson transforms)
|
|
155
|
-
if num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
|
|
157
|
+
if num_profile and num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
|
|
156
158
|
result.flags.append(TargetFlag.HighlySkewed)
|
|
@@ -35,10 +35,11 @@ _IDENTIFIER_UNIQUE_RATIO = 0.99 # >99 % unique → identifier
|
|
|
35
35
|
_IDENTIFIER_MAX_MEDIAN_LENGTH = 40
|
|
36
36
|
_DISCRETE_NUNIQUE_THRESHOLD = 20 # numeric with <20 unique values → discrete
|
|
37
37
|
|
|
38
|
-
_FREE_TEXT_AVG_WORDS: int =
|
|
39
|
-
_FREE_TEXT_MEDIAN_CHARS: int =
|
|
40
|
-
_FREE_TEXT_P90_CHARS: int =
|
|
38
|
+
_FREE_TEXT_AVG_WORDS: int = 3
|
|
39
|
+
_FREE_TEXT_MEDIAN_CHARS: int = 20
|
|
40
|
+
_FREE_TEXT_P90_CHARS: int = 35
|
|
41
41
|
_FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40
|
|
42
|
+
_FREE_TEXT_HIGH_UNIQUE_WITH_SPACES: float = 0.70 # unique ratio above which multi-token strings → Text
|
|
42
43
|
|
|
43
44
|
|
|
44
45
|
# Common boolean string values (lowercased)
|
|
@@ -77,115 +78,87 @@ class TypeDetector:
|
|
|
77
78
|
original_dtype=original_dtype,
|
|
78
79
|
inferred_dtype=original_dtype,
|
|
79
80
|
)
|
|
80
|
-
|
|
81
|
-
# Work with a copy that may be re-assigned after coercion
|
|
82
81
|
working = series
|
|
83
82
|
|
|
84
83
|
# 1 & 2: Coercion for string columns
|
|
85
|
-
if series.dtype
|
|
84
|
+
if series.dtype in (pl.Utf8, pl.String):
|
|
86
85
|
coerced, flag = self._try_numeric_coerce(series, n_rows)
|
|
87
86
|
if coerced is not None:
|
|
88
87
|
info.inferred_dtype = str(coerced.dtype)
|
|
89
88
|
info.flags.append(flag) # type: ignore[arg-type]
|
|
90
89
|
working = coerced
|
|
91
|
-
|
|
92
|
-
self._check_coerced_encoded_category(working, info, n_rows)
|
|
90
|
+
self._check_coerced_encoded_category(working, info)
|
|
93
91
|
else:
|
|
94
92
|
coerced_dt, flag_dt = self._try_datetime_coerce(
|
|
95
|
-
series,
|
|
93
|
+
series, n_rows
|
|
96
94
|
)
|
|
97
95
|
if coerced_dt is not None:
|
|
98
96
|
info.inferred_dtype = str(coerced_dt.dtype)
|
|
99
97
|
info.flags.append(flag_dt) # type: ignore[arg-type]
|
|
100
|
-
working = coerced_dt
|
|
101
|
-
|
|
102
98
|
info.semantic_type = SemanticType.Datetime
|
|
103
99
|
results[col_name] = info
|
|
104
100
|
continue
|
|
105
101
|
|
|
106
102
|
# 3: Boolean candidate
|
|
107
103
|
self._check_boolean_candidate(working, info)
|
|
104
|
+
if TypeFlag.BooleanCandidate in info.flags:
|
|
105
|
+
info.semantic_type = SemanticType.Boolean
|
|
106
|
+
results[col_name] = info
|
|
107
|
+
continue
|
|
108
|
+
|
|
109
|
+
# Native datetime types
|
|
110
|
+
if working.dtype in (pl.Date, pl.Datetime, pl.Duration, pl.Time) or isinstance(
|
|
111
|
+
working.dtype, pl.Datetime
|
|
112
|
+
):
|
|
113
|
+
info.semantic_type = SemanticType.Datetime
|
|
114
|
+
results[col_name] = info
|
|
115
|
+
continue
|
|
108
116
|
|
|
109
|
-
#
|
|
117
|
+
# 4–7: Numeric path
|
|
110
118
|
if working.dtype in _NUMERIC_DTYPES:
|
|
111
|
-
# 4 & 5: Encoded category and identifier checks — integers only.
|
|
112
|
-
# Continuous floats have high cardinality by nature and are never
|
|
113
|
-
# identifiers; restricting these checks prevents false Identifier
|
|
114
|
-
# classification of genuine numeric features.
|
|
115
119
|
if working.dtype in _INT_DTYPES:
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
info.
|
|
129
|
-
|
|
130
|
-
TypeFlag.IdentifierColumn,
|
|
131
|
-
TypeFlag.SequentialIndex,
|
|
132
|
-
TypeFlag.FloatSequentialIndex,
|
|
133
|
-
)
|
|
134
|
-
):
|
|
120
|
+
# EncodedCategory and IdentifierColumn are mutually exclusive:
|
|
121
|
+
# low-cardinality and near-unique cannot both be true.
|
|
122
|
+
# Check encoded category first; skip identifier if it matches.
|
|
123
|
+
self._check_encoded_category(working, info)
|
|
124
|
+
if TypeFlag.EncodedCategory not in info.flags:
|
|
125
|
+
self._check_identifier(working, info, n_rows)
|
|
126
|
+
if TypeFlag.IdentifierColumn in info.flags:
|
|
127
|
+
self._check_sequential_index(working, info, n_rows)
|
|
128
|
+
|
|
129
|
+
if TypeFlag.EncodedCategory in info.flags:
|
|
130
|
+
info.semantic_type = SemanticType.Categorical
|
|
131
|
+
elif TypeFlag.IdentifierColumn in info.flags:
|
|
132
|
+
info.semantic_type = SemanticType.Identifier
|
|
133
|
+
else:
|
|
135
134
|
self._classify_numeric_kind(working, info)
|
|
135
|
+
info.semantic_type = SemanticType.Numeric
|
|
136
136
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
self._check_identifier(working, info, n_rows)
|
|
137
|
+
results[col_name] = info
|
|
138
|
+
continue
|
|
140
139
|
|
|
140
|
+
# String path
|
|
141
|
+
if working.dtype in (pl.Utf8, pl.String):
|
|
141
142
|
self._check_free_text(working, info, n_rows)
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
n_rows
|
|
147
|
-
|
|
148
|
-
|
|
143
|
+
if TypeFlag.FreeTextCandidate in info.flags:
|
|
144
|
+
info.semantic_type = SemanticType.Text
|
|
145
|
+
results[col_name] = info
|
|
146
|
+
continue
|
|
147
|
+
self._check_identifier(working, info, n_rows)
|
|
148
|
+
info.semantic_type = (
|
|
149
|
+
SemanticType.Identifier
|
|
150
|
+
if TypeFlag.IdentifierColumn in info.flags
|
|
151
|
+
else SemanticType.Categorical
|
|
152
|
+
)
|
|
153
|
+
results[col_name] = info
|
|
154
|
+
continue
|
|
155
|
+
|
|
156
|
+
# Fallback
|
|
157
|
+
info.semantic_type = SemanticType.Text
|
|
149
158
|
results[col_name] = info
|
|
150
159
|
|
|
151
160
|
return results
|
|
152
161
|
|
|
153
|
-
@staticmethod
|
|
154
|
-
def _derive_semantic_type(
|
|
155
|
-
info: ColumnTypeInfo,
|
|
156
|
-
working: pl.Series,
|
|
157
|
-
n_rows: int,
|
|
158
|
-
) -> SemanticType:
|
|
159
|
-
if TypeFlag.IdentifierColumn in info.flags:
|
|
160
|
-
return SemanticType.Identifier
|
|
161
|
-
|
|
162
|
-
if TypeFlag.BooleanCandidate in info.flags:
|
|
163
|
-
return SemanticType.Boolean
|
|
164
|
-
|
|
165
|
-
is_native_datetime = working.dtype in (
|
|
166
|
-
pl.Date,
|
|
167
|
-
pl.Datetime,
|
|
168
|
-
pl.Duration,
|
|
169
|
-
pl.Time,
|
|
170
|
-
) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))
|
|
171
|
-
|
|
172
|
-
if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
|
|
173
|
-
return SemanticType.Datetime
|
|
174
|
-
|
|
175
|
-
if TypeFlag.EncodedCategory in info.flags:
|
|
176
|
-
return SemanticType.Categorical
|
|
177
|
-
|
|
178
|
-
if working.dtype in (pl.Utf8, pl.String):
|
|
179
|
-
if TypeFlag.FreeTextCandidate in info.flags:
|
|
180
|
-
return SemanticType.Text
|
|
181
|
-
|
|
182
|
-
return SemanticType.Categorical
|
|
183
|
-
|
|
184
|
-
if working.dtype in _NUMERIC_DTYPES:
|
|
185
|
-
return SemanticType.Numeric
|
|
186
|
-
|
|
187
|
-
return SemanticType.Categorical
|
|
188
|
-
|
|
189
162
|
# ------------------------------------------------------------------
|
|
190
163
|
# Step 1: Numeric coercion
|
|
191
164
|
# ------------------------------------------------------------------
|
|
@@ -221,7 +194,7 @@ class TypeDetector:
|
|
|
221
194
|
|
|
222
195
|
@staticmethod
|
|
223
196
|
def _try_datetime_coerce(
|
|
224
|
-
series: pl.Series,
|
|
197
|
+
series: pl.Series, n_rows: int
|
|
225
198
|
) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
|
|
226
199
|
"""
|
|
227
200
|
Attempt datetime coercion if the column name looks date-like.
|
|
@@ -269,7 +242,7 @@ class TypeDetector:
|
|
|
269
242
|
|
|
270
243
|
@staticmethod
|
|
271
244
|
def _check_coerced_encoded_category(
|
|
272
|
-
series: pl.Series, info: ColumnTypeInfo
|
|
245
|
+
series: pl.Series, info: ColumnTypeInfo
|
|
273
246
|
) -> None:
|
|
274
247
|
"""
|
|
275
248
|
Post-coercion low-cardinality check for Float64 series that originated
|
|
@@ -312,9 +285,8 @@ class TypeDetector:
|
|
|
312
285
|
|
|
313
286
|
@staticmethod
|
|
314
287
|
def _check_encoded_category(
|
|
315
|
-
series: pl.Series, info: ColumnTypeInfo
|
|
288
|
+
series: pl.Series, info: ColumnTypeInfo
|
|
316
289
|
) -> None:
|
|
317
|
-
# Skip if already flagged as boolean candidate (subset of {0,1})
|
|
318
290
|
if TypeFlag.BooleanCandidate in info.flags:
|
|
319
291
|
return
|
|
320
292
|
|
|
@@ -357,16 +329,17 @@ class TypeDetector:
|
|
|
357
329
|
return
|
|
358
330
|
|
|
359
331
|
if series.dtype in (pl.Utf8, pl.String):
|
|
360
|
-
|
|
361
|
-
if
|
|
332
|
+
non_null = series.drop_nulls()
|
|
333
|
+
if non_null.len() == 0:
|
|
362
334
|
return
|
|
363
335
|
|
|
364
|
-
median_length =
|
|
336
|
+
median_length = non_null.str.len_chars().median()
|
|
337
|
+
if median_length is not None and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH:
|
|
338
|
+
return
|
|
365
339
|
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
):
|
|
340
|
+
# Real identifiers are single tokens — no spaces.
|
|
341
|
+
# Sentences and descriptions have median_spaces > 0.
|
|
342
|
+
if float(non_null.str.count_matches(r"\s+").median() or 0.0) > 0:
|
|
370
343
|
return
|
|
371
344
|
|
|
372
345
|
info.flags.append(TypeFlag.IdentifierColumn)
|
|
@@ -440,24 +413,27 @@ class TypeDetector:
|
|
|
440
413
|
|
|
441
414
|
char_lengths = non_null.str.len_chars()
|
|
442
415
|
median_chars = float(char_lengths.median() or 0.0)
|
|
416
|
+
space_counts = non_null.str.count_matches(r"\s+")
|
|
417
|
+
median_spaces = float(space_counts.median() or 0.0)
|
|
418
|
+
median_words = median_spaces + 1.0
|
|
419
|
+
unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
|
|
443
420
|
|
|
444
|
-
|
|
421
|
+
# Multi-word strings of medium length: names, addresses, short descriptions
|
|
422
|
+
if median_chars > _FREE_TEXT_MEDIAN_CHARS and median_spaces >= 1.0:
|
|
445
423
|
info.flags.append(TypeFlag.FreeTextCandidate)
|
|
446
424
|
return
|
|
447
425
|
|
|
448
|
-
|
|
449
|
-
median_words = float(space_counts.median() or 0.0) + 1.0
|
|
450
|
-
|
|
426
|
+
# Long average word count: sentences, paragraphs
|
|
451
427
|
if median_words > _FREE_TEXT_AVG_WORDS:
|
|
452
428
|
info.flags.append(TypeFlag.FreeTextCandidate)
|
|
453
429
|
return
|
|
454
430
|
|
|
455
431
|
p90_chars = float(char_lengths.quantile(0.9) or 0.0)
|
|
432
|
+
if p90_chars > _FREE_TEXT_P90_CHARS and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO:
|
|
433
|
+
info.flags.append(TypeFlag.FreeTextCandidate)
|
|
434
|
+
return
|
|
456
435
|
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
if
|
|
460
|
-
p90_chars > _FREE_TEXT_P90_CHARS
|
|
461
|
-
and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
|
|
462
|
-
):
|
|
436
|
+
# High-cardinality multi-token strings that don't meet char thresholds:
|
|
437
|
+
# e.g. short full names like "John Smith", compound tokens
|
|
438
|
+
if unique_ratio >= _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES and median_spaces >= 1.0:
|
|
463
439
|
info.flags.append(TypeFlag.FreeTextCandidate)
|
|
@@ -199,7 +199,7 @@ class StructuralProfiler:
|
|
|
199
199
|
|
|
200
200
|
# 8a. Feature-feature matrices — computed ONCE, target-independent.
|
|
201
201
|
feature_corr = corr_profiler.profile_features(
|
|
202
|
-
data, numeric_cols
|
|
202
|
+
data, numeric_cols, categorical_cols
|
|
203
203
|
)
|
|
204
204
|
result.dataset.feature_correlation = feature_corr
|
|
205
205
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|