dataforge-ml 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/PKG-INFO +3 -1
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/pyproject.toml +3 -1
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_boolean_profiler.py +1 -13
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_categorical.py +1 -2
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_correlation_config.py +134 -35
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_correlation_profiler.py +168 -3
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_missingness_profiler.py +2 -1
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_numeric_profiler.py +73 -118
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_target_profiler.py +4 -2
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_type_detector.py +8 -7
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/structural.py +1 -1
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml.egg-info/PKG-INFO +3 -1
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml.egg-info/requires.txt +2 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/LICENSE +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/README.md +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/setup.cfg +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -15,6 +15,8 @@ Requires-Dist: polars>=1.0.0
|
|
|
15
15
|
Requires-Dist: scikit-learn>=1.0.0
|
|
16
16
|
Requires-Dist: scipy>=1.10.0
|
|
17
17
|
Requires-Dist: numpy>=2.0.0
|
|
18
|
+
Requires-Dist: pandas>=2.0.0
|
|
19
|
+
Requires-Dist: chardet>=5.0.0
|
|
18
20
|
Provides-Extra: dev
|
|
19
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
20
22
|
Dynamic: license-file
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "dataforge-ml"
|
|
7
|
-
version = "0.4.0"
|
|
7
|
+
version = "0.6.0"
|
|
8
8
|
description = "A automated feature engineering and designing pipeline library"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -21,6 +21,8 @@ dependencies = [
|
|
|
21
21
|
"scikit-learn>=1.0.0",
|
|
22
22
|
"scipy>=1.10.0",
|
|
23
23
|
"numpy>=2.0.0",
|
|
24
|
+
"pandas>=2.0.0",
|
|
25
|
+
"chardet>=5.0.0",
|
|
24
26
|
]
|
|
25
27
|
|
|
26
28
|
[project.optional-dependencies]
|
|
@@ -84,19 +84,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
|
|
|
84
84
|
if override is not None:
|
|
85
85
|
return False
|
|
86
86
|
|
|
87
|
-
|
|
88
|
-
if series.dtype == pl.Boolean:
|
|
89
|
-
return True
|
|
90
|
-
|
|
91
|
-
# Integer {0, 1} column — check after dropping nulls
|
|
92
|
-
if series.dtype in _INT_DTYPES:
|
|
93
|
-
clean = series.drop_nulls()
|
|
94
|
-
if clean.len() == 0:
|
|
95
|
-
return False
|
|
96
|
-
unique_vals = set(clean.unique().to_list())
|
|
97
|
-
return unique_vals <= {0, 1}
|
|
98
|
-
|
|
99
|
-
return False
|
|
87
|
+
return True
|
|
100
88
|
|
|
101
89
|
# ------------------------------------------------------------------
|
|
102
90
|
# Orchestration
|
|
@@ -49,7 +49,6 @@ from .config import (
|
|
|
49
49
|
ProfileConfig,
|
|
50
50
|
SemanticType,
|
|
51
51
|
)
|
|
52
|
-
from ..models._data_types import _CAT_DTYPES
|
|
53
52
|
|
|
54
53
|
# ---------------------------------------------------------------------------
|
|
55
54
|
# Module-level thresholds (documented so callers can see what drives flags)
|
|
@@ -115,7 +114,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
115
114
|
if override is not None:
|
|
116
115
|
return False
|
|
117
116
|
|
|
118
|
-
return
|
|
117
|
+
return True
|
|
119
118
|
|
|
120
119
|
def _run(
|
|
121
120
|
self,
|
|
@@ -7,41 +7,41 @@ numeric/categorical column lists that are already resolved upstream).
|
|
|
7
7
|
|
|
8
8
|
Design notes
|
|
9
9
|
------------
|
|
10
|
-
- Pearson
|
|
11
|
-
-
|
|
12
|
-
|
|
13
|
-
- Near-redundancy
|
|
14
|
-
|
|
15
|
-
- Feature–target
|
|
16
|
-
|
|
17
|
-
- Mutual information: MI for all features vs target (classif or regression).
|
|
18
|
-
Captures non-linear dependencies correlation misses.
|
|
10
|
+
- Pearson / Spearman : linear / monotonic relationships between numeric columns.
|
|
11
|
+
- Cramér's V : association between categorical column pairs [0, 1].
|
|
12
|
+
- Eta-squared : numeric-categorical association via ANOVA [0, 1].
|
|
13
|
+
- Near-redundancy : Pearson/Spearman |r| > 0.95, Cramér's V > 0.80,
|
|
14
|
+
or eta² > 0.50 flagged — near-identical signal.
|
|
15
|
+
- Feature–target : Pearson (numeric target), ANOVA/eta² (categorical target).
|
|
16
|
+
- Mutual information : MI for all features vs target (classif or regression).
|
|
19
17
|
"""
|
|
18
|
+
|
|
20
19
|
from __future__ import annotations
|
|
21
20
|
|
|
22
21
|
from dataclasses import dataclass, field
|
|
23
22
|
from enum import StrEnum
|
|
24
23
|
from typing import Optional
|
|
25
24
|
|
|
26
|
-
|
|
27
25
|
# ---------------------------------------------------------------------------
|
|
28
26
|
# Enums
|
|
29
27
|
# ---------------------------------------------------------------------------
|
|
30
28
|
|
|
29
|
+
|
|
31
30
|
class CorrelationMethod(StrEnum):
|
|
32
|
-
Pearson
|
|
31
|
+
Pearson = "pearson"
|
|
33
32
|
Spearman = "spearman"
|
|
34
33
|
|
|
35
34
|
|
|
36
35
|
class TargetType(StrEnum):
|
|
37
|
-
Numeric
|
|
38
|
-
Categorical
|
|
36
|
+
Numeric = "numeric" # numeric target → Pearson + MI regression
|
|
37
|
+
Categorical = "categorical" # categorical target → ANOVA/eta² + MI classif
|
|
39
38
|
|
|
40
39
|
|
|
41
40
|
# ---------------------------------------------------------------------------
|
|
42
41
|
# Pairwise correlation result
|
|
43
42
|
# ---------------------------------------------------------------------------
|
|
44
43
|
|
|
44
|
+
|
|
45
45
|
@dataclass
|
|
46
46
|
class CorrelationPair:
|
|
47
47
|
"""
|
|
@@ -62,14 +62,74 @@ class CorrelationPair:
|
|
|
62
62
|
|
|
63
63
|
col_a: str
|
|
64
64
|
col_b: str
|
|
65
|
-
pearson_r:
|
|
65
|
+
pearson_r: Optional[float] = None
|
|
66
66
|
spearman_r: Optional[float] = None
|
|
67
67
|
near_redundant: bool = False
|
|
68
68
|
|
|
69
69
|
def to_dict(self) -> dict:
|
|
70
70
|
return {
|
|
71
|
-
"col_a": self.col_a,
|
|
72
|
-
"
|
|
71
|
+
"col_a": self.col_a,
|
|
72
|
+
"col_b": self.col_b,
|
|
73
|
+
"pearson_r": self.pearson_r,
|
|
74
|
+
"spearman_r": self.spearman_r,
|
|
75
|
+
"near_redundant": self.near_redundant,
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
|
|
80
|
+
class CramerVPair:
|
|
81
|
+
"""
|
|
82
|
+
Cramér's V association between two categorical columns.
|
|
83
|
+
|
|
84
|
+
Attributes
|
|
85
|
+
----------
|
|
86
|
+
col_a, col_b : str
|
|
87
|
+
cramer_v : float | None
|
|
88
|
+
Cramér's V in [0, 1]. None when computation fails or sample too small.
|
|
89
|
+
near_redundant : bool
|
|
90
|
+
True when cramer_v exceeds the near-redundancy threshold (default 0.80).
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
col_a: str = ""
|
|
94
|
+
col_b: str = ""
|
|
95
|
+
cramer_v: Optional[float] = None
|
|
96
|
+
near_redundant: bool = False
|
|
97
|
+
|
|
98
|
+
def to_dict(self) -> dict:
|
|
99
|
+
return {
|
|
100
|
+
"col_a": self.col_a,
|
|
101
|
+
"col_b": self.col_b,
|
|
102
|
+
"cramer_v": self.cramer_v,
|
|
103
|
+
"near_redundant": self.near_redundant,
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass
|
|
108
|
+
class EtaSquaredPair:
|
|
109
|
+
"""
|
|
110
|
+
Eta-squared (η²) association between a numeric and a categorical column.
|
|
111
|
+
|
|
112
|
+
Attributes
|
|
113
|
+
----------
|
|
114
|
+
numeric_col : str
|
|
115
|
+
categorical_col : str
|
|
116
|
+
eta_squared : float | None
|
|
117
|
+
Effect size in [0, 1]. None when computation fails.
|
|
118
|
+
Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
|
|
119
|
+
near_redundant : bool
|
|
120
|
+
True when eta_squared exceeds the near-redundancy threshold (default 0.50).
|
|
121
|
+
"""
|
|
122
|
+
|
|
123
|
+
numeric_col: str = ""
|
|
124
|
+
categorical_col: str = ""
|
|
125
|
+
eta_squared: Optional[float] = None
|
|
126
|
+
near_redundant: bool = False
|
|
127
|
+
|
|
128
|
+
def to_dict(self) -> dict:
|
|
129
|
+
return {
|
|
130
|
+
"numeric_col": self.numeric_col,
|
|
131
|
+
"categorical_col": self.categorical_col,
|
|
132
|
+
"eta_squared": self.eta_squared,
|
|
73
133
|
"near_redundant": self.near_redundant,
|
|
74
134
|
}
|
|
75
135
|
|
|
@@ -78,6 +138,7 @@ class CorrelationPair:
|
|
|
78
138
|
# Feature–target entries
|
|
79
139
|
# ---------------------------------------------------------------------------
|
|
80
140
|
|
|
141
|
+
|
|
81
142
|
@dataclass
|
|
82
143
|
class NumericTargetCorrelation:
|
|
83
144
|
"""
|
|
@@ -88,7 +149,8 @@ class NumericTargetCorrelation:
|
|
|
88
149
|
feature : str
|
|
89
150
|
pearson_r : float | None
|
|
90
151
|
"""
|
|
91
|
-
|
|
152
|
+
|
|
153
|
+
feature: str
|
|
92
154
|
pearson_r: Optional[float] = None
|
|
93
155
|
|
|
94
156
|
def to_dict(self) -> dict:
|
|
@@ -113,15 +175,18 @@ class CategoricalTargetCorrelation:
|
|
|
113
175
|
Effect size: SS_between / SS_total. Ranges [0, 1].
|
|
114
176
|
Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
|
|
115
177
|
"""
|
|
116
|
-
|
|
178
|
+
|
|
179
|
+
feature: str
|
|
117
180
|
f_statistic: Optional[float] = None
|
|
118
|
-
p_value:
|
|
181
|
+
p_value: Optional[float] = None
|
|
119
182
|
eta_squared: Optional[float] = None
|
|
120
183
|
|
|
121
184
|
def to_dict(self) -> dict:
|
|
122
185
|
return {
|
|
123
|
-
"feature": self.feature,
|
|
124
|
-
"
|
|
186
|
+
"feature": self.feature,
|
|
187
|
+
"f_statistic": self.f_statistic,
|
|
188
|
+
"p_value": self.p_value,
|
|
189
|
+
"eta_squared": self.eta_squared,
|
|
125
190
|
}
|
|
126
191
|
|
|
127
192
|
|
|
@@ -129,6 +194,7 @@ class CategoricalTargetCorrelation:
|
|
|
129
194
|
# Mutual information
|
|
130
195
|
# ---------------------------------------------------------------------------
|
|
131
196
|
|
|
197
|
+
|
|
132
198
|
@dataclass
|
|
133
199
|
class MutualInformationEntry:
|
|
134
200
|
"""
|
|
@@ -143,9 +209,10 @@ class MutualInformationEntry:
|
|
|
143
209
|
rank : int
|
|
144
210
|
1 = highest MI (most informative).
|
|
145
211
|
"""
|
|
146
|
-
|
|
212
|
+
|
|
213
|
+
feature: str
|
|
147
214
|
mi_score: float = 0.0
|
|
148
|
-
rank:
|
|
215
|
+
rank: int = 0
|
|
149
216
|
|
|
150
217
|
def to_dict(self) -> dict:
|
|
151
218
|
return {"feature": self.feature, "mi_score": self.mi_score, "rank": self.rank}
|
|
@@ -155,6 +222,7 @@ class MutualInformationEntry:
|
|
|
155
222
|
# Near-redundancy summary
|
|
156
223
|
# ---------------------------------------------------------------------------
|
|
157
224
|
|
|
225
|
+
|
|
158
226
|
@dataclass
|
|
159
227
|
class NearRedundancyGroup:
|
|
160
228
|
"""
|
|
@@ -164,17 +232,22 @@ class NearRedundancyGroup:
|
|
|
164
232
|
The suggested_drop list contains every column except the first
|
|
165
233
|
alphabetically — a simple, deterministic heuristic.
|
|
166
234
|
"""
|
|
167
|
-
|
|
235
|
+
|
|
236
|
+
columns: list[str] = field(default_factory=list)
|
|
168
237
|
suggested_drop: list[str] = field(default_factory=list)
|
|
169
238
|
|
|
170
239
|
def to_dict(self) -> dict:
|
|
171
|
-
return {
|
|
240
|
+
return {
|
|
241
|
+
"columns": list(self.columns),
|
|
242
|
+
"suggested_drop": list(self.suggested_drop),
|
|
243
|
+
}
|
|
172
244
|
|
|
173
245
|
|
|
174
246
|
# ---------------------------------------------------------------------------
|
|
175
247
|
# Top-level result
|
|
176
248
|
# ---------------------------------------------------------------------------
|
|
177
249
|
|
|
250
|
+
|
|
178
251
|
@dataclass
|
|
179
252
|
class CorrelationProfileResult:
|
|
180
253
|
"""
|
|
@@ -211,23 +284,34 @@ class CorrelationProfileResult:
|
|
|
211
284
|
|
|
212
285
|
# Column scope
|
|
213
286
|
analysed_numeric_columns: list[str] = field(default_factory=list)
|
|
287
|
+
analysed_categorical_columns: list[str] = field(default_factory=list)
|
|
214
288
|
|
|
215
289
|
# Pairwise matrices
|
|
216
|
-
pearson_matrix:
|
|
290
|
+
pearson_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
|
|
217
291
|
spearman_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
|
|
218
292
|
|
|
219
|
-
# Pairwise summaries
|
|
220
|
-
pairwise:
|
|
293
|
+
# Pairwise summaries — numeric ↔ numeric
|
|
294
|
+
pairwise: list[CorrelationPair] = field(default_factory=list)
|
|
221
295
|
near_redundant_pairs: list[CorrelationPair] = field(default_factory=list)
|
|
222
296
|
near_redundancy_groups: list[NearRedundancyGroup] = field(default_factory=list)
|
|
223
297
|
|
|
298
|
+
# Pairwise summaries — categorical ↔ categorical (Cramér's V)
|
|
299
|
+
cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
|
|
300
|
+
near_redundant_cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
|
|
301
|
+
|
|
302
|
+
# Pairwise summaries — numeric ↔ categorical (eta-squared)
|
|
303
|
+
eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
|
|
304
|
+
near_redundant_eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
|
|
305
|
+
|
|
224
306
|
# Target info
|
|
225
|
-
target_column: Optional[str]
|
|
226
|
-
target_type:
|
|
307
|
+
target_column: Optional[str] = None
|
|
308
|
+
target_type: Optional[TargetType] = None
|
|
227
309
|
|
|
228
310
|
# Feature–target correlations (top-10 each)
|
|
229
|
-
feature_target_numeric:
|
|
230
|
-
feature_target_categorical:
|
|
311
|
+
feature_target_numeric: list[NumericTargetCorrelation] = field(default_factory=list)
|
|
312
|
+
feature_target_categorical: list[CategoricalTargetCorrelation] = field(
|
|
313
|
+
default_factory=list
|
|
314
|
+
)
|
|
231
315
|
|
|
232
316
|
# Mutual information (all features, ranked)
|
|
233
317
|
mutual_information: list[MutualInformationEntry] = field(default_factory=list)
|
|
@@ -249,14 +333,29 @@ class CorrelationProfileResult:
|
|
|
249
333
|
def to_dict(self) -> dict:
|
|
250
334
|
return {
|
|
251
335
|
"analysed_numeric_columns": list(self.analysed_numeric_columns),
|
|
336
|
+
"analysed_categorical_columns": list(self.analysed_categorical_columns),
|
|
252
337
|
"pearson_matrix": {k: dict(v) for k, v in self.pearson_matrix.items()},
|
|
253
338
|
"spearman_matrix": {k: dict(v) for k, v in self.spearman_matrix.items()},
|
|
254
339
|
"pairwise": [p.to_dict() for p in self.pairwise],
|
|
255
340
|
"near_redundant_pairs": [p.to_dict() for p in self.near_redundant_pairs],
|
|
256
|
-
"near_redundancy_groups": [
|
|
341
|
+
"near_redundancy_groups": [
|
|
342
|
+
g.to_dict() for g in self.near_redundancy_groups
|
|
343
|
+
],
|
|
344
|
+
"cramer_v_pairs": [p.to_dict() for p in self.cramer_v_pairs],
|
|
345
|
+
"near_redundant_cramer_v_pairs": [
|
|
346
|
+
p.to_dict() for p in self.near_redundant_cramer_v_pairs
|
|
347
|
+
],
|
|
348
|
+
"eta_squared_pairs": [p.to_dict() for p in self.eta_squared_pairs],
|
|
349
|
+
"near_redundant_eta_squared_pairs": [
|
|
350
|
+
p.to_dict() for p in self.near_redundant_eta_squared_pairs
|
|
351
|
+
],
|
|
257
352
|
"target_column": self.target_column,
|
|
258
353
|
"target_type": str(self.target_type) if self.target_type else None,
|
|
259
|
-
"feature_target_numeric": [
|
|
260
|
-
|
|
354
|
+
"feature_target_numeric": [
|
|
355
|
+
f.to_dict() for f in self.feature_target_numeric
|
|
356
|
+
],
|
|
357
|
+
"feature_target_categorical": [
|
|
358
|
+
f.to_dict() for f in self.feature_target_categorical
|
|
359
|
+
],
|
|
261
360
|
"mutual_information": [m.to_dict() for m in self.mutual_information],
|
|
262
361
|
}
|
{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
@@ -47,6 +47,8 @@ from ._correlation_config import (
|
|
|
47
47
|
CategoricalTargetCorrelation,
|
|
48
48
|
CorrelationPair,
|
|
49
49
|
CorrelationProfileResult,
|
|
50
|
+
CramerVPair,
|
|
51
|
+
EtaSquaredPair,
|
|
50
52
|
MutualInformationEntry,
|
|
51
53
|
NearRedundancyGroup,
|
|
52
54
|
NumericTargetCorrelation,
|
|
@@ -55,6 +57,8 @@ from ._correlation_config import (
|
|
|
55
57
|
from ..models._data_types import _NUMERIC_DTYPES, _INT_DTYPES
|
|
56
58
|
|
|
57
59
|
_NEAR_REDUNDANT_THRESHOLD: float = 0.95
|
|
60
|
+
_NEAR_REDUNDANT_CRAMER_V_THRESHOLD: float = 0.80
|
|
61
|
+
_NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD: float = 0.50
|
|
58
62
|
_TOP_N_FEATURE_TARGET: int = 10
|
|
59
63
|
_MI_N_NEIGHBORS: int = 3
|
|
60
64
|
_MI_MIN_ROWS: int = 10 # min complete-case rows for a meaningful k-NN MI estimate
|
|
@@ -142,13 +146,14 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
142
146
|
self,
|
|
143
147
|
df: pl.DataFrame,
|
|
144
148
|
numeric_cols: list[str],
|
|
149
|
+
categorical_cols: Optional[list[str]] = None,
|
|
145
150
|
) -> CorrelationProfileResult:
|
|
146
151
|
"""
|
|
147
152
|
Compute pairwise feature-feature correlation matrices.
|
|
148
153
|
|
|
149
|
-
Pearson + Spearman
|
|
150
|
-
All target-specific fields
|
|
151
|
-
|
|
154
|
+
Pearson + Spearman for numeric pairs, Cramér's V for categorical pairs,
|
|
155
|
+
eta-squared for numeric-categorical pairs. All target-specific fields
|
|
156
|
+
are left at their defaults. Call profile_target() for target analysis.
|
|
152
157
|
"""
|
|
153
158
|
result = CorrelationProfileResult()
|
|
154
159
|
|
|
@@ -159,6 +164,9 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
159
164
|
]
|
|
160
165
|
result.analysed_numeric_columns = resolved_numeric
|
|
161
166
|
|
|
167
|
+
resolved_categorical = [c for c in (categorical_cols or []) if c in df.columns]
|
|
168
|
+
result.analysed_categorical_columns = resolved_categorical
|
|
169
|
+
|
|
162
170
|
if len(resolved_numeric) >= 2:
|
|
163
171
|
pearson_mat, spearman_mat = self._compute_matrices(df, resolved_numeric)
|
|
164
172
|
result.pearson_matrix = pearson_mat
|
|
@@ -171,6 +179,22 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
171
179
|
result.near_redundant_pairs
|
|
172
180
|
)
|
|
173
181
|
|
|
182
|
+
if len(resolved_categorical) >= 2:
|
|
183
|
+
result.cramer_v_pairs = self._compute_cramer_v_pairs(
|
|
184
|
+
df, resolved_categorical, _NEAR_REDUNDANT_CRAMER_V_THRESHOLD
|
|
185
|
+
)
|
|
186
|
+
result.near_redundant_cramer_v_pairs = [
|
|
187
|
+
p for p in result.cramer_v_pairs if p.near_redundant
|
|
188
|
+
]
|
|
189
|
+
|
|
190
|
+
if resolved_numeric and resolved_categorical:
|
|
191
|
+
result.eta_squared_pairs = self._compute_eta_squared_pairs(
|
|
192
|
+
df, resolved_numeric, resolved_categorical, _NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD
|
|
193
|
+
)
|
|
194
|
+
result.near_redundant_eta_squared_pairs = [
|
|
195
|
+
p for p in result.eta_squared_pairs if p.near_redundant
|
|
196
|
+
]
|
|
197
|
+
|
|
174
198
|
return result
|
|
175
199
|
|
|
176
200
|
def profile_target(
|
|
@@ -316,6 +340,147 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
316
340
|
for members in uf.groups()
|
|
317
341
|
]
|
|
318
342
|
|
|
343
|
+
# ------------------------------------------------------------------
|
|
344
|
+
# Step 3b: Cramér's V — categorical ↔ categorical
|
|
345
|
+
# ------------------------------------------------------------------
|
|
346
|
+
|
|
347
|
+
@staticmethod
|
|
348
|
+
def _compute_cramer_v_pairs(
|
|
349
|
+
df: pl.DataFrame,
|
|
350
|
+
cat_cols: list[str],
|
|
351
|
+
threshold: float,
|
|
352
|
+
) -> list[CramerVPair]:
|
|
353
|
+
try:
|
|
354
|
+
from scipy.stats import chi2_contingency
|
|
355
|
+
except ImportError:
|
|
356
|
+
warnings.warn(
|
|
357
|
+
"scipy is required for Cramér's V. Install: pip install scipy",
|
|
358
|
+
stacklevel=3,
|
|
359
|
+
)
|
|
360
|
+
return []
|
|
361
|
+
|
|
362
|
+
import numpy as np
|
|
363
|
+
|
|
364
|
+
pairs: list[CramerVPair] = []
|
|
365
|
+
for col_a, col_b in itertools.combinations(cat_cols, 2):
|
|
366
|
+
pair_df = (
|
|
367
|
+
df.select([
|
|
368
|
+
pl.col(col_a).cast(pl.Utf8, strict=False),
|
|
369
|
+
pl.col(col_b).cast(pl.Utf8, strict=False),
|
|
370
|
+
])
|
|
371
|
+
.drop_nulls()
|
|
372
|
+
)
|
|
373
|
+
n = pair_df.height
|
|
374
|
+
if n < 5:
|
|
375
|
+
pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
|
|
376
|
+
continue
|
|
377
|
+
|
|
378
|
+
counts = pair_df.group_by([col_a, col_b]).agg(pl.len().alias("count"))
|
|
379
|
+
a_unique = sorted(counts[col_a].unique().to_list())
|
|
380
|
+
b_unique = sorted(counts[col_b].unique().to_list())
|
|
381
|
+
if len(a_unique) < 2 or len(b_unique) < 2:
|
|
382
|
+
pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
|
|
383
|
+
continue
|
|
384
|
+
|
|
385
|
+
a_idx = {v: i for i, v in enumerate(a_unique)}
|
|
386
|
+
b_idx = {v: i for i, v in enumerate(b_unique)}
|
|
387
|
+
ct = np.zeros((len(a_unique), len(b_unique)), dtype=int)
|
|
388
|
+
for a_val, b_val, cnt in zip(
|
|
389
|
+
counts[col_a].to_list(),
|
|
390
|
+
counts[col_b].to_list(),
|
|
391
|
+
counts["count"].to_list(),
|
|
392
|
+
):
|
|
393
|
+
ct[a_idx[a_val], b_idx[b_val]] = cnt
|
|
394
|
+
|
|
395
|
+
try:
|
|
396
|
+
chi2, _, _, _ = chi2_contingency(ct)
|
|
397
|
+
r, c = ct.shape
|
|
398
|
+
phi2 = chi2 / n
|
|
399
|
+
# Bergsma & Wicher (2013) bias correction
|
|
400
|
+
phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
|
|
401
|
+
r_corr = r - (r - 1) ** 2 / (n - 1)
|
|
402
|
+
c_corr = c - (c - 1) ** 2 / (n - 1)
|
|
403
|
+
v = float(np.sqrt(phi2_corr / min(r_corr - 1, c_corr - 1)))
|
|
404
|
+
v = max(0.0, min(1.0, v))
|
|
405
|
+
except Exception as exc:
|
|
406
|
+
warnings.warn(
|
|
407
|
+
f"Cramér's V failed for ({col_a}, {col_b}): {exc}", stacklevel=3
|
|
408
|
+
)
|
|
409
|
+
pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
|
|
410
|
+
continue
|
|
411
|
+
|
|
412
|
+
pairs.append(CramerVPair(
|
|
413
|
+
col_a=col_a, col_b=col_b,
|
|
414
|
+
cramer_v=v,
|
|
415
|
+
near_redundant=v > threshold,
|
|
416
|
+
))
|
|
417
|
+
|
|
418
|
+
return pairs
|
|
419
|
+
|
|
420
|
+
# ------------------------------------------------------------------
|
|
421
|
+
# Step 3c: Eta-squared — numeric ↔ categorical
|
|
422
|
+
# ------------------------------------------------------------------
|
|
423
|
+
|
|
424
|
+
@staticmethod
|
|
425
|
+
def _compute_eta_squared_pairs(
|
|
426
|
+
df: pl.DataFrame,
|
|
427
|
+
numeric_cols: list[str],
|
|
428
|
+
cat_cols: list[str],
|
|
429
|
+
threshold: float,
|
|
430
|
+
) -> list[EtaSquaredPair]:
|
|
431
|
+
try:
|
|
432
|
+
from scipy.stats import f_oneway
|
|
433
|
+
except ImportError:
|
|
434
|
+
warnings.warn(
|
|
435
|
+
"scipy is required for eta-squared. Install: pip install scipy",
|
|
436
|
+
stacklevel=3,
|
|
437
|
+
)
|
|
438
|
+
return []
|
|
439
|
+
|
|
440
|
+
pairs: list[EtaSquaredPair] = []
|
|
441
|
+
for num_col in numeric_cols:
|
|
442
|
+
feat = df[num_col].cast(pl.Float64)
|
|
443
|
+
valid_feat = feat.drop_nulls()
|
|
444
|
+
if valid_feat.len() == 0:
|
|
445
|
+
continue
|
|
446
|
+
grand_mean = float(valid_feat.mean()) # type: ignore[arg-type]
|
|
447
|
+
ss_total = float(((valid_feat - grand_mean) ** 2).sum() or 0.0)
|
|
448
|
+
|
|
449
|
+
for cat_col in cat_cols:
|
|
450
|
+
target = df[cat_col]
|
|
451
|
+
categories = target.drop_nulls().unique().to_list()
|
|
452
|
+
groups = [
|
|
453
|
+
feat.filter(target == cat).drop_nulls().to_numpy()
|
|
454
|
+
for cat in categories
|
|
455
|
+
]
|
|
456
|
+
non_empty = [g for g in groups if len(g) > 0]
|
|
457
|
+
if len(non_empty) < 2:
|
|
458
|
+
pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
|
|
459
|
+
continue
|
|
460
|
+
try:
|
|
461
|
+
f_oneway(*non_empty)
|
|
462
|
+
ss_between = sum(
|
|
463
|
+
len(g) * (float(g.mean()) - grand_mean) ** 2
|
|
464
|
+
for g in non_empty
|
|
465
|
+
)
|
|
466
|
+
eta_sq = ss_between / ss_total if ss_total > 0 else 0.0
|
|
467
|
+
eta_sq = max(0.0, min(1.0, eta_sq))
|
|
468
|
+
except Exception as exc:
|
|
469
|
+
warnings.warn(
|
|
470
|
+
f"Eta-squared failed for ({num_col}, {cat_col}): {exc}",
|
|
471
|
+
stacklevel=3,
|
|
472
|
+
)
|
|
473
|
+
pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
|
|
474
|
+
continue
|
|
475
|
+
|
|
476
|
+
pairs.append(EtaSquaredPair(
|
|
477
|
+
numeric_col=num_col, categorical_col=cat_col,
|
|
478
|
+
eta_squared=eta_sq,
|
|
479
|
+
near_redundant=eta_sq > threshold,
|
|
480
|
+
))
|
|
481
|
+
|
|
482
|
+
return pairs
|
|
483
|
+
|
|
319
484
|
# ------------------------------------------------------------------
|
|
320
485
|
# Step 5a: Feature–target Pearson (unchanged)
|
|
321
486
|
# ------------------------------------------------------------------
|
{dataforge_ml-0.4.0 → dataforge_ml-0.6.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
@@ -207,7 +207,8 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
207
207
|
profile.effective_null_ratio = eff_count / n_rows if n_rows else 0.0
|
|
208
208
|
|
|
209
209
|
r = profile.effective_null_ratio
|
|
210
|
-
|
|
210
|
+
|
|
211
|
+
if r < _SEVERITY_MINOR and r != 0:
|
|
211
212
|
profile.severity = MissingSeverity.Minor
|
|
212
213
|
elif r < _SEVERITY_MODERATE:
|
|
213
214
|
profile.severity = MissingSeverity.Moderate
|
|
@@ -50,7 +50,6 @@ from ._numeric_config import (
|
|
|
50
50
|
NumericTopValueEntry,
|
|
51
51
|
HistogramBin,
|
|
52
52
|
)
|
|
53
|
-
from ..models._data_types import _NUMERIC_DTYPES
|
|
54
53
|
|
|
55
54
|
# ---------------------------------------------------------------------------
|
|
56
55
|
# Thresholds (documented so callers can see what drives labels / flags)
|
|
@@ -119,7 +118,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
119
118
|
if override is not None:
|
|
120
119
|
return False
|
|
121
120
|
|
|
122
|
-
return
|
|
121
|
+
return True
|
|
123
122
|
|
|
124
123
|
def _run(
|
|
125
124
|
self,
|
|
@@ -127,9 +126,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
127
126
|
columns: list[str],
|
|
128
127
|
) -> NumericProfileResult:
|
|
129
128
|
result = NumericProfileResult()
|
|
130
|
-
|
|
131
129
|
n_rows = df.height
|
|
132
|
-
|
|
130
|
+
|
|
133
131
|
available = [
|
|
134
132
|
c
|
|
135
133
|
for c in self._resolve_columns(df.columns, columns)
|
|
@@ -137,15 +135,78 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
137
135
|
]
|
|
138
136
|
result.analysed_columns = available
|
|
139
137
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
138
|
+
if not available:
|
|
139
|
+
return result
|
|
140
|
+
|
|
141
|
+
# One df.select([...]) for all scalar stats across all columns so
|
|
142
|
+
# Polars can parallelise expression evaluation rather than running
|
|
143
|
+
# independent query plans per column.
|
|
144
|
+
exprs: list[pl.Expr] = []
|
|
145
|
+
for col in available:
|
|
146
|
+
c = pl.col(col).cast(pl.Float64, strict=False)
|
|
147
|
+
exprs.append(c.mean().alias(f"{col}__mean"))
|
|
148
|
+
exprs.append(c.median().alias(f"{col}__median"))
|
|
149
|
+
exprs.append(c.min().alias(f"{col}__min"))
|
|
150
|
+
exprs.append(c.max().alias(f"{col}__max"))
|
|
151
|
+
exprs.append(c.std(ddof=1).alias(f"{col}__std"))
|
|
152
|
+
for q in _QUANTILE_LEVELS:
|
|
153
|
+
exprs.append(
|
|
154
|
+
c.quantile(q, interpolation="linear").alias(f"{col}__q{q}")
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
batch = df.select(exprs).row(0, named=True)
|
|
158
|
+
|
|
159
|
+
for col in available:
|
|
160
|
+
series = df[col]
|
|
161
|
+
f64 = series.cast(pl.Float64, strict=False)
|
|
162
|
+
clean = f64.drop_nulls()
|
|
163
|
+
profile = NumericStats()
|
|
164
|
+
|
|
165
|
+
if clean.len() == 0:
|
|
166
|
+
result.columns[col] = profile
|
|
167
|
+
continue
|
|
168
|
+
|
|
169
|
+
# Central tendency
|
|
170
|
+
mean = float(batch[f"{col}__mean"])
|
|
171
|
+
median = float(batch[f"{col}__median"])
|
|
172
|
+
profile.mean = mean
|
|
173
|
+
profile.median = median
|
|
174
|
+
if median == 0.0:
|
|
175
|
+
profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
|
|
176
|
+
else:
|
|
177
|
+
profile.mean_median_ratio = mean / median
|
|
178
|
+
|
|
179
|
+
# Range
|
|
180
|
+
profile.min = float(batch[f"{col}__min"])
|
|
181
|
+
profile.max = float(batch[f"{col}__max"])
|
|
182
|
+
|
|
183
|
+
# Spread — Polars returns null for std with ddof=1 on a single row
|
|
184
|
+
std_val = batch[f"{col}__std"]
|
|
185
|
+
profile.std = float(std_val) if std_val is not None else 0.0
|
|
186
|
+
profile.variance = profile.std ** 2
|
|
187
|
+
|
|
188
|
+
# Percentiles
|
|
189
|
+
q_vals = [batch[f"{col}__q{q}"] for q in _QUANTILE_LEVELS]
|
|
190
|
+
profile.percentiles = PercentileSnapshot(
|
|
191
|
+
p1=q_vals[0], p5=q_vals[1], p25=q_vals[2], p50=q_vals[3],
|
|
192
|
+
p75=q_vals[4], p95=q_vals[5], p99=q_vals[6],
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# Frequency / distribution stays per-column (returns a frame, not a scalar)
|
|
196
|
+
self._compute_frequency_and_distribution(series, clean, profile, n_rows)
|
|
197
|
+
|
|
198
|
+
# Shape stays per-column (delegates to scipy on a numpy array)
|
|
199
|
+
self._compute_shape(clean, profile)
|
|
200
|
+
|
|
201
|
+
self._check_scale_anomaly(profile)
|
|
202
|
+
|
|
203
|
+
result.columns[col] = profile
|
|
144
204
|
|
|
145
205
|
return result
|
|
146
206
|
|
|
147
207
|
# ------------------------------------------------------------------
|
|
148
|
-
# Per-column
|
|
208
|
+
# Per-column helpers (frequency/distribution and shape only —
|
|
209
|
+
# scalar stats are now batched in _run above)
|
|
149
210
|
# ------------------------------------------------------------------
|
|
150
211
|
|
|
151
212
|
@staticmethod
|
|
@@ -196,7 +257,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
196
257
|
# --- 20-Bin Histogram Distribution (Continuous) ---
|
|
197
258
|
import numpy as np
|
|
198
259
|
|
|
199
|
-
counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins=20)
|
|
260
|
+
counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
|
|
200
261
|
profile.histogram = [
|
|
201
262
|
HistogramBin(
|
|
202
263
|
lower_bound=float(bin_edges[i]),
|
|
@@ -207,73 +268,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
207
268
|
for i in range(len(counts))
|
|
208
269
|
]
|
|
209
270
|
|
|
210
|
-
def _profile_column(
|
|
211
|
-
self,
|
|
212
|
-
series: pl.Series,
|
|
213
|
-
n_rows: int,
|
|
214
|
-
) -> NumericStats:
|
|
215
|
-
profile = NumericStats()
|
|
216
|
-
|
|
217
|
-
f64 = series.cast(pl.Float64)
|
|
218
|
-
clean = f64.drop_nulls()
|
|
219
|
-
|
|
220
|
-
if clean.len() == 0:
|
|
221
|
-
return profile
|
|
222
|
-
|
|
223
|
-
self._compute_central_tendency(clean, profile)
|
|
224
|
-
self._compute_range(clean, profile)
|
|
225
|
-
self._compute_frequency_and_distribution(series, clean, profile, n_rows)
|
|
226
|
-
self._compute_percentiles(clean, profile)
|
|
227
|
-
self._compute_spread(clean, profile)
|
|
228
|
-
self._compute_shape(clean, profile)
|
|
229
|
-
self._check_scale_anomaly(profile)
|
|
230
|
-
|
|
231
|
-
return profile
|
|
232
|
-
|
|
233
271
|
# ------------------------------------------------------------------
|
|
234
|
-
# Step 1: Central tendency
|
|
235
|
-
# ------------------------------------------------------------------
|
|
236
|
-
|
|
237
|
-
@staticmethod
|
|
238
|
-
def _compute_central_tendency(
|
|
239
|
-
clean: pl.Series,
|
|
240
|
-
profile: NumericStats,
|
|
241
|
-
) -> None:
|
|
242
|
-
mean = float(clean.mean()) # type: ignore[arg-type]
|
|
243
|
-
median = float(clean.median()) # type: ignore[arg-type]
|
|
244
|
-
|
|
245
|
-
profile.mean = mean
|
|
246
|
-
profile.median = median
|
|
247
|
-
|
|
248
|
-
# Mean/median ratio: primary skew indicator at a glance.
|
|
249
|
-
# Guard against division by zero (e.g. a column of all zeros).
|
|
250
|
-
if median == 0.0:
|
|
251
|
-
profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
|
|
252
|
-
else:
|
|
253
|
-
profile.mean_median_ratio = mean / median
|
|
254
|
-
|
|
255
|
-
# ------------------------------------------------------------------
|
|
256
|
-
# Step 2: Spread
|
|
257
|
-
# ------------------------------------------------------------------
|
|
258
|
-
|
|
259
|
-
@staticmethod
|
|
260
|
-
def _compute_spread(
|
|
261
|
-
clean: pl.Series,
|
|
262
|
-
profile: NumericStats,
|
|
263
|
-
) -> None:
|
|
264
|
-
n = clean.len()
|
|
265
|
-
if n < 2:
|
|
266
|
-
# Std / variance undefined for a single observation
|
|
267
|
-
profile.std = 0.0
|
|
268
|
-
profile.variance = 0.0
|
|
269
|
-
return
|
|
270
|
-
|
|
271
|
-
std = float(clean.std(ddof=1)) # type: ignore[arg-type]
|
|
272
|
-
profile.std = std
|
|
273
|
-
profile.variance = std**2
|
|
274
|
-
|
|
275
|
-
# ------------------------------------------------------------------
|
|
276
|
-
# Step 3: Shape — skewness and kurtosis
|
|
272
|
+
# Step 2: Shape — skewness and kurtosis
|
|
277
273
|
# ------------------------------------------------------------------
|
|
278
274
|
|
|
279
275
|
@staticmethod
|
|
@@ -315,48 +311,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
315
311
|
profile.kurtosis_tag = KurtosisTag.Mesokurtic
|
|
316
312
|
|
|
317
313
|
# ------------------------------------------------------------------
|
|
318
|
-
# Step 4: Range
|
|
319
|
-
# ------------------------------------------------------------------
|
|
320
|
-
|
|
321
|
-
@staticmethod
|
|
322
|
-
def _compute_range(
|
|
323
|
-
clean: pl.Series,
|
|
324
|
-
profile: NumericStats,
|
|
325
|
-
) -> None:
|
|
326
|
-
profile.min = float(clean.min()) # type: ignore[arg-type]
|
|
327
|
-
profile.max = float(clean.max()) # type: ignore[arg-type]
|
|
328
|
-
|
|
329
|
-
# ------------------------------------------------------------------
|
|
330
|
-
# Step 5: Percentiles
|
|
331
|
-
# ------------------------------------------------------------------
|
|
332
|
-
|
|
333
|
-
@staticmethod
|
|
334
|
-
def _compute_percentiles(
|
|
335
|
-
clean: pl.Series,
|
|
336
|
-
profile: NumericStats,
|
|
337
|
-
) -> None:
|
|
338
|
-
# Polars quantile() is O(n log n) once; compute all at once via select
|
|
339
|
-
# to avoid repeated passes.
|
|
340
|
-
quantile_frame = pl.DataFrame({"v": clean}).select(
|
|
341
|
-
[
|
|
342
|
-
pl.col("v").quantile(q, interpolation="linear").alias(f"q{i}")
|
|
343
|
-
for i, q in enumerate(_QUANTILE_LEVELS)
|
|
344
|
-
]
|
|
345
|
-
)
|
|
346
|
-
row = quantile_frame.row(0)
|
|
347
|
-
# row order: p1, p5, p25, p50, p75, p95, p99
|
|
348
|
-
profile.percentiles = PercentileSnapshot(
|
|
349
|
-
p1=row[0],
|
|
350
|
-
p5=row[1],
|
|
351
|
-
p25=row[2],
|
|
352
|
-
p50=row[3],
|
|
353
|
-
p75=row[4],
|
|
354
|
-
p95=row[5],
|
|
355
|
-
p99=row[6],
|
|
356
|
-
)
|
|
357
|
-
|
|
358
|
-
# ------------------------------------------------------------------
|
|
359
|
-
# Step 6: Scale-anomaly flag
|
|
314
|
+
# Step 3: Scale-anomaly flag
|
|
360
315
|
# ------------------------------------------------------------------
|
|
361
316
|
|
|
362
317
|
@staticmethod
|
|
@@ -148,9 +148,11 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
148
148
|
"""Generates numeric metrics and checks for target skewness."""
|
|
149
149
|
num_profiler = NumericProfiler(config=self.config)
|
|
150
150
|
|
|
151
|
-
|
|
151
|
+
col_name = series.name
|
|
152
|
+
num_result = num_profiler.profile(series.to_frame(), [col_name])
|
|
153
|
+
num_profile = num_result.columns.get(col_name)
|
|
152
154
|
result.numeric_profile = num_profile
|
|
153
155
|
|
|
154
156
|
# Flag Skewness (Highly skewed targets often require Log/Yeo-Johnson transforms)
|
|
155
|
-
if num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
|
|
157
|
+
if num_profile and num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
|
|
156
158
|
result.flags.append(TargetFlag.HighlySkewed)
|
|
@@ -357,16 +357,17 @@ class TypeDetector:
|
|
|
357
357
|
return
|
|
358
358
|
|
|
359
359
|
if series.dtype in (pl.Utf8, pl.String):
|
|
360
|
-
|
|
361
|
-
if
|
|
360
|
+
non_null = series.drop_nulls()
|
|
361
|
+
if non_null.len() == 0:
|
|
362
362
|
return
|
|
363
363
|
|
|
364
|
-
median_length =
|
|
364
|
+
median_length = non_null.str.len_chars().median()
|
|
365
|
+
if median_length is not None and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH:
|
|
366
|
+
return
|
|
365
367
|
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
):
|
|
368
|
+
# Real identifiers are single tokens — no spaces.
|
|
369
|
+
# Sentences and descriptions have median_spaces > 0.
|
|
370
|
+
if float(non_null.str.count_matches(r"\s+").median() or 0.0) > 0:
|
|
370
371
|
return
|
|
371
372
|
|
|
372
373
|
info.flags.append(TypeFlag.IdentifierColumn)
|
|
@@ -199,7 +199,7 @@ class StructuralProfiler:
|
|
|
199
199
|
|
|
200
200
|
# 8a. Feature-feature matrices — computed ONCE, target-independent.
|
|
201
201
|
feature_corr = corr_profiler.profile_features(
|
|
202
|
-
data, numeric_cols
|
|
202
|
+
data, numeric_cols, categorical_cols
|
|
203
203
|
)
|
|
204
204
|
result.dataset.feature_correlation = feature_corr
|
|
205
205
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -15,6 +15,8 @@ Requires-Dist: polars>=1.0.0
|
|
|
15
15
|
Requires-Dist: scikit-learn>=1.0.0
|
|
16
16
|
Requires-Dist: scipy>=1.10.0
|
|
17
17
|
Requires-Dist: numpy>=2.0.0
|
|
18
|
+
Requires-Dist: pandas>=2.0.0
|
|
19
|
+
Requires-Dist: chardet>=5.0.0
|
|
18
20
|
Provides-Extra: dev
|
|
19
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
20
22
|
Dynamic: license-file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|