dataforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0.dist-info/METADATA +34 -0
- dataforge_ml-0.1.0.dist-info/RECORD +54 -0
- dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
- dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
- models/__init__.py +0 -0
- models/_data_structure.py +7 -0
- models/_data_types.py +12 -0
- profiling/__init__.py +35 -0
- profiling/_base.py +101 -0
- profiling/_boolean_config.py +37 -0
- profiling/_boolean_profiler.py +191 -0
- profiling/_categorical.py +315 -0
- profiling/_categorical_config.py +87 -0
- profiling/_correlation_config.py +225 -0
- profiling/_correlation_profiler.py +544 -0
- profiling/_datetime_config.py +98 -0
- profiling/_datetime_profiler.py +406 -0
- profiling/_missingness_config.py +137 -0
- profiling/_missingness_profiler.py +252 -0
- profiling/_numeric_config.py +116 -0
- profiling/_numeric_profiler.py +403 -0
- profiling/_tabular.py +249 -0
- profiling/_target_config.py +74 -0
- profiling/_target_profiler.py +156 -0
- profiling/_text_config.py +40 -0
- profiling/_text_profiler.py +194 -0
- profiling/_type_detector.py +463 -0
- profiling/config.py +236 -0
- profiling/structural.py +280 -0
- splitting/__init__.py +4 -0
- splitting/_config.py +56 -0
- splitting/_splitter.py +202 -0
- tests/__init__.py +0 -0
- tests/conftest.py +7 -0
- tests/integration/__init__.py +0 -0
- tests/integration/conftest.py +82 -0
- tests/integration/test_structural_end_to_end.py +219 -0
- tests/unit/__init__.py +0 -0
- tests/unit/profiling/__init__.py +0 -0
- tests/unit/profiling/conftest.py +81 -0
- tests/unit/profiling/test_boolean_profiler.py +91 -0
- tests/unit/profiling/test_categorical_profiler.py +182 -0
- tests/unit/profiling/test_correlation_profiler.py +124 -0
- tests/unit/profiling/test_datetime_profiler.py +133 -0
- tests/unit/profiling/test_missingness_profiler.py +51 -0
- tests/unit/profiling/test_numeric_profiler.py +212 -0
- tests/unit/profiling/test_target_profiler.py +44 -0
- tests/unit/profiling/test_text_profiler.py +61 -0
- tests/unit/profiling/test_type_detector.py +32 -0
- tests/unit/splitting/__init__.py +0 -0
- tests/unit/splitting/test_data_splitter.py +417 -0
- utils/__init__.py +0 -0
- utils/data_loader.py +110 -0
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MissingnessProfiler – Phase 1 extension: Missingness Profiling.
|
|
3
|
+
|
|
4
|
+
Eligibility model
|
|
5
|
+
-----------------
|
|
6
|
+
Effective-null detection is based on **dtype first**, with SemanticType
|
|
7
|
+
overrides acting only as suppressors, not as enablers:
|
|
8
|
+
|
|
9
|
+
sentinel-string detection → runs when dtype is Utf8/String
|
|
10
|
+
suppressed if override is Numeric / Datetime / Boolean
|
|
11
|
+
(those types cannot have meaningful sentinel strings)
|
|
12
|
+
|
|
13
|
+
Inf / NaN expansion → runs when dtype is Float32/Float64
|
|
14
|
+
never suppressed (Inf in a float column is always
|
|
15
|
+
effectively missing regardless of semantic label)
|
|
16
|
+
|
|
17
|
+
column_overrides is SPARSE — most columns will have no entry.
|
|
18
|
+
Absence of an override is not a signal; it means "trust the dtype".
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
import polars as pl
|
|
25
|
+
|
|
26
|
+
from ._base import DatasetLevelProfiler
|
|
27
|
+
from .config import ProfileConfig, SemanticType
|
|
28
|
+
from ._missingness_config import (
|
|
29
|
+
ColumnMissingnessProfile,
|
|
30
|
+
MissingnessFlag,
|
|
31
|
+
MissingnessProfileResult,
|
|
32
|
+
MissingSeverity,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
# Thresholds
# ---------------------------------------------------------------------------

# Effective-null ratio cut-offs mapping to MissingSeverity:
# ratio < MINOR → Minor, < MODERATE → Moderate, < HIGH → High, else Severe.
_SEVERITY_MINOR = 0.01
_SEVERITY_MODERATE = 0.05
_SEVERITY_HIGH = 0.20

# Pearson r between two missingness indicators above which the columns are
# flagged as MAR suspects (missing-at-random, co-occurring gaps).
_MAR_CORRELATION_THRESHOLD = 0.60
# Effective-null ratio above which a column is flagged as a drop candidate.
_COL_DROP_THRESHOLD = 0.50

# Upper-cased string values treated as effectively missing in String columns.
_SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})

# Overrides that suppress sentinel-string detection on a String column.
# Per the module docstring, these semantics cannot carry meaningful sentinel
# strings, so a user override to any of them turns the detection OFF.
# Overrides such as Categorical or Text do NOT suppress it — sentinel
# detection still makes sense there and should run.
# NOTE(review): the module docstring lists only Numeric / Datetime / Boolean
# as suppressors; Identifier is additionally suppressed here — confirm intended.
_SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
    {
        SemanticType.Numeric,
        SemanticType.Datetime,
        SemanticType.Boolean,
        SemanticType.Identifier,
    }
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
    """True when sentinel-string detection should run for this column."""
    # Only String columns can carry sentinel strings at all.
    if dtype not in (pl.Utf8, pl.String):
        return False
    # A suppressing semantic override disables detection; ``override`` being
    # ``None`` (the common, sparse case) is never a member, so it falls
    # through to "trust the dtype" → run detection.
    return override not in _SENTINEL_SUPPRESSING_SEMANTICS
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _inf_eligible(dtype: pl.DataType) -> bool:
    """True when Inf/NaN expansion should run. Always dtype-driven, never suppressed."""
    return dtype == pl.Float32 or dtype == pl.Float64
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
    """
    Missingness profiler for Polars DataFrames.

    Column scoping
    --------------
    Resolution priority (high → low):
    1. Explicit ``columns`` argument to ``profile()``.
    2. ``config.exclude_columns`` — always removed.
    3. All remaining DataFrame columns.
    """

    def __init__(self, config: ProfileConfig | None = None) -> None:
        super().__init__(config)
        self._config: ProfileConfig = config or ProfileConfig()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str] | None = None,
    ) -> MissingnessProfileResult:
        """Profile missingness of *data* over the resolved column scope.

        Parameters
        ----------
        data : pl.DataFrame
            Frame to analyse.
        columns : list[str] | None
            Explicit scope; ``None`` means "all columns minus exclusions".
        """
        # BUGFIX: previously ``columns`` was forwarded unresolved, so
        # ``profile(df)`` with no explicit columns returned an empty result
        # (``not None`` is truthy in _run) instead of profiling all columns
        # as the class docstring promises.
        return self._run(data, self._resolve_columns(data, columns))

    # ------------------------------------------------------------------
    # Scope resolution
    # ------------------------------------------------------------------

    def _resolve_columns(
        self,
        df: pl.DataFrame,
        columns: list[str] | None,
    ) -> list[str]:
        """Apply the documented scoping priority and return columns to profile."""
        # ``exclude_columns`` is "always removed", even from an explicit list.
        # getattr guard keeps configs without the attribute working unchanged.
        excluded = set(getattr(self._config, "exclude_columns", None) or ())
        candidates = columns if columns is not None else df.columns
        present = set(df.columns)  # O(1) membership
        # Preserve caller/frame order; silently drop names absent from the frame.
        return [c for c in candidates if c in present and c not in excluded]

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _run(self, df: pl.DataFrame, cols: list[str]) -> MissingnessProfileResult:
        """Profile each column in *cols*, then derive dataset-level findings."""
        result = MissingnessProfileResult()
        result.analysed_columns = cols
        n_rows = df.height

        if n_rows == 0 or not cols:
            return result  # nothing to measure

        overrides = self._config.column_overrides  # sparse — most keys absent
        indicator_cols: list[pl.Series] = []

        for col_name in cols:
            override = overrides.get(col_name)  # None for most columns
            col_profile, indicator = self._profile_column(
                series=df[col_name],
                col_name=col_name,
                n_rows=n_rows,
                override=override,
            )
            result.columns[col_name] = col_profile
            indicator_cols.append(indicator)

            ratio = col_profile.effective_null_ratio
            if ratio == 1.0:
                result.fully_null_columns.append(col_name)
                col_profile.flags.append(MissingnessFlag.FullyNull)
            elif ratio > _COL_DROP_THRESHOLD:
                col_profile.flags.append(MissingnessFlag.DropCandidate)

        # ── Missingness correlation matrix ────────────────────────────
        cols_with_missing = [
            c for c in cols if result.columns[c].effective_null_count > 0
        ]
        if len(cols_with_missing) >= 2:
            with_missing = set(cols_with_missing)  # O(1) membership below
            indicator_frame = pl.DataFrame(
                {s.name: s for s in indicator_cols if s.name in with_missing}
            )
            corr_matrix = self._compute_correlation_matrix(
                indicator_frame, cols_with_missing
            )
            result.correlation_matrix = corr_matrix

            for col_a in cols_with_missing:
                mar_peers = [
                    col_b
                    for col_b, r in corr_matrix.get(col_a, {}).items()
                    if col_b != col_a and r > _MAR_CORRELATION_THRESHOLD
                ]
                if mar_peers:
                    result.columns[col_a].correlated_with = mar_peers
                    if MissingnessFlag.MARSuspect not in result.columns[col_a].flags:
                        result.columns[col_a].flags.append(MissingnessFlag.MARSuspect)

        return result

    # ------------------------------------------------------------------
    # Per-column profiling
    # ------------------------------------------------------------------

    @staticmethod
    def _profile_column(
        series: pl.Series,
        col_name: str,
        n_rows: int,
        override: SemanticType | None = None,  # sparse — None is the common case
    ) -> tuple[ColumnMissingnessProfile, pl.Series]:
        """
        Compute standard + effective null counts for one column.

        Eligibility is dtype-first:
        - sentinel strings → String dtype, unless override suppresses it
        - Inf/NaN → Float dtype, always (never suppressed)
        - everything else → standard Polars null only

        Returns the profile plus a 0/1 indicator series (named after the
        column) consumed by the correlation-matrix step.
        """
        profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
        dtype = series.dtype
        std_null = series.is_null()

        if _sentinel_eligible(dtype, override):
            # Whitespace-only strings and known sentinels count as missing too.
            eff_null = (
                std_null
                | (series.str.strip_chars() == "")
                | series.str.to_uppercase().is_in(list(_SENTINEL_STRINGS))
            )
        elif _inf_eligible(dtype):
            eff_null = std_null | series.is_nan() | series.is_infinite()
        else:
            eff_null = std_null

        std_count = int(std_null.sum())
        eff_count = int(eff_null.sum())

        profile.standard_null_count = std_count
        profile.effective_null_count = eff_count
        profile.standard_null_ratio = std_count / n_rows if n_rows else 0.0
        profile.effective_null_ratio = eff_count / n_rows if n_rows else 0.0

        # Severity buckets per the module-level thresholds.
        r = profile.effective_null_ratio
        if r < _SEVERITY_MINOR:
            profile.severity = MissingSeverity.Minor
        elif r < _SEVERITY_MODERATE:
            profile.severity = MissingSeverity.Moderate
        elif r < _SEVERITY_HIGH:
            profile.severity = MissingSeverity.High
        else:
            profile.severity = MissingSeverity.Severe

        indicator = eff_null.cast(pl.Int8).rename(col_name)
        return profile, indicator

    # ------------------------------------------------------------------
    # Correlation matrix
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_correlation_matrix(
        indicator_frame: pl.DataFrame,
        cols: list[str],
    ) -> dict[str, dict[str, float]]:
        """Pairwise Pearson correlation of 0/1 missingness indicators.

        Returns a symmetric nested dict with 1.0 on the diagonal. NaN/null
        correlations (e.g. a constant indicator) are coerced to 0.0 and all
        values are clamped to [-1, 1].
        """
        import itertools

        matrix: dict[str, dict[str, float]] = {c: {c: 1.0} for c in cols}
        if len(cols) < 2:
            return matrix

        pairs = list(itertools.combinations(cols, 2))
        exprs = [
            pl.corr(col_a, col_b, method="pearson")
            .fill_nan(0.0)
            .fill_null(0.0)
            .alias(f"{col_a}|{col_b}")
            for col_a, col_b in pairs
        ]
        result_row = indicator_frame.select(exprs).to_dicts()[0]

        for col_a, col_b in pairs:
            # Look up by alias instead of zipping over dict ordering, so a
            # pair can never be matched with the wrong correlation value.
            r = max(-1.0, min(1.0, float(result_row[f"{col_a}|{col_b}"])))
            matrix[col_a][col_b] = r
            matrix[col_b][col_a] = r

        return matrix
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Result dataclasses for numeric distribution profiling.
|
|
3
|
+
|
|
4
|
+
Populated by NumericProfiler, which is opt-in via
|
|
5
|
+
ProfileConfig.numeric_columns.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from enum import StrEnum
|
|
12
|
+
from typing import Optional, List
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class PercentileSnapshot:
    """Standard percentile cut points for one numeric column.

    All fields stay ``None`` until populated by the numeric profiler.
    Annotations modernized to ``float | None`` for consistency with the
    file's PEP 604 / builtin-generic style.
    """

    p1: float | None = None
    p5: float | None = None
    p25: float | None = None
    p50: float | None = None
    p75: float | None = None
    p95: float | None = None
    p99: float | None = None

    @property
    def iqr(self) -> float | None:
        """Interquartile range (p75 − p25), or ``None`` if either is unset."""
        if self.p25 is None or self.p75 is None:
            return None
        return self.p75 - self.p25
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SkewSeverity(StrEnum):
|
|
33
|
+
Normal = "normal"
|
|
34
|
+
Moderate = "moderate"
|
|
35
|
+
High = "high"
|
|
36
|
+
Severe = "severe"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class KurtosisTag(StrEnum):
|
|
40
|
+
Platykurtic = "platykurtic"
|
|
41
|
+
Mesokurtic = "mesokurtic"
|
|
42
|
+
Leptokurtic = "leptokurtic"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class NumericFlag(StrEnum):
|
|
46
|
+
ScaleAnomaly = "scale_anomaly"
|
|
47
|
+
NearConstant = "near_constant"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class NumericTopValueEntry:
    """One entry in a numeric column's most-frequent-values list."""

    value: float       # the observed value
    count: int         # absolute occurrence count
    percentage: float  # relative frequency as reported by the profiler
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class HistogramBin:
    """A single bin of a numeric column's histogram."""

    lower_bound: float  # inclusive lower edge
    upper_bound: float  # upper edge
    count: int          # rows falling into this bin
    percentage: float   # relative share as reported by the profiler
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class NumericStats:
    """Distribution statistics for a single numeric column.

    Optional fields stay ``None`` until the corresponding computation has
    run. Annotations modernized (``Optional[X]`` → ``X | None``,
    ``List`` → ``list``) for consistency with the rest of this file, which
    already uses ``from __future__ import annotations`` and builtin generics.
    """

    mean: float | None = None
    median: float | None = None
    mean_median_ratio: float | None = None
    mode: float | None = None
    mode_frequency: float = 0.0
    top_values: list[NumericTopValueEntry] = field(default_factory=list)
    histogram: list[HistogramBin] = field(default_factory=list)
    std: float | None = None
    variance: float | None = None
    min: float | None = None
    max: float | None = None
    percentiles: PercentileSnapshot = field(default_factory=PercentileSnapshot)
    skewness: float | None = None
    kurtosis: float | None = None
    skewness_severity: SkewSeverity | None = None
    kurtosis_tag: KurtosisTag | None = None
    flags: list[NumericFlag] = field(default_factory=list)

    @property
    def iqr(self) -> float | None:
        """Interquartile range, delegated to :attr:`percentiles`."""
        return self.percentiles.iqr

    def has_flag(self, flag: NumericFlag) -> bool:
        """Return ``True`` when *flag* has been recorded for this column."""
        return flag in self.flags


# Public alias: NumericProfileResult documents its per-column values as
# ColumnNumericProfile.
ColumnNumericProfile = NumericStats
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass
class NumericProfileResult:
    """
    Numeric distribution profile for all opted-in columns.

    Attributes
    ----------
    columns : dict[str, ColumnNumericProfile]
        Per-column profiles, keyed by column name.
    analysed_columns : list[str]
        Columns that were actually profiled (after schema intersection).
    """

    columns: dict[str, NumericStats] = field(default_factory=dict)
    analysed_columns: list[str] = field(default_factory=list)

    def __str__(self) -> str:  # pragma: no cover
        # One header line followed by each column profile's own rendering.
        header = "=== Numeric Distribution Profile ==="
        body = [str(profile) for profile in self.columns.values()]
        return "\n".join([header, *body])
|