dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,252 @@
1
+ """
2
+ MissingnessProfiler – Phase 1 extension: Missingness Profiling.
3
+
4
+ Eligibility model
5
+ -----------------
6
+ Effective-null detection is based on **dtype first**, with SemanticType
7
+ overrides acting only as suppressors, not as enablers:
8
+
9
+ sentinel-string detection → runs when dtype is Utf8/String
10
+ suppressed if override is Numeric / Datetime / Boolean / Identifier
11
+ (those types cannot have meaningful sentinel strings)
12
+
13
+ Inf / NaN expansion → runs when dtype is Float32/Float64
14
+ never suppressed (Inf in a float column is always
15
+ effectively missing regardless of semantic label)
16
+
17
+ column_overrides is SPARSE — most columns will have no entry.
18
+ Absence of an override is not a signal; it means "trust the dtype".
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+
24
+ import polars as pl
25
+
26
+ from ._base import DatasetLevelProfiler
27
+ from .config import ProfileConfig, SemanticType
28
+ from ._missingness_config import (
29
+ ColumnMissingnessProfile,
30
+ MissingnessFlag,
31
+ MissingnessProfileResult,
32
+ MissingSeverity,
33
+ )
34
+
35
# ---------------------------------------------------------------------------
# Thresholds
# ---------------------------------------------------------------------------

# Effective-null ratio cut-offs for the severity buckets (ratio < value).
_SEVERITY_MINOR = 0.01
_SEVERITY_MODERATE = 0.05
_SEVERITY_HIGH = 0.20

# |Pearson r| between missingness indicator columns above which a pair is
# flagged as MAR (missing-at-random) suspects.
_MAR_CORRELATION_THRESHOLD = 0.60
# Effective-null ratio above which a column becomes a drop candidate.
_COL_DROP_THRESHOLD = 0.50

# Sentinel values treated as effectively missing in String columns.
# Matching is case-insensitive (the profiler upper-cases before comparing).
_SENTINEL_STRINGS = frozenset({"NA", "NAN", "NULL", "NONE", "?"})

# Overrides that SUPPRESS sentinel-string detection on a String column.
# If the user declares a String column to be Numeric, Datetime, Boolean or
# Identifier, sentinel-string detection is skipped — per the module
# docstring, those semantics cannot have meaningful sentinel strings.
# Categorical or Text overrides (and the absence of any override) leave
# sentinel detection enabled.
_SENTINEL_SUPPRESSING_SEMANTICS = frozenset(
    {
        SemanticType.Numeric,
        SemanticType.Datetime,
        SemanticType.Boolean,
        SemanticType.Identifier,
    }
)
60
+
61
+
62
def _sentinel_eligible(dtype: pl.DataType, override: SemanticType | None) -> bool:
    """Return True when sentinel-string detection should run for this column.

    Only String/Utf8 columns qualify, and a semantic override from the
    suppressing set (Numeric / Datetime / Boolean / Identifier) disables it.
    """
    is_string = dtype in (pl.Utf8, pl.String)
    suppressed = override is not None and override in _SENTINEL_SUPPRESSING_SEMANTICS
    return is_string and not suppressed
70
+
71
+
72
def _inf_eligible(dtype: pl.DataType) -> bool:
    """Return True when Inf/NaN expansion should run.

    Purely dtype-driven (Float32/Float64) and never suppressed by overrides.
    """
    return dtype == pl.Float32 or dtype == pl.Float64
75
+
76
+
77
class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
    """
    Missingness profiler for Polars DataFrames.

    Computes standard and "effective" null counts per column — effective
    nulls additionally count sentinel strings on String columns and
    Inf/NaN on Float columns — assigns a severity bucket per column, and
    correlates missingness indicators across columns to flag MAR suspects.

    Column scoping
    --------------
    Resolution priority (high → low):
        1. Explicit ``columns`` argument to ``profile()``.
        2. ``config.exclude_columns`` — always removed.
        3. All remaining DataFrame columns.
    """

    def __init__(self, config: ProfileConfig | None = None) -> None:
        super().__init__(config)
        self._config: ProfileConfig = config or ProfileConfig()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str] | None = None,
    ) -> MissingnessProfileResult:
        """
        Profile missingness over ``data``.

        Parameters
        ----------
        data : pl.DataFrame
            Frame to analyse.
        columns : list[str] | None
            Explicit column subset; defaults to every frame column
            (minus ``config.exclude_columns``).
        """
        # BUG FIX: previously ``columns`` was forwarded to ``_run``
        # unresolved, so the default call (columns=None) hit the
        # ``not cols`` guard and profiled nothing, and the documented
        # ``config.exclude_columns`` rule was never applied.
        return self._run(data, self._resolve_columns(data, columns))

    # ------------------------------------------------------------------
    # Scope resolution
    # ------------------------------------------------------------------

    def _resolve_columns(
        self,
        df: pl.DataFrame,
        columns: list[str] | None,
    ) -> list[str]:
        """Apply the scoping rules documented on the class."""
        candidates = columns if columns is not None else list(df.columns)
        # ``exclude_columns`` may be absent/None on older configs; treat
        # either as "exclude nothing".
        excluded = set(getattr(self._config, "exclude_columns", None) or ())
        return [c for c in candidates if c not in excluded]

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _run(self, df: pl.DataFrame, cols: list[str]) -> MissingnessProfileResult:
        """Profile each column, then correlate missingness indicators."""
        result = MissingnessProfileResult()
        result.analysed_columns = cols
        n_rows = df.height

        if n_rows == 0 or not cols:
            return result  # nothing to analyse

        overrides = self._config.column_overrides  # sparse — most keys absent
        indicator_cols: list[pl.Series] = []

        for col_name in cols:
            override = overrides.get(col_name)  # None for most columns
            col_profile, indicator = self._profile_column(
                series=df[col_name],
                col_name=col_name,
                n_rows=n_rows,
                override=override,
            )
            result.columns[col_name] = col_profile
            indicator_cols.append(indicator)

            ratio = col_profile.effective_null_ratio
            if ratio == 1.0:
                result.fully_null_columns.append(col_name)
                col_profile.flags.append(MissingnessFlag.FullyNull)
            elif ratio > _COL_DROP_THRESHOLD:
                col_profile.flags.append(MissingnessFlag.DropCandidate)

        # ── Missingness correlation matrix ────────────────────────────
        cols_with_missing = [
            c for c in cols if result.columns[c].effective_null_count > 0
        ]
        if len(cols_with_missing) >= 2:
            missing_set = set(cols_with_missing)  # O(1) membership in the filter
            indicator_frame = pl.DataFrame(
                {s.name: s for s in indicator_cols if s.name in missing_set}
            )
            corr_matrix = self._compute_correlation_matrix(
                indicator_frame, cols_with_missing
            )
            result.correlation_matrix = corr_matrix

            for col_a in cols_with_missing:
                # Peers whose missingness co-occurs strongly → MAR suspect.
                mar_peers = [
                    col_b
                    for col_b, r in corr_matrix.get(col_a, {}).items()
                    if col_b != col_a and r > _MAR_CORRELATION_THRESHOLD
                ]
                if mar_peers:
                    result.columns[col_a].correlated_with = mar_peers
                    if MissingnessFlag.MARSuspect not in result.columns[col_a].flags:
                        result.columns[col_a].flags.append(MissingnessFlag.MARSuspect)

        return result

    # ------------------------------------------------------------------
    # Per-column profiling
    # ------------------------------------------------------------------

    @staticmethod
    def _profile_column(
        series: pl.Series,
        col_name: str,
        n_rows: int,
        override: SemanticType | None = None,  # sparse — None is the common case
    ) -> tuple[ColumnMissingnessProfile, pl.Series]:
        """
        Compute standard + effective null counts for one column.

        Eligibility is dtype-first:
          - sentinel strings → String dtype, unless override suppresses it
            (whitespace-only strings also count as effective nulls)
          - Inf/NaN → Float dtype, always (never suppressed)
          - everything else → standard Polars null only

        Returns the column profile plus an Int8 0/1 indicator Series
        (1 = effectively null) used for the correlation matrix.
        """
        profile = ColumnMissingnessProfile(column=col_name, total_rows=n_rows)
        dtype = series.dtype
        std_null = series.is_null()

        if _sentinel_eligible(dtype, override):
            eff_null = (
                std_null
                | (series.str.strip_chars() == "")
                | series.str.to_uppercase().is_in(list(_SENTINEL_STRINGS))
            )
        elif _inf_eligible(dtype):
            eff_null = std_null | series.is_nan() | series.is_infinite()
        else:
            eff_null = std_null

        std_count = int(std_null.sum())
        eff_count = int(eff_null.sum())

        profile.standard_null_count = std_count
        profile.effective_null_count = eff_count
        profile.standard_null_ratio = std_count / n_rows if n_rows else 0.0
        profile.effective_null_ratio = eff_count / n_rows if n_rows else 0.0

        # Severity buckets: [0, 1%) minor, [1%, 5%) moderate,
        # [5%, 20%) high, [20%, 100%] severe.
        r = profile.effective_null_ratio
        if r < _SEVERITY_MINOR:
            profile.severity = MissingSeverity.Minor
        elif r < _SEVERITY_MODERATE:
            profile.severity = MissingSeverity.Moderate
        elif r < _SEVERITY_HIGH:
            profile.severity = MissingSeverity.High
        else:
            profile.severity = MissingSeverity.Severe

        indicator = eff_null.cast(pl.Int8).rename(col_name)
        return profile, indicator

    # ------------------------------------------------------------------
    # Correlation matrix
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_correlation_matrix(
        indicator_frame: pl.DataFrame,
        cols: list[str],
    ) -> dict[str, dict[str, float]]:
        """
        Pairwise Pearson correlation of the 0/1 missingness indicators.

        Returns a symmetric nested dict with 1.0 on the diagonal.
        Zero-variance indicators yield NaN/null correlations, which are
        coerced to 0.0; results are clamped to [-1, 1] for float safety.
        """
        import itertools

        matrix: dict[str, dict[str, float]] = {c: {c: 1.0} for c in cols}
        if len(cols) < 2:
            return matrix

        pairs = list(itertools.combinations(cols, 2))
        exprs = [
            pl.corr(col_a, col_b, method="pearson")
            .fill_nan(0.0)
            .fill_null(0.0)
            .alias(f"{col_a}|{col_b}")
            for col_a, col_b in pairs
        ]
        # One select evaluates every pair in a single pass over the frame.
        result_row = indicator_frame.select(exprs).to_dicts()[0]

        # dicts preserve insertion order, so values() aligns with ``pairs``.
        for (col_a, col_b), r_value in zip(pairs, result_row.values()):
            r = max(-1.0, min(1.0, float(r_value)))
            matrix[col_a][col_b] = r
            matrix[col_b][col_a] = r

        return matrix
@@ -0,0 +1,116 @@
1
+ """
2
+ Result dataclasses for numeric distribution profiling.
3
+
4
+ Populated by NumericProfiler, which is opt-in via
5
+ ProfileConfig.numeric_columns.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from enum import StrEnum
12
+ from typing import Optional, List
13
+
14
+
15
@dataclass
class PercentileSnapshot:
    """Key percentiles of a numeric column; unset entries remain None."""

    p1: Optional[float] = None
    p5: Optional[float] = None
    p25: Optional[float] = None
    p50: Optional[float] = None
    p75: Optional[float] = None
    p95: Optional[float] = None
    p99: Optional[float] = None

    @property
    def iqr(self) -> Optional[float]:
        """Interquartile range (p75 - p25), or None if either quartile is unset."""
        if self.p25 is None or self.p75 is None:
            return None
        return self.p75 - self.p25
30
+
31
+
32
+ class SkewSeverity(StrEnum):
33
+ Normal = "normal"
34
+ Moderate = "moderate"
35
+ High = "high"
36
+ Severe = "severe"
37
+
38
+
39
+ class KurtosisTag(StrEnum):
40
+ Platykurtic = "platykurtic"
41
+ Mesokurtic = "mesokurtic"
42
+ Leptokurtic = "leptokurtic"
43
+
44
+
45
+ class NumericFlag(StrEnum):
46
+ ScaleAnomaly = "scale_anomaly"
47
+ NearConstant = "near_constant"
48
+
49
+
50
@dataclass
class NumericTopValueEntry:
    """One entry of a column's most-frequent-values list."""

    value: float        # the observed value
    count: int          # absolute occurrence count
    percentage: float   # share of rows holding this value
55
+
56
+
57
@dataclass
class HistogramBin:
    """A single bucket of a column histogram."""

    lower_bound: float  # inclusive/exclusive convention set by the profiler
    upper_bound: float
    count: int          # rows falling in [lower_bound, upper_bound)
    percentage: float   # count as a share of total rows
63
+
64
+
65
@dataclass
class NumericStats:
    """
    Full numeric-distribution profile for a single column.

    All statistics default to None/empty so a partially-populated profile
    (e.g. for an all-null column) is still a valid instance.
    """

    # Central tendency
    mean: Optional[float] = None
    median: Optional[float] = None
    mean_median_ratio: Optional[float] = None
    mode: Optional[float] = None
    mode_frequency: float = 0.0
    # Frequency structure
    top_values: list[NumericTopValueEntry] = field(default_factory=list)
    histogram: list[HistogramBin] = field(default_factory=list)
    # Dispersion
    std: Optional[float] = None
    variance: Optional[float] = None
    min: Optional[float] = None
    max: Optional[float] = None
    percentiles: PercentileSnapshot = field(default_factory=PercentileSnapshot)
    # Shape
    skewness: Optional[float] = None
    kurtosis: Optional[float] = None
    skewness_severity: Optional[SkewSeverity] = None
    kurtosis_tag: Optional[KurtosisTag] = None
    # Warnings raised during profiling
    flags: List[NumericFlag] = field(default_factory=list)

    @property
    def iqr(self) -> Optional[float]:
        """Interquartile range, delegated to the percentile snapshot."""
        return self.percentiles.iqr

    def has_flag(self, flag: NumericFlag) -> bool:
        """Return True when ``flag`` was recorded for this column."""
        return flag in self.flags


# Backwards-compatible alias: the per-column profile type is also exposed
# under a name matching the other profilers' ``Column*Profile`` convention.
ColumnNumericProfile = NumericStats
+
95
+
96
@dataclass
class NumericProfileResult:
    """
    Numeric distribution profile for all opted-in columns.

    Attributes
    ----------
    columns : dict[str, ColumnNumericProfile]
        Per-column profiles, keyed by column name.
    analysed_columns : list[str]
        Columns that were actually profiled (after schema intersection).
    """

    columns: dict[str, NumericStats] = field(default_factory=dict)
    analysed_columns: list[str] = field(default_factory=list)

    def __str__(self) -> str:  # pragma: no cover
        header = "=== Numeric Distribution Profile ==="
        body = [str(profile) for profile in self.columns.values()]
        return "\n".join([header, *body])