dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
profiling/_numeric_profiler.py ADDED
@@ -0,0 +1,403 @@
"""
NumericProfiler – Phase 1 extension: Numeric Distribution Profiling.

Per-column metrics (opt-in via ProfileConfig.numeric_columns):
1. Central tendency – mean, median, mean/median ratio
2. Spread – std, variance, IQR (Q3 – Q1)
3. Skewness & kurtosis – with severity/tag labels
4. Range – min, max
5. Percentile profile – p1, p5, p25, p50, p75, p95, p99
6. Scale-anomaly flag – values spanning 3+ orders of magnitude

Only numeric Polars dtypes are profiled; non-numeric columns in the list
are skipped, and a warning is emitted so the caller knows they were
ignored.

Integration
-----------
Add ``numeric_columns: list[str] | None`` to ProfileConfig, then call::

    from profiling._numeric_profiler import NumericProfiler

    num_profiler = NumericProfiler(config=cfg)
    num_result = num_profiler.profile(
        df,
        columns=["age", "income", "temperature"],
    )

Attach ``num_result`` to ``TabularProfileResult`` as
``result.numeric_profile``.
"""
from __future__ import annotations

import polars as pl

from ._base import ColumnBatchProfiler
from .config import (
    ProfileConfig,
    SemanticType,
)
from ._correlation_profiler import _INT_DTYPES
from ._numeric_config import (
    NumericProfileResult,
    NumericStats,
    PercentileSnapshot,
    KurtosisTag,
    NumericFlag,
    SkewSeverity,
    NumericTopValueEntry,
    HistogramBin,
)
from ..models._data_types import _NUMERIC_DTYPES
# ---------------------------------------------------------------------------
# Thresholds (documented so callers can see what drives labels / flags)
# ---------------------------------------------------------------------------

# Skewness severity bands (applied to |skewness|)
_SKEW_NORMAL = 0.5  # |skew| ≤ this → normal
_SKEW_MODERATE = 1.0  # |skew| ≤ this → moderate
_SKEW_HIGH = 2.0  # |skew| ≤ this → high
# |skew| > 2.0 → severe

# Excess kurtosis bands
_KURT_PLATY_UPPER = -1.0  # excess < this → platykurtic
_KURT_LEPTO_LOWER = 3.0  # excess > this → leptokurtic
# else → mesokurtic

# Scale-anomaly: flag when the max/min ratio spans ≥ 3 orders of magnitude
_SCALE_ORDERS_OF_MAGNITUDE = 3  # i.e. ratio ≥ 10^3

# Percentile quantile levels (in order)
_QUANTILE_LEVELS = (0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99)
_NEAR_CONSTANT_THRESHOLD = 0.90
_DISCRETE_MAX_UNIQUE = 20
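As a quick aside for readers of the thresholds above, here is a self-contained sketch of how the bands translate into labels. The helper names and plain-string labels are illustrative only; the real code assigns SkewSeverity / KurtosisTag enum members inside `_compute_shape` further down.

```python
# Illustrative restatement of the banding logic, standalone.
_SKEW_NORMAL, _SKEW_MODERATE, _SKEW_HIGH = 0.5, 1.0, 2.0
_KURT_PLATY_UPPER, _KURT_LEPTO_LOWER = -1.0, 3.0


def skew_band(skewness: float) -> str:
    abs_skew = abs(skewness)          # bands apply to the absolute value
    if abs_skew <= _SKEW_NORMAL:
        return "normal"
    if abs_skew <= _SKEW_MODERATE:
        return "moderate"
    if abs_skew <= _SKEW_HIGH:
        return "high"
    return "severe"


def kurtosis_tag(excess: float) -> str:
    if excess < _KURT_PLATY_UPPER:
        return "platykurtic"
    if excess > _KURT_LEPTO_LOWER:
        return "leptokurtic"
    return "mesokurtic"


print(skew_band(-1.4), kurtosis_tag(4.2))   # high leptokurtic
```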
class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
    """
    Numeric distribution profiler for Polars DataFrames.

    Parameters
    ----------
    config : ProfileConfig | None
        Shared profiling configuration.

    Notes
    -----
    Columns are selected per call via ``profile(df, columns=...)``.
    Non-numeric or absent columns are skipped with a warning; they do
    not raise.
    """
    def __init__(
        self,
        config: ProfileConfig | None = None,
    ) -> None:
        super().__init__(config)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str],
    ) -> NumericProfileResult:
        return self._run(data, columns)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _eligible(self, series: pl.Series) -> bool:
        override = self.config.column_overrides.get(series.name)
        if override == SemanticType.Numeric:
            return True

        if override is not None:
            return False

        return series.dtype in _NUMERIC_DTYPES

    def _run(
        self,
        df: pl.DataFrame,
        columns: list[str],
    ) -> NumericProfileResult:
        result = NumericProfileResult()

        n_rows = df.height
        # Intersect requested columns with the actual schema
        available = [
            c
            for c in self._resolve_columns(df.columns, columns)
            if self._eligible(df[c])
        ]
        result.analysed_columns = available

        for col_name in available:
            series = df[col_name]
            profile = self._profile_column(series, n_rows)
            result.columns[col_name] = profile

        return result
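A minimal usage sketch of the public API above. The frame, column names, and the assumption that `ProfileConfig()` can be constructed with defaults are all illustrative:

```python
import polars as pl

from profiling.config import ProfileConfig
from profiling._numeric_profiler import NumericProfiler

# Hypothetical frame: one numeric column, one string column.
df = pl.DataFrame({"age": [21, 34, 45, 52], "city": ["x", "y", "x", "z"]})

profiler = NumericProfiler(config=ProfileConfig())   # default config assumed constructible
result = profiler.profile(df, columns=["age", "city"])

# "city" has a string dtype and no override, so _eligible() filters it out.
print(result.analysed_columns)   # expected: ["age"]
```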
    # ------------------------------------------------------------------
    # Per-column driver
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_frequency_and_distribution(
        original_series: pl.Series,
        clean_f64: pl.Series,
        profile: NumericStats,
        n_rows: int,
    ) -> None:
        """
        Compute the mode and, depending on whether the feature is discrete
        or continuous, either top-10 value counts or a 20-bin histogram.
        """
        if clean_f64.len() == 0:
            return

        vc = clean_f64.value_counts(sort=True)
        col_name = clean_f64.name

        # --- Absolute mode frequency ---
        mode_val = float(vc[col_name][0])
        mode_count = int(vc["count"][0])
        mode_freq = mode_count / n_rows if n_rows > 0 else 0.0

        profile.mode = mode_val
        profile.mode_frequency = mode_freq

        if mode_freq > _NEAR_CONSTANT_THRESHOLD:
            profile.flags.append(NumericFlag.NearConstant)

        n_unique = vc.height
        is_discrete = (
            original_series.dtype in _INT_DTYPES or n_unique <= _DISCRETE_MAX_UNIQUE
        )

        if is_discrete:
            # --- Top-10 distribution (discrete) ---
            top_rows = min(10, n_unique)
            profile.top_values = [
                NumericTopValueEntry(
                    value=float(vc[col_name][i]),
                    count=int(vc["count"][i]),
                    percentage=int(vc["count"][i]) / n_rows if n_rows > 0 else 0.0,
                )
                for i in range(top_rows)
            ]
        else:
            # --- 20-bin histogram distribution (continuous) ---
            import numpy as np

            counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins=20)
            profile.histogram = [
                HistogramBin(
                    lower_bound=float(bin_edges[i]),
                    upper_bound=float(bin_edges[i + 1]),
                    count=int(counts[i]),
                    percentage=int(counts[i]) / n_rows if n_rows > 0 else 0.0,
                )
                for i in range(len(counts))
            ]
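The discrete/continuous split drives which distribution summary gets stored. A standalone sketch of the same decision on invented data follows; the real method additionally treats integer dtypes as discrete via `_INT_DTYPES`:

```python
import numpy as np
import polars as pl

_DISCRETE_MAX_UNIQUE = 20  # same threshold as above

rng = np.random.default_rng(0)
s = pl.Series("income", rng.lognormal(mean=10.0, sigma=1.0, size=500))

if s.n_unique() <= _DISCRETE_MAX_UNIQUE:
    # Few distinct values: keep exact top-10 value counts.
    print(s.value_counts(sort=True).head(10))
else:
    # Many distinct values: summarise with a fixed 20-bin histogram.
    counts, edges = np.histogram(s.to_numpy(), bins=20)
    print(counts.sum(), len(edges))   # 500 rows binned, 21 bin edges
```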
    def _profile_column(
        self,
        series: pl.Series,
        n_rows: int,
    ) -> NumericStats:
        profile = NumericStats()

        f64 = series.cast(pl.Float64)
        clean = f64.drop_nulls()

        if clean.len() == 0:
            return profile

        self._compute_central_tendency(clean, profile)
        self._compute_range(clean, profile)
        self._compute_frequency_and_distribution(series, clean, profile, n_rows)
        self._compute_percentiles(clean, profile)
        self._compute_spread(clean, profile)
        self._compute_shape(clean, profile)
        self._check_scale_anomaly(profile)

        return profile
    # ------------------------------------------------------------------
    # Step 1: Central tendency
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_central_tendency(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        mean = float(clean.mean())  # type: ignore[arg-type]
        median = float(clean.median())  # type: ignore[arg-type]

        profile.mean = mean
        profile.median = median

        # Mean/median ratio: primary skew indicator at a glance.
        # Guard against division by zero (e.g. a column of all zeros).
        if median == 0.0:
            profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
        else:
            profile.mean_median_ratio = mean / median
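A worked example of why the mean/median ratio is a useful first-glance skew indicator (numbers invented for illustration):

```python
import polars as pl

# A right-skewed "income" column: one large value drags the mean upward.
s = pl.Series([30_000.0, 35_000.0, 40_000.0, 45_000.0, 1_000_000.0])

mean, median = float(s.mean()), float(s.median())
print(mean, median, mean / median)   # 230000.0 40000.0 5.75
```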
    # ------------------------------------------------------------------
    # Step 2: Spread
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_spread(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        n = clean.len()
        if n < 2:
            # Std / variance undefined for a single observation
            profile.std = 0.0
            profile.variance = 0.0
            return

        std = float(clean.std(ddof=1))  # type: ignore[arg-type]
        profile.std = std
        profile.variance = std**2
    # ------------------------------------------------------------------
    # Step 3: Shape — skewness and kurtosis
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_shape(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        from scipy.stats import skew, kurtosis as scipy_kurtosis

        if clean.len() < 3:
            return

        if profile.std is None or profile.std == 0.0:
            profile.skewness = 0.0
            profile.kurtosis = 0.0
            profile.skewness_severity = SkewSeverity.Normal
            profile.kurtosis_tag = KurtosisTag.Mesokurtic
            return

        arr = clean.to_numpy()
        profile.skewness = float(skew(arr, bias=False))
        profile.kurtosis = float(scipy_kurtosis(arr, bias=False))

        abs_skew = abs(profile.skewness)
        if abs_skew <= _SKEW_NORMAL:
            profile.skewness_severity = SkewSeverity.Normal
        elif abs_skew <= _SKEW_MODERATE:
            profile.skewness_severity = SkewSeverity.Moderate
        elif abs_skew <= _SKEW_HIGH:
            profile.skewness_severity = SkewSeverity.High
        else:
            profile.skewness_severity = SkewSeverity.Severe

        if profile.kurtosis < _KURT_PLATY_UPPER:
            profile.kurtosis_tag = KurtosisTag.Platykurtic
        elif profile.kurtosis > _KURT_LEPTO_LOWER:
            profile.kurtosis_tag = KurtosisTag.Leptokurtic
        else:
            profile.kurtosis_tag = KurtosisTag.Mesokurtic
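A minimal sketch of the same bias-corrected `scipy.stats` calls on a synthetic right-skewed sample. The data is random, so the printed values are indicative only:

```python
import numpy as np
from scipy.stats import kurtosis, skew

rng = np.random.default_rng(42)
arr = rng.lognormal(mean=0.0, sigma=1.0, size=10_000)

g1 = float(skew(arr, bias=False))        # sample-size corrected skewness
g2 = float(kurtosis(arr, bias=False))    # excess kurtosis (Fisher definition)

print(g1 > 2.0)   # lognormal data is strongly right-skewed: "severe" band
print(g2 > 3.0)   # heavy tails: "leptokurtic" tag
```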
    # ------------------------------------------------------------------
    # Step 4: Range
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_range(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        profile.min = float(clean.min())  # type: ignore[arg-type]
        profile.max = float(clean.max())  # type: ignore[arg-type]
    # ------------------------------------------------------------------
    # Step 5: Percentiles
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_percentiles(
        clean: pl.Series,
        profile: NumericStats,
    ) -> None:
        # Compute all quantile levels in a single select so the series is
        # handed to Polars once, rather than via repeated per-quantile calls.
        quantile_frame = pl.DataFrame({"v": clean}).select(
            [
                pl.col("v").quantile(q, interpolation="linear").alias(f"q{i}")
                for i, q in enumerate(_QUANTILE_LEVELS)
            ]
        )
        row = quantile_frame.row(0)
        # row order: p1, p5, p25, p50, p75, p95, p99
        profile.percentiles = PercentileSnapshot(
            p1=row[0],
            p5=row[1],
            p25=row[2],
            p50=row[3],
            p75=row[4],
            p95=row[5],
            p99=row[6],
        )
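A standalone sketch of the single-select quantile pattern used above. The column name `v` and the levels mirror the code; the expected output assumes Polars' linear interpolation on a 1..100 range:

```python
import polars as pl

_QUANTILE_LEVELS = (0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99)

clean = pl.Series("v", list(range(1, 101)))   # 1..100, no nulls

row = (
    pl.DataFrame({"v": clean})
    .select(
        [
            pl.col("v").quantile(q, interpolation="linear").alias(f"q{i}")
            for i, q in enumerate(_QUANTILE_LEVELS)
        ]
    )
    .row(0)
)
print(row)   # roughly (1.99, 5.95, 25.75, 50.5, 75.25, 95.05, 99.01)
```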
    # ------------------------------------------------------------------
    # Step 6: Scale-anomaly flag
    # ------------------------------------------------------------------

    @staticmethod
    def _check_scale_anomaly(
        profile: NumericStats,
    ) -> None:
        """
        Flag when values span ≥ 3 orders of magnitude *on the positive side*.

        Rationale: a column with values like [0.002, 15000] almost certainly
        mixes units or scales, which will mislead distance-based models.

        We use the absolute-value range to handle columns that cross zero
        (e.g. log-returns that go from -0.05 to 500). Columns whose
        entire range is within [-1, 1] are exempt (percentages, probabilities).
        """
        col_min = profile.min
        col_max = profile.max

        if col_min is None or col_max is None:
            return

        abs_min = abs(col_min)
        abs_max = abs(col_max)

        # Skip columns whose maximum absolute value is zero (nothing to
        # scale against)
        if abs_max == 0.0:
            return

        # Exempt probability / ratio columns
        if abs_max <= 1.0 and abs_min <= 1.0:
            return

        # Compute orders of magnitude
        if abs_min == 0.0:
            # Any non-zero max with a zero minimum → infinite ratio →
            # conservatively flag if max is large enough to be suspicious.
            if abs_max >= 10**_SCALE_ORDERS_OF_MAGNITUDE:
                profile.flags.append(NumericFlag.ScaleAnomaly)
            return

        ratio = abs_max / abs_min
        if ratio >= 10**_SCALE_ORDERS_OF_MAGNITUDE:
            profile.flags.append(NumericFlag.ScaleAnomaly)
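A worked restatement of the ratio test, using the numbers from the docstring's rationale. The helper function below is illustrative only:

```python
_SCALE_ORDERS_OF_MAGNITUDE = 3


def spans_scale_anomaly(col_min: float, col_max: float) -> bool:
    """Illustrative restatement of the ratio test above."""
    abs_min, abs_max = abs(col_min), abs(col_max)
    if abs_max == 0.0 or (abs_max <= 1.0 and abs_min <= 1.0):
        return False
    if abs_min == 0.0:
        return abs_max >= 10**_SCALE_ORDERS_OF_MAGNITUDE
    return abs_max / abs_min >= 10**_SCALE_ORDERS_OF_MAGNITUDE


print(spans_scale_anomaly(0.002, 15_000.0))   # True: spans ~6.9 orders of magnitude
print(spans_scale_anomaly(0.2, 0.9))          # False: exempt, range within [-1, 1]
print(spans_scale_anomaly(5.0, 500.0))        # False: only 2 orders of magnitude
```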
profiling/_tabular.py ADDED
@@ -0,0 +1,249 @@
"""
TabularProfiler – Phase 1: Structural Profiling for tabular datasets.

All DataFrame operations use Polars (no pandas dependency).

Computes:
• row / column count (always full dataset)
• memory usage + per-column breakdown when threshold exceeded
• duplicate row count & ratio (scoped to config.duplicate_columns)
• overall sparsity (scoped to config.sparsity_columns)
• data-type detection (scoped to config.type_detection_columns;
  skipped entirely when None)

Chunked processing is activated automatically when the DataFrame's
estimated memory exceeds config.memory_threshold_mb.
"""
from __future__ import annotations

import math

import polars as pl

from ._base import ModalityProfiler
from .config import (
    MemoryBreakdown,
    ProfileConfig,
    DatasetStats,
)
class TabularProfiler(ModalityProfiler):
    """
    Structural profiler for Polars DataFrames.

    Usage
    -----
    >>> cfg = ProfileConfig(
    ...     duplicate_columns=["user_id", "event_time"],
    ...     sparsity_columns=["age", "income", "postcode"],
    ...     type_detection_columns=["age", "income", "postcode", "created_at"],
    ...     memory_threshold_mb=200,
    ... )
    >>> profiler = TabularProfiler(config=cfg)
    >>> result = profiler.profile(df)
    >>> print(result)
    """

    def __init__(self, config: ProfileConfig | None = None):
        super().__init__(config)
    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(self, data: pl.DataFrame, **kwargs) -> DatasetStats:
        return self._run(data)

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _run(self, df: pl.DataFrame) -> DatasetStats:
        result = DatasetStats()

        # 1. Shape — always computed on the full frame
        result.row_count = df.height
        result.column_count = df.width

        # 2. Memory
        self._analyse_memory(df, result)

        # Decide processing mode AFTER memory analysis
        use_chunks = (result.memory_breakdown is not None) and result.row_count > 0
        result.was_chunked = use_chunks

        if result.row_count == 0:
            return result

        # 3. Resolve column scopes
        all_cols: list[str] = df.columns
        analysed_cols = [c for c in all_cols if c not in self.config.exclude_columns]

        dup_cols = analysed_cols
        missingness_cols = analysed_cols

        if use_chunks:
            self._chunked_metrics(df, dup_cols, missingness_cols, result)
        else:
            self._full_metrics(df, dup_cols, missingness_cols, result)

        return result
    @staticmethod
    def _build_missingness_exprs(df: pl.DataFrame, cols: list[str]) -> list[pl.Expr]:
        exprs = []
        for col_name in cols:
            dtype = df[col_name].dtype
            std_expr = pl.col(col_name).is_null()

            if dtype in (pl.Utf8, pl.String):
                eff_expr = (
                    std_expr
                    | (pl.col(col_name).str.strip_chars() == "")
                    | pl.col(col_name)
                    .str.to_uppercase()
                    .is_in(["NA", "NAN", "NULL", "NONE", "?"])
                )
            elif dtype in (pl.Float32, pl.Float64):
                eff_expr = (
                    std_expr
                    | pl.col(col_name).is_nan()
                    | pl.col(col_name).is_infinite()
                )
            else:
                eff_expr = std_expr

            exprs.append(std_expr.sum().alias(f"{col_name}_std"))
            exprs.append(eff_expr.sum().alias(f"{col_name}_eff"))

        return exprs
123
+ # ------------------------------------------------------------------
124
+ # Memory analysis
125
+ # ------------------------------------------------------------------
126
+
127
+ def _analyse_memory(self, df: pl.DataFrame, result: DatasetStats) -> None:
128
+ """
129
+ Populate memory fields on *result*.
130
+
131
+ Polars exposes estimated_size() per Series for heap allocation.
132
+ """
133
+ col_bytes: dict[str, int] = {
134
+ col: df[col].estimated_size() for col in df.columns
135
+ }
136
+ total_bytes = sum(col_bytes.values())
137
+
138
+ result.memory_bytes = total_bytes
139
+ threshold_bytes = self.config.memory_threshold_mb * 1024 * 1024
140
+
141
+ if total_bytes > threshold_bytes:
142
+ result.memory_breakdown = MemoryBreakdown(column_bytes=col_bytes)
    # ------------------------------------------------------------------
    # Full-frame metrics
    # ------------------------------------------------------------------

    def _full_metrics(
        self,
        df: pl.DataFrame,
        dup_cols: list[str],
        missing_cols: list[str],
        result: DatasetStats,
    ) -> None:
        result.duplicate_count = self._count_duplicates(df, dup_cols)
        result.duplicate_ratio = (
            result.duplicate_count / result.row_count if result.row_count else 0.0
        )

        if missing_cols:
            exprs = self._build_missingness_exprs(df, missing_cols)
            row = df.select(exprs).row(0)

            total_eff_cells = 0
            for i, _ in enumerate(missing_cols):
                eff_nulls = row[i * 2 + 1]
                total_eff_cells += eff_nulls

            total_cells = result.row_count * len(missing_cols)
            result.overall_sparsity = (
                total_eff_cells / total_cells if total_cells else 0.0
            )
174
+ # ------------------------------------------------------------------
175
+ # Chunked metrics
176
+ # ------------------------------------------------------------------
177
+
178
+ def _chunked_metrics(
179
+ self,
180
+ df: pl.DataFrame,
181
+ dup_cols: list[str],
182
+ sparsity_cols: list[str],
183
+ result: DatasetStats,
184
+ ) -> None:
185
+ """
186
+ Stream through the DataFrame in row-chunks to keep peak memory low.
187
+
188
+ Duplicate detection: hash the dup_cols subset row-by-row and track
189
+ seen hashes — semantics match keep='first'.
190
+ Sparsity is accumulated as (missing_cells, total_cells).
191
+ """
192
+ chunk_size = self.config.chunk_size
193
+ n_chunks = math.ceil(result.row_count / chunk_size)
194
+
195
+ seen_hashes: set[int] = set()
196
+ dup_count = 0
197
+ missing_cells = 0
198
+ total_cells = 0
199
+
200
+ for i in range(n_chunks):
201
+ start = i * chunk_size
202
+ end = min(start + chunk_size, result.row_count)
203
+ chunk: pl.DataFrame = df.slice(start, end - start)
204
+
205
+ if dup_cols:
206
+ # --- duplicates ---
207
+ sub = chunk.select(dup_cols) if dup_cols else chunk
208
+ for row_tuple in sub.iter_rows():
209
+ h = hash(row_tuple)
210
+ if h in seen_hashes:
211
+ dup_count += 1
212
+ else:
213
+ seen_hashes.add(h)
214
+
215
+ if sparsity_cols:
216
+ # --- sparsity ---
217
+ exprs = self._build_missingness_exprs(chunk, sparsity_cols)
218
+ row = chunk.select(exprs).row(0)
219
+ for j in range(len(sparsity_cols)):
220
+ missing_cells += row[j * 2 + 1]
221
+ total_cells += chunk.height * len(sparsity_cols)
222
+
223
+ result.duplicate_count = dup_count
224
+ result.duplicate_ratio = (
225
+ dup_count / result.row_count if result.row_count else 0.0
226
+ )
227
+ result.overall_sparsity = missing_cells / total_cells if total_cells else 0.0
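A stripped-down sketch of the streaming duplicate count described in the docstring. The chunk size and data are invented; in principle Python `hash()` collisions could misclassify a row, though that is unlikely in practice:

```python
import polars as pl

df = pl.DataFrame({"user_id": [1, 2, 1, 3, 1], "event": ["a", "b", "a", "c", "a"]})

chunk_size = 2
seen: set[int] = set()
dup_count = 0

for start in range(0, df.height, chunk_size):
    chunk = df.slice(start, chunk_size)
    for row_tuple in chunk.iter_rows():
        h = hash(row_tuple)
        if h in seen:
            dup_count += 1       # non-first occurrence, keep='first' semantics
        else:
            seen.add(h)

print(dup_count)   # 2: the third and fifth rows repeat the first
```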
    # ------------------------------------------------------------------
    # Type detection
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # Stateless helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _count_duplicates(df: pl.DataFrame, cols: list[str]) -> int:
        """
        Count rows that are duplicates (keeping first occurrence).

        Equivalent to pandas duplicated(subset=cols, keep='first').sum().
        """
        sub = df.select(cols) if cols else df
        # Polars' is_duplicated() marks ALL occurrences of a duplicate group,
        # so instead count the non-first occurrences as the total row count
        # minus the number of unique rows.
        n_unique = sub.unique().height
        return df.height - n_unique
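A quick sanity check of the height-minus-unique identity on a toy frame; the pandas call mentioned in the docstring is only the reference semantics, not a dependency:

```python
import polars as pl

df = pl.DataFrame({"user_id": [1, 2, 1, 3, 1], "event_time": [10, 20, 10, 30, 10]})

# The (1, 10) row appears three times; two of those are non-first occurrences.
dups = df.height - df.select(["user_id", "event_time"]).unique().height
print(dups)   # 2
```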