dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,406 @@
+ """
+ DatetimeProfiler – Phase 1 extension: Datetime Column Profiling.
+
+ Per-column metrics (opt-in via ProfileConfig.datetime_columns):
+   1. Range – min date, max date, total range in days
+   2. Null analysis – count, ratio, MNAR flag when null_ratio > 5%
+   3. Future dates – count of values > now, with context note
+   4. Granularity – inferred periodicity from the median consecutive gap;
+      a high gap-CV is flagged as irregular
+   5. Temporal signals – audit which of {year, month, day, day-of-week,
+      hour, is-weekend, is-month-end} vary in the data, to guide
+      downstream feature engineering
+
+ Granularity inference bands (median gap in seconds):
+     < 90 s        → secondly
+     < 3 600 s     → minutely
+     < 7 200 s     → hourly
+     < 172 800 s   → daily   (< 2 days)
+     < 1 209 600 s → weekly  (< 14 days)
+     < 5 184 000 s → monthly (< 60 days)
+     else          → yearly
+
+ Integration
+ -----------
+ Add ``datetime_columns: list[str] | None`` to ProfileConfig, then call::
+
+     from profiling._datetime_profiler import DatetimeProfiler
+
+     dt_profiler = DatetimeProfiler(config=cfg)
+     dt_result = dt_profiler.profile(
+         df,
+         columns=["created_at", "event_time"],
+     )
+
+ Attach ``dt_result`` to ``StructuralProfileResult`` as
+ ``result.datetime``.
+ """
+
+ from __future__ import annotations
+
+ from datetime import datetime, timezone
+
+ import polars as pl
+
+ from ._base import ColumnBatchProfiler
+ from .config import (
+     ProfileConfig,
+     SemanticType,
+ )
+ from ._datetime_config import (
+     DatetimeProfileResult,
+     DatetimeStats,
+     InferredGranularity,
+     DatetimeFlag,
+     TemporalSignals,
+ )
+
+ # ---------------------------------------------------------------------------
+ # Thresholds
+ # ---------------------------------------------------------------------------
+
+ # MNAR suspicion: missing rate above this fraction → flag
+ _MNAR_NULL_RATIO_THRESHOLD: float = 0.05
+
+ # Gap coefficient of variation above this → flag as irregular
+ _HIGH_GAP_CV_THRESHOLD: float = 1.0
+
+ # Granularity bands — upper bound (exclusive) in seconds for each label.
+ # Ordered from finest to coarsest.
+ _GRANULARITY_BANDS: list[tuple[float, InferredGranularity]] = [
+     (90.0, InferredGranularity.Secondly),       # < 1.5 min
+     (3_600.0, InferredGranularity.Minutely),    # < 1 h
+     (7_200.0, InferredGranularity.Hourly),      # < 2 h
+     (172_800.0, InferredGranularity.Daily),     # < 2 days
+     (1_209_600.0, InferredGranularity.Weekly),  # < 14 days
+     (5_184_000.0, InferredGranularity.Monthly), # < 60 days
+ ]
+ # Anything ≥ 5_184_000 s → Yearly
+
+ # Recent-data sparsity: consider the last this-fraction of the total range
+ _RECENT_WINDOW_FRACTION: float = 0.10
+
+
+ def _is_datetime_dtype(dtype: pl.DataType) -> bool:
+     """Return True for Date and Datetime (any time-unit / time zone)."""
+     return isinstance(dtype, (pl.Date, pl.Datetime))
+
+
+ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
+     """
+     Datetime distribution profiler for Polars DataFrames.
+
+     Parameters
+     ----------
+     config : ProfileConfig | None
+         Shared profiling configuration. The columns to profile are
+         passed to :meth:`profile` rather than to the constructor;
+         non-datetime columns are skipped.
+     """
+
+     def __init__(
+         self,
+         config: ProfileConfig | None = None,
+     ) -> None:
+         super().__init__(config)
+
+     # ------------------------------------------------------------------
+     # Public API
+     # ------------------------------------------------------------------
+
+     def profile(
+         self,
+         data: pl.DataFrame,
+         columns: list[str],
+     ) -> DatetimeProfileResult:
+         return self._run(data, columns)
+
+     # ------------------------------------------------------------------
+     # Orchestration
+     # ------------------------------------------------------------------
+
+     def _eligible(self, series: pl.Series) -> bool:
+         override = self.config.column_overrides.get(series.name)
+
+         if override == SemanticType.Datetime:
+             return True
+         if override is not None:
+             return False
+
+         return _is_datetime_dtype(series.dtype) or series.dtype in (pl.Utf8, pl.String)
+
+     def _coerce_to_datetime(self, series: pl.Series) -> pl.Series | None:
+         if series.dtype in (pl.Utf8, pl.String):
+             coerced = series.str.to_datetime(strict=False)
+             return coerced if coerced.drop_nulls().len() > 0 else None
+         return series
+
+     def _run(self, df: pl.DataFrame, columns: list[str]) -> DatetimeProfileResult:
+         result = DatetimeProfileResult()
+         now = datetime.now(tz=timezone.utc)
+
+         candidates = [
+             c
+             for c in self._resolve_columns(df.columns, columns)
+             if self._eligible(df[c])
+         ]
+
+         available: list[str] = []
+         coerced_cache: dict[str, pl.Series] = {}
+         for col_name in candidates:
+             series = self._coerce_to_datetime(df[col_name])
+             if series is not None:
+                 available.append(col_name)
+                 coerced_cache[col_name] = series
+
+         result.analysed_columns = available
+
+         for col_name in available:
+             profile = self._profile_column(coerced_cache[col_name], df.height, now)
+             result.columns[col_name] = profile
+
+         return result
+
+     # ------------------------------------------------------------------
+     # Per-column driver
+     # ------------------------------------------------------------------
+
+     def _profile_column(
+         self,
+         series: pl.Series,
+         n_rows: int,
+         now: datetime,
+     ) -> DatetimeStats:
+         profile = DatetimeStats()
+
+         # Normalise to microsecond Datetime (UTC) for uniform arithmetic;
+         # Date columns are cast to Datetime at midnight UTC.
+         if isinstance(series.dtype, pl.Date):
+             series = series.cast(pl.Datetime("us", "UTC"))
+         elif isinstance(series.dtype, pl.Datetime):
+             if series.dtype.time_zone is None:
+                 series = series.dt.replace_time_zone("UTC")
+             else:
+                 series = series.dt.convert_time_zone("UTC")
+
+         # Drop nulls for all remaining computations
+         clean = series.drop_nulls()
+
+         if clean.len() == 0:
+             return profile
+
+         # Step 2: range
+         self._compute_range(clean, profile)
+
+         # Step 3: future dates
+         self._check_future_dates(clean, profile, now)
+
+         # Step 3b: recent-data sparsity (needs the range, so after _compute_range)
+         self._check_recent_date_missing(series, profile)
+
+         # Step 4: granularity
+         self._infer_granularity(clean, profile)
+
+         # Step 5: temporal signals
+         self._audit_temporal_signals(clean, profile)
+
+         return profile
+
+     # ------------------------------------------------------------------
+     # Step 2: Range
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _compute_range(
+         clean: pl.Series,
+         profile: DatetimeStats,
+     ) -> None:
+         min_ts = clean.min()
+         max_ts = clean.max()
+
+         if min_ts is not None:
+             profile.min_date = (
+                 min_ts.replace(tzinfo=timezone.utc)
+                 if isinstance(min_ts, datetime)
+                 else min_ts
+             )
+         if max_ts is not None:
+             profile.max_date = (
+                 max_ts.replace(tzinfo=timezone.utc)
+                 if isinstance(max_ts, datetime)
+                 else max_ts
+             )
+
+         if profile.min_date is not None and profile.max_date is not None:
+             delta = profile.max_date - profile.min_date
+             profile.date_range_days = delta.total_seconds() / 86_400.0
+
+     # ------------------------------------------------------------------
+     # Step 3: Future dates
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _check_future_dates(
+         clean: pl.Series,
+         profile: DatetimeStats,
+         now: datetime,
+     ) -> None:
+         # Cast to Int64 (epoch microseconds) and compare against the `now` scalar
+         now_us = int(now.timestamp() * 1_000_000)
+         ts_int = clean.cast(pl.Int64)
+         future_mask = ts_int > now_us
+         future_count = int(future_mask.sum())
+
+         profile.future_date_count = future_count
+         if future_count > 0:
+             profile.flags.append(DatetimeFlag.FutureDates)
+
+     # ------------------------------------------------------------------
+     # Step 3b: Recent data sparsity
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _check_recent_date_missing(
+         series: pl.Series,
+         profile: DatetimeStats,
+     ) -> None:
+         """
+         Flag when the last _RECENT_WINDOW_FRACTION of the observed date
+         range contains far fewer observations than a uniform spread
+         over the range would predict.
+
+         We compare the density in the recent window with the overall
+         density; if the window holds < 20 % of its expected share → flag.
+         """
+         if profile.min_date is None or profile.max_date is None:
+             return
+         if profile.date_range_days is None or profile.date_range_days == 0:
+             return
+
+         range_seconds = profile.date_range_days * 86_400.0
+         window_seconds = range_seconds * _RECENT_WINDOW_FRACTION
+
+         # Compute the cutoff as epoch microseconds
+         max_ts_us = int(profile.max_date.timestamp() * 1_000_000)
+         window_us = int(window_seconds * 1_000_000)
+         cutoff_us = max_ts_us - window_us
+
+         # Cast the series to Int64 (epoch microseconds) for comparison
+         ts_int = series.cast(pl.Int64)
+         recent_mask = ts_int >= cutoff_us
+         recent_count = int(recent_mask.sum())
+
+         # Expected count under a uniform distribution
+         total_non_null = series.drop_nulls().len()
+         if total_non_null == 0:
+             return
+         expected_recent = total_non_null * _RECENT_WINDOW_FRACTION
+         density_ratio = recent_count / expected_recent if expected_recent > 0 else 1.0
+
+         if density_ratio < 0.20:
+             profile.flags.append(DatetimeFlag.RecentDateMissing)
+
+     # ------------------------------------------------------------------
+     # Step 4: Granularity inference
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _infer_granularity(
+         clean: pl.Series,
+         profile: DatetimeStats,
+     ) -> None:
+         """
+         Sort values, compute consecutive gaps in seconds, derive the median gap.
+
+         Uses the Int64 epoch-microsecond representation for a vectorised diff.
+         """
+         n = clean.len()
+         if n < 2:
+             profile.inferred_granularity = InferredGranularity.Irregular
+             return
+
+         ts_us = clean.sort().cast(pl.Int64)  # microseconds since epoch
+         gaps_us = ts_us.diff().drop_nulls()  # consecutive differences
+
+         # Discard zero gaps (duplicate timestamps); after sorting none are negative
+         gaps_us = gaps_us.filter(gaps_us > 0)
+
+         if gaps_us.len() == 0:
+             profile.inferred_granularity = InferredGranularity.Irregular
+             return
+
+         gaps_s = gaps_us.cast(pl.Float64) / 1_000_000.0  # → seconds
+
+         median_gap_s = float(gaps_s.median())  # type: ignore[arg-type]
+         mean_gap_s = float(gaps_s.mean())  # type: ignore[arg-type]
+         std_gap_s = float(gaps_s.std(ddof=1)) if gaps_s.len() > 1 else 0.0
+
+         profile.median_gap_seconds = median_gap_s
+
+         # Coefficient of variation of the gaps; a high CV indicates irregular sampling
+         if mean_gap_s > 0:
+             profile.gap_cv = std_gap_s / mean_gap_s
+             if profile.gap_cv > _HIGH_GAP_CV_THRESHOLD:
+                 profile.flags.append(DatetimeFlag.HighGapVariance)
+         else:
+             profile.gap_cv = 0.0
+
+         # Map the median gap to a granularity label
+         granularity = InferredGranularity.Yearly  # default (coarsest)
+         for upper_bound, label in _GRANULARITY_BANDS:
+             if median_gap_s < upper_bound:
+                 granularity = label
+                 break
+
+         profile.inferred_granularity = granularity
+
+     # ------------------------------------------------------------------
+     # Step 5: Temporal signal audit
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _audit_temporal_signals(
+         clean: pl.Series,
+         profile: DatetimeStats,
+     ) -> None:
+         """
+         Check which temporal features vary across rows.
+
+         All checks are done via Polars expressions on the full clean series,
+         so no Python-level loops are required.
+         """
+         signals = TemporalSignals()
+
+         years = clean.dt.year()
+         months = clean.dt.month()
+         days = clean.dt.day()
+         dow = clean.dt.weekday()  # ISO numbering: 1=Monday … 7=Sunday
+         hours = clean.dt.hour()
+
+         signals.has_year = years.n_unique() > 1
+         signals.has_month = months.n_unique() > 1
+         signals.has_day = days.n_unique() > 1
+         signals.has_day_of_week = dow.n_unique() > 1
+         signals.has_hour = int(hours.max()) > 0  # type: ignore[arg-type]
+
+         # The weekend signal is only meaningful when day-of-week varies
+         if signals.has_day_of_week:
+             weekend_mask = dow >= 6  # Saturday=6, Sunday=7 under ISO numbering
+             signals.has_is_weekend = bool(weekend_mask.any())
+
+         # Month-end: the timestamp falls on the last day of its month.
+         # dt.month_end() shifts each timestamp to its month's last day,
+         # so comparing year/month/day components gives a date-level test.
+         try:
+             month_end_ts = clean.dt.month_end()
+             # Compare date components only, ignoring the time of day
+             is_month_end_mask = (
+                 (clean.dt.year() == month_end_ts.dt.year())
+                 & (clean.dt.month() == month_end_ts.dt.month())
+                 & (clean.dt.day() == month_end_ts.dt.day())
+             )
+             signals.has_is_month_end = bool(is_month_end_mask.any())
+         except Exception:
+             # dt.month_end() unavailable; leave the signal conservatively unset
+             signals.has_is_month_end = False
+
+         profile.signals = signals
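
Note on the band table above: `_infer_granularity` applies it with exclusive upper bounds (`median_gap_s < upper_bound`), which matters exactly at the boundaries. Perfectly hourly data has a median gap of exactly 3 600 s, so it falls through the minutely band and lands in the hourly one. The following is a minimal standalone sketch of that lookup on synthetic timestamps; the band values are copied from `_GRANULARITY_BANDS`, but the plain-string labels merely stand in for the package's `InferredGranularity` enum:

    from datetime import datetime, timedelta

    import polars as pl

    # Copied band bounds; string labels stand in for InferredGranularity
    BANDS = [
        (90.0, "secondly"),
        (3_600.0, "minutely"),
        (7_200.0, "hourly"),
        (172_800.0, "daily"),
        (1_209_600.0, "weekly"),
        (5_184_000.0, "monthly"),
    ]

    # 48 perfectly hourly timestamps
    ts = pl.Series([datetime(2024, 1, 1) + timedelta(hours=i) for i in range(48)])

    # Same arithmetic as _infer_granularity: sort, diff in epoch
    # microseconds, convert to seconds, take the median gap
    gaps_s = ts.sort().cast(pl.Int64).diff().drop_nulls().cast(pl.Float64) / 1_000_000.0
    median_gap_s = float(gaps_s.median())  # 3600.0

    label = next((name for bound, name in BANDS if median_gap_s < bound), "yearly")
    print(median_gap_s, label)  # prints: 3600.0 hourly (not minutely; bounds are exclusive)

The same exclusivity means a strict 2-hour cadence (a 7 200 s median gap) is labelled daily rather than hourly.
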
@@ -0,0 +1,137 @@
+ """
+ Result dataclasses for missingness profiling.
+
+ Populated by MissingnessProfiler, which is always run as part of
+ StructuralProfiler (a non-optional Phase 1 component).
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from enum import StrEnum
+ from typing import Optional
+
+
+ # ---------------------------------------------------------------------------
+ # Enums
+ # ---------------------------------------------------------------------------
+
+
+ class MissingSeverity(StrEnum):
+     Minor = "minor"        # < 1% missing
+     Moderate = "moderate"  # 1–5% missing
+     High = "high"          # 5–20% missing
+     Severe = "severe"      # > 20% missing
+
+
+ class MissingnessFlag(StrEnum):
+     FullyNull = "fully_null"          # missing ratio == 1.0 → must drop
+     MARSuspect = "mar_suspect"        # missingness correlated with ≥ 1 other column
+     DropCandidate = "drop_candidate"  # more than 50% of the column's rows missing
+
+
+ # ---------------------------------------------------------------------------
+ # Per-column result
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class ColumnMissingnessProfile:
+     """
+     Full missingness profile for a single column.
+
+     Attributes
+     ----------
+     column : str
+         Column name.
+     total_rows : int
+         Total rows in the DataFrame.
+     standard_null_count : int
+         Polars-level nulls (None / NaN for floats).
+     effective_null_count : int
+         Standard nulls + whitespace-only strings + sentinel strings
+         ("NA", "NAN", "NULL", "NONE", "?") — i.e. the count used for
+         imputation decisions.
+     standard_null_ratio : float
+         standard_null_count / total_rows.
+     effective_null_ratio : float
+         effective_null_count / total_rows.
+     severity : MissingSeverity
+         Derived from effective_null_ratio.
+     flags : list[MissingnessFlag]
+         Zero or more non-exclusive behavioural flags.
+     correlated_with : list[str]
+         Columns whose missingness indicator correlates > 0.6 with this
+         column's indicator (populated after the correlation-matrix pass).
+     """
+
+     column: str
+     total_rows: int
+
+     standard_null_count: int = 0
+     effective_null_count: int = 0
+     standard_null_ratio: float = 0.0
+     effective_null_ratio: float = 0.0
+
+     severity: Optional[MissingSeverity] = None
+
+     flags: list[MissingnessFlag] = field(default_factory=list)
+     correlated_with: list[str] = field(default_factory=list)
+
+     def has_flag(self, flag: MissingnessFlag) -> bool:
+         return flag in self.flags
+
+     def __str__(self) -> str:  # pragma: no cover
+         lines = [
+             f"  Column          : {self.column}",
+             f"  Standard nulls  : {self.standard_null_count:,}"
+             f" ({self.standard_null_ratio:.2%})",
+             f"  Effective nulls : {self.effective_null_count:,}"
+             f" ({self.effective_null_ratio:.2%})",
+             f"  Severity        : {self.severity or 'N/A'}",
+         ]
+         if self.correlated_with:
+             lines.append(f"  MAR correlates with: {', '.join(self.correlated_with)}")
+         if self.flags:
+             lines.append(f"  Flags           : {', '.join(self.flags)}")
+         return "\n".join(lines)
+
+
+ @dataclass
+ class MissingnessProfileResult:
+     """
+     Missingness profile for all analysed columns.
+
+     Attributes
+     ----------
+     columns : dict[str, ColumnMissingnessProfile]
+         Per-column profiles, keyed by column name.
+     analysed_columns : list[str]
+         Columns that were actually profiled.
+     fully_null_columns : list[str]
+         Columns where effective_null_ratio == 1.0. Must be dropped.
+     correlation_matrix : dict[str, dict[str, float]]
+         Pairwise Pearson correlations between binary missingness indicators.
+         Only populated when ≥ 2 columns have at least one missing value.
+         Stored as a nested dict: matrix[col_a][col_b] = correlation.
+     """
+
+     columns: dict[str, ColumnMissingnessProfile] = field(default_factory=dict)
+     analysed_columns: list[str] = field(default_factory=list)
+     fully_null_columns: list[str] = field(default_factory=list)
+     correlation_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
+
+     def __str__(self) -> str:  # pragma: no cover
+         lines = ["=== Missingness Profile ==="]
+         for profile in self.columns.values():
+             lines.append(str(profile))
+         if self.fully_null_columns:
+             lines.append(
+                 f"\n  Fully-null columns (must drop): "
+                 f"{', '.join(self.fully_null_columns)}"
+             )
+
+         return "\n".join(lines)
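
The severity bands documented on `MissingSeverity` imply a straightforward mapping from `effective_null_ratio`. The actual rule lives in `profiling/_missingness_profiler.py`, which is not shown in this excerpt, so the sketch below is only one plausible reading of the documented thresholds; the `derive_severity` helper is hypothetical, with boundary handling chosen to match the "< 1%" and "> 20%" endpoints:

    # Hypothetical helper; the real derivation lives in _missingness_profiler.py
    from profiling._missingness_config import (
        ColumnMissingnessProfile,
        MissingSeverity,
    )


    def derive_severity(effective_null_ratio: float) -> MissingSeverity:
        """Map an effective null ratio onto the documented severity bands."""
        if effective_null_ratio > 0.20:
            return MissingSeverity.Severe
        if effective_null_ratio > 0.05:
            return MissingSeverity.High
        if effective_null_ratio > 0.01:
            return MissingSeverity.Moderate
        return MissingSeverity.Minor


    profile = ColumnMissingnessProfile(column="age", total_rows=1_000)
    profile.standard_null_count = 60    # plain Polars nulls
    profile.effective_null_count = 73   # plus "NA" / "?" style sentinels
    profile.standard_null_ratio = 0.060
    profile.effective_null_ratio = 0.073
    profile.severity = derive_severity(profile.effective_null_ratio)

    print(profile)  # the Severity line reads "high" (the 5–20% band)

Because `MissingSeverity` is a `StrEnum`, the severity renders as its plain string value ("high") in the report output, which is what the `__str__` methods above rely on.
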