dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,544 @@
1
+ """
2
+ CorrelationProfiler – Phase 1 extension: Correlation & Information Structure.
3
+
4
+ Two public methods
5
+ ------------------
6
+ profile_features(df, numeric_cols, categorical_cols)
7
+ Computes pairwise Pearson + Spearman matrices and near-redundancy
8
+ groups. Target-independent — run once per dataset.
9
+ Returns a CorrelationProfileResult with all matrix fields populated
10
+ and all target-specific fields empty.
11
+
12
+ profile_target(df, feature_result, numeric_cols, categorical_cols, target_col)
13
+ Takes the already-computed feature_result (so matrices are NOT
14
+ recomputed) and adds feature-target Pearson / ANOVA / MI for one
15
+ specific target column.
16
+ Returns a new CorrelationProfileResult that shares the same matrix
17
+ data plus the target-specific fields.
18
+
19
+ StructuralProfiler calls them like this::
20
+
21
+ feature_corr = profiler.profile_features(df, numeric_cols, cat_cols)
22
+ result.dataset.feature_correlation = feature_corr
23
+
24
+ for target in config.target_columns:
25
+ target_corr = profiler.profile_target(
26
+ df, feature_corr, numeric_cols, cat_cols, target
27
+ )
28
+ result.dataset.target_correlations[target] = target_corr
29
+
30
+ The legacy profile() method is preserved for backward compatibility —
31
+ it simply delegates to profile_features() on the numeric columns supplied
32
+ at construction; per-target analysis is performed via profile_target().
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import copy
38
+ import itertools
39
+ import warnings
40
+ from typing import Optional
41
+
42
+ import polars as pl
43
+
44
+ from ._base import DatasetLevelProfiler
45
+ from .config import ProfileConfig
46
+ from ._correlation_config import (
47
+ CategoricalTargetCorrelation,
48
+ CorrelationPair,
49
+ CorrelationProfileResult,
50
+ MutualInformationEntry,
51
+ NearRedundancyGroup,
52
+ NumericTargetCorrelation,
53
+ TargetType,
54
+ )
55
+ from ..models._data_types import _NUMERIC_DTYPES, _INT_DTYPES
56
+
57
# Absolute |r| (Pearson or Spearman) above which a pair is flagged near-redundant.
_NEAR_REDUNDANT_THRESHOLD: float = 0.95
# Maximum number of feature-target entries kept per target column.
_TOP_N_FEATURE_TARGET: int = 10
# Neighbour count passed to sklearn's k-NN mutual-information estimator.
_MI_N_NEIGHBORS: int = 3
_MI_MIN_ROWS: int = 10  # min complete-case rows for a meaningful k-NN MI estimate
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Union-Find (unchanged)
65
+ # ---------------------------------------------------------------------------
66
+
67
+
68
+ class _UnionFind:
69
+ def __init__(self) -> None:
70
+ self._parent: dict[str, str] = {}
71
+
72
+ def find(self, x: str) -> str:
73
+ if x not in self._parent:
74
+ self._parent[x] = x
75
+ while self._parent[x] != x:
76
+ self._parent[x] = self._parent[self._parent[x]]
77
+ x = self._parent[x]
78
+ return x
79
+
80
+ def union(self, a: str, b: str) -> None:
81
+ ra, rb = self.find(a), self.find(b)
82
+ if ra != rb:
83
+ self._parent[rb] = ra
84
+
85
+ def groups(self) -> list[list[str]]:
86
+ from collections import defaultdict
87
+
88
+ buckets: dict[str, list[str]] = defaultdict(list)
89
+ for x in self._parent:
90
+ buckets[self.find(x)].append(x)
91
+ return [sorted(m) for m in buckets.values() if len(m) > 1]
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Main profiler
96
+ # ---------------------------------------------------------------------------
97
+
98
+
99
class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
    """
    Correlation and information-structure profiler.

    Parameters
    ----------
    numeric_columns : list[str]
        Columns to include in pairwise matrices.
    categorical_columns : list[str] | None
        Columns to include in MI / ANOVA target steps.
    config : ProfileConfig | None
        Forwarded to the DatasetLevelProfiler base class.
    near_redundant_threshold : float
        Absolute |r| above which a pair is flagged near-redundant.
    top_n_feature_target : int
        Maximum number of feature-target entries returned per target.
    """

    def __init__(
        self,
        numeric_columns: list[str],
        categorical_columns: Optional[list[str]] = None,
        config: Optional[ProfileConfig] = None,
        near_redundant_threshold: float = _NEAR_REDUNDANT_THRESHOLD,
        top_n_feature_target: int = _TOP_N_FEATURE_TARGET,
    ) -> None:
        super().__init__(config)
        self._numeric_columns = numeric_columns
        self._categorical_columns = categorical_columns or []
        self._threshold = near_redundant_threshold
        self._top_n = top_n_feature_target

    # ------------------------------------------------------------------
    # Concrete implementation of the abstract base method
    # ------------------------------------------------------------------

    def profile(self, data: pl.DataFrame) -> CorrelationProfileResult:  # type: ignore[override]
        """Legacy entry point: feature-feature analysis on the configured columns."""
        return self.profile_features(data, self._numeric_columns)

    # ------------------------------------------------------------------
    # Primary API (called by StructuralProfiler)
    # ------------------------------------------------------------------

    def profile_features(
        self,
        df: pl.DataFrame,
        numeric_cols: list[str],
    ) -> CorrelationProfileResult:
        """
        Compute pairwise feature-feature correlation matrices.

        Pearson + Spearman matrices and near-redundancy groups are filled.
        All target-specific fields are left at their defaults (empty lists /
        None). Call profile_target() separately for each target column.
        """
        result = CorrelationProfileResult()

        # Keep only columns that exist in the frame and carry a numeric dtype.
        resolved_numeric = [
            c
            for c in numeric_cols
            if c in df.columns and df[c].dtype in _NUMERIC_DTYPES
        ]
        result.analysed_numeric_columns = resolved_numeric

        # Correlation needs at least two columns; otherwise matrices stay empty.
        if len(resolved_numeric) >= 2:
            pearson_mat, spearman_mat = self._compute_matrices(df, resolved_numeric)
            result.pearson_matrix = pearson_mat
            result.spearman_matrix = spearman_mat

            pairs = self._build_pairs(resolved_numeric, pearson_mat, spearman_mat)
            result.pairwise = pairs
            result.near_redundant_pairs = [p for p in pairs if p.near_redundant]
            result.near_redundancy_groups = self._build_redundancy_groups(
                result.near_redundant_pairs
            )

        return result

    def profile_target(
        self,
        df: pl.DataFrame,
        feature_result: CorrelationProfileResult,
        numeric_cols: list[str],
        categorical_cols: list[str],
        target_col: str,
    ) -> CorrelationProfileResult:
        """
        Extend an existing feature_result with target-specific analysis
        for one target column.

        The pairwise matrices are NOT recomputed — they are copied from
        feature_result so the returned object is fully self-contained
        (i.e. safe to store independently and serialise).

        Feature columns for target analysis exclude the target itself,
        so a target that also appears in numeric_cols / categorical_cols
        is automatically excluded from its own feature-target stats.

        Raises
        ------
        ValueError
            If target_col is not a column of df.
        """
        if target_col not in df.columns:
            raise ValueError(f"target_col {target_col!r} not found in DataFrame.")

        # Shallow-copy the result so matrix dicts are shared (not duplicated
        # in memory) but the top-level object is independent.
        result = copy.copy(feature_result)
        result.target_column = target_col
        result.target_type = self._detect_target_type(df, target_col)

        # Feature columns: exclude the target from both lists
        feature_numeric = [
            c
            for c in numeric_cols
            if c != target_col and c in df.columns and df[c].dtype in _NUMERIC_DTYPES
        ]
        feature_categorical = [
            c for c in categorical_cols if c != target_col and c in df.columns
        ]

        # Numeric targets get per-feature Pearson r; categorical targets get
        # one-way ANOVA (F, p, eta-squared) per numeric feature.
        if result.target_type == TargetType.Numeric:
            result.feature_target_numeric = self._feature_target_pearson(
                df, feature_numeric, target_col
            )
        else:
            result.feature_target_categorical = self._feature_target_anova(
                df, feature_numeric, target_col
            )

        result.mutual_information = self._mutual_information(
            df, feature_numeric, feature_categorical, target_col, result.target_type
        )

        return result

    # ------------------------------------------------------------------
    # Step 1–2: Pearson + Spearman matrices (unchanged)
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_matrices(
        df: pl.DataFrame,
        cols: list[str],
    ) -> tuple[dict[str, dict[str, float]], dict[str, dict[str, float]]]:
        """
        Return (pearson, spearman) as symmetric nested dicts col -> col -> r.

        Spearman is obtained by running the same Pearson expression over
        average-ranked columns (Pearson on ranks == Spearman).
        """
        # Diagonals are 1.0 by definition.
        pearson_mat: dict[str, dict[str, float]] = {c: {c: 1.0} for c in cols}
        spearman_mat: dict[str, dict[str, float]] = {c: {c: 1.0} for c in cols}

        pairs = list(itertools.combinations(cols, 2))
        if not pairs:
            return pearson_mat, spearman_mat

        numeric_frame = df.select([pl.col(c).cast(pl.Float64).alias(c) for c in cols])
        # Rank transform feeds the Spearman computation below.
        rank_frame = numeric_frame.select(
            [pl.col(c).rank(method="average").alias(c) for c in cols]
        )

        pearson_exprs = [
            pl.corr(col_a, col_b, method="pearson")
            .fill_nan(0.0)
            .fill_null(0.0)
            .alias(f"p|{col_a}|{col_b}")
            for col_a, col_b in pairs
        ]
        # method="pearson" is intentional here: it is evaluated on rank_frame.
        spearman_exprs = [
            pl.corr(col_a, col_b, method="pearson")
            .fill_nan(0.0)
            .fill_null(0.0)
            .alias(f"s|{col_a}|{col_b}")
            for col_a, col_b in pairs
        ]

        # All pair correlations are evaluated in one select per matrix.
        p_row = numeric_frame.select(pearson_exprs).row(0)
        s_row = rank_frame.select(spearman_exprs).row(0)

        for i, (col_a, col_b) in enumerate(pairs):
            # Clamp to [-1, 1] to absorb floating-point drift.
            p_r = max(-1.0, min(1.0, float(p_row[i])))
            s_r = max(-1.0, min(1.0, float(s_row[i])))
            pearson_mat[col_a][col_b] = p_r
            pearson_mat[col_b][col_a] = p_r
            spearman_mat[col_a][col_b] = s_r
            spearman_mat[col_b][col_a] = s_r

        return pearson_mat, spearman_mat

    # ------------------------------------------------------------------
    # Step 3–4: Near-redundancy (unchanged)
    # ------------------------------------------------------------------

    def _build_pairs(
        self,
        cols: list[str],
        pearson_mat: dict[str, dict[str, float]],
        spearman_mat: dict[str, dict[str, float]],
    ) -> list[CorrelationPair]:
        """Build one CorrelationPair per column combination, flagging |r| > threshold."""
        pairs: list[CorrelationPair] = []
        for col_a, col_b in itertools.combinations(cols, 2):
            p_r = pearson_mat.get(col_a, {}).get(col_b)
            s_r = spearman_mat.get(col_a, {}).get(col_b)
            # Either coefficient exceeding the threshold marks the pair.
            near_redundant = (p_r is not None and abs(p_r) > self._threshold) or (
                s_r is not None and abs(s_r) > self._threshold
            )
            pairs.append(
                CorrelationPair(
                    col_a=col_a,
                    col_b=col_b,
                    pearson_r=p_r,
                    spearman_r=s_r,
                    near_redundant=near_redundant,
                )
            )
        return pairs

    @staticmethod
    def _build_redundancy_groups(
        near_redundant_pairs: list[CorrelationPair],
    ) -> list[NearRedundancyGroup]:
        """Cluster transitively-linked redundant pairs; keep the first column of each group."""
        uf = _UnionFind()
        for pair in near_redundant_pairs:
            uf.union(pair.col_a, pair.col_b)
        return [
            NearRedundancyGroup(columns=members, suggested_drop=members[1:])
            for members in uf.groups()
        ]

    # ------------------------------------------------------------------
    # Step 5a: Feature–target Pearson (unchanged)
    # ------------------------------------------------------------------

    def _feature_target_pearson(
        self,
        df: pl.DataFrame,
        feature_cols: list[str],
        target_col: str,
    ) -> list[NumericTargetCorrelation]:
        """Pearson r of each numeric feature vs a numeric target, top-N by |r|."""
        if not feature_cols:
            return []

        target_series = df[target_col].cast(pl.Float64)
        exprs = [
            pl.corr(feat, target_col, method="pearson")
            .fill_nan(0.0)
            .fill_null(0.0)
            .alias(feat)
            for feat in feature_cols
        ]
        frame = df.select(
            [pl.col(c).cast(pl.Float64) for c in feature_cols]
            + [target_series.alias(target_col)]
        ).select(exprs)

        row = frame.row(0)
        entries = [
            NumericTargetCorrelation(feature=col, pearson_r=float(r))
            for col, r in zip(feature_cols, row)
        ]
        entries.sort(key=lambda e: abs(e.pearson_r or 0.0), reverse=True)
        return entries[: self._top_n]

    # ------------------------------------------------------------------
    # Step 5b: Feature–target ANOVA (unchanged)
    # ------------------------------------------------------------------

    def _feature_target_anova(
        self,
        df: pl.DataFrame,
        feature_cols: list[str],
        target_col: str,
    ) -> list[CategoricalTargetCorrelation]:
        """
        One-way ANOVA of each numeric feature grouped by a categorical target.

        Entries are sorted by eta-squared descending and truncated to top-N.
        Features whose test cannot run (fewer than two non-empty groups, or a
        scipy failure) get an entry with default (unset) statistics. Returns
        [] with a warning when scipy is not installed.
        """
        try:
            from scipy.stats import f_oneway  # type: ignore[import]
        except ImportError:
            warnings.warn(
                "scipy is required for ANOVA feature-target analysis. "
                "Install it with: pip install scipy",
                stacklevel=3,
            )
            return []

        if not feature_cols:
            return []

        target_series = df[target_col]
        categories = target_series.drop_nulls().unique().to_list()
        entries: list[CategoricalTargetCorrelation] = []

        for feat in feature_cols:
            feat_series = df[feat].cast(pl.Float64)
            # One numpy array of feature values per target category.
            groups = [
                feat_series.filter(target_series == cat).drop_nulls().to_numpy()
                for cat in categories
            ]
            non_empty = [g for g in groups if len(g) > 0]
            if len(non_empty) < 2:
                # ANOVA needs at least two groups; record a stat-less entry.
                entries.append(CategoricalTargetCorrelation(feature=feat))
                continue
            try:
                f_stat, p_val = f_oneway(*non_empty)
            except Exception:
                entries.append(CategoricalTargetCorrelation(feature=feat))
                continue

            # Effect size: eta² = SS_between / SS_total.
            grand_mean = feat_series.drop_nulls().mean() or 0.0
            ss_total = float(
                ((feat_series.drop_nulls() - grand_mean) ** 2).sum() or 0.0
            )
            ss_between = sum(
                len(g) * (float(g.mean()) - float(grand_mean)) ** 2 for g in non_empty
            )
            eta_sq = ss_between / ss_total if ss_total > 0 else 0.0
            entries.append(
                CategoricalTargetCorrelation(
                    feature=feat,
                    f_statistic=float(f_stat),
                    p_value=float(p_val),
                    eta_squared=eta_sq,
                )
            )

        entries.sort(key=lambda e: e.eta_squared or 0.0, reverse=True)
        return entries[: self._top_n]

    # ------------------------------------------------------------------
    # Step 6: Mutual Information (unchanged)
    # ------------------------------------------------------------------

    def _mutual_information(
        self,
        df: pl.DataFrame,
        numeric_feature_cols: list[str],
        categorical_feature_cols: list[str],
        target_col: str,
        target_type: TargetType,
    ) -> list[MutualInformationEntry]:
        """
        Estimate mutual information of each feature with the target.

        Uses sklearn's k-NN estimator (regression variant for numeric targets,
        classification variant otherwise). Each feature is scored on its own
        complete cases; features with fewer than _MI_MIN_ROWS complete rows or
        that raise inside sklearn are skipped with a warning. Entries are
        returned sorted by mi_score descending with 1-based ranks assigned.
        Returns [] with a warning when scikit-learn is not installed.
        """
        try:
            from sklearn.feature_selection import (  # type: ignore[import]
                mutual_info_classif,
                mutual_info_regression,
            )
        except ImportError:
            warnings.warn(
                "scikit-learn is required for mutual information analysis. "
                "Install it with: pip install scikit-learn",
                stacklevel=3,
            )
            return []

        if not numeric_feature_cols and not categorical_feature_cols:
            return []

        fn = (
            mutual_info_regression
            if target_type == TargetType.Numeric
            else mutual_info_classif
        )

        # Build the target array once; track its null positions separately so
        # each feature can use its own complete-case mask (rows null in either
        # the feature or the target are excluded for that feature only).
        if target_type == TargetType.Numeric:
            target_series = df[target_col].cast(pl.Float64)
            y_full = target_series.to_numpy()
            target_null = (target_series.is_null() | target_series.is_nan()).to_numpy()
        else:
            # Categorical targets are label-encoded via the Categorical physical codes.
            target_encoded = (
                df[target_col]
                .cast(pl.Utf8, strict=False)
                .cast(pl.Categorical)
                .to_physical()
                .cast(pl.Int64)
            )
            y_full = target_encoded.to_numpy()
            target_null = df[target_col].is_null().to_numpy()

        entries: list[MutualInformationEntry] = []

        for col in numeric_feature_cols:
            series = df[col].cast(pl.Float64)
            feat_null = (series.is_null() | series.is_nan()).to_numpy()
            valid = ~(feat_null | target_null)
            n_valid = int(valid.sum())
            if n_valid < _MI_MIN_ROWS:
                continue
            x = series.to_numpy()[valid].reshape(-1, 1)
            y = y_full[valid]
            try:
                score = float(
                    fn(
                        x,
                        y,
                        # Integer dtypes are treated as discrete features.
                        discrete_features=[df[col].dtype in _INT_DTYPES],
                        n_neighbors=_MI_N_NEIGHBORS,
                        random_state=42,
                    )[0]
                )
            except Exception as exc:
                warnings.warn(f"MI failed for '{col}': {exc}", stacklevel=3)
                continue
            entries.append(MutualInformationEntry(feature=col, mi_score=score))

        for col in categorical_feature_cols:
            # Same label-encoding scheme as the categorical target above.
            feat_encoded = (
                df[col]
                .cast(pl.Utf8, strict=False)
                .cast(pl.Categorical)
                .to_physical()
                .cast(pl.Int64)
            )
            feat_null = df[col].is_null().to_numpy()
            valid = ~(feat_null | target_null)
            n_valid = int(valid.sum())
            if n_valid < _MI_MIN_ROWS:
                warnings.warn(
                    f"Skipping MI for '{col}': only {n_valid} complete rows "
                    f"(need {_MI_MIN_ROWS}).",
                    stacklevel=3,
                )
                continue
            x = feat_encoded.to_numpy()[valid].reshape(-1, 1).astype(float)
            y = y_full[valid]
            try:
                score = float(
                    fn(
                        x,
                        y,
                        discrete_features=[True],
                        n_neighbors=_MI_N_NEIGHBORS,
                        random_state=42,
                    )[0]
                )
            except Exception as exc:
                warnings.warn(f"MI failed for '{col}': {exc}", stacklevel=3)
                continue
            entries.append(MutualInformationEntry(feature=col, mi_score=score))

        entries.sort(key=lambda e: e.mi_score, reverse=True)
        for rank, entry in enumerate(entries, start=1):
            entry.rank = rank
        return entries

    # ------------------------------------------------------------------
    # Helper
    # ------------------------------------------------------------------

    @staticmethod
    def _detect_target_type(df: pl.DataFrame, target_col: str) -> TargetType:
        """Classify the target: numeric dtype -> Numeric, anything else -> Categorical."""
        return (
            TargetType.Numeric
            if df[target_col].dtype in _NUMERIC_DTYPES
            else TargetType.Categorical
        )
@@ -0,0 +1,98 @@
1
+ """
2
+ Result dataclasses for datetime column profiling.
3
+
4
+ Populated by DatetimeProfiler, which is opt-in via
5
+ ProfileConfig.datetime_columns.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from enum import StrEnum
12
+ from typing import Optional
13
+
14
+
15
class InferredGranularity(StrEnum):
    """
    Candidate sampling granularities for a datetime column.

    `Irregular` marks columns with no single consistent cadence.
    """

    Yearly = "yearly"
    Monthly = "monthly"
    Weekly = "weekly"
    Daily = "daily"
    Hourly = "hourly"
    Minutely = "minutely"
    Secondly = "secondly"
    Irregular = "irregular"
24
+
25
+
26
class DatetimeFlag(StrEnum):
    """Quality / risk flags that can be raised for a datetime column."""

    FutureDates = "future_dates"  # cf. DatetimeStats.future_date_count
    HighGapVariance = "high_gap_variance"  # cf. DatetimeStats.gap_cv
    MnarSuspected = "mnar_suspected"  # missing-not-at-random suspected
    RecentDateMissing = "recent_date_missing"
31
+
32
+
33
@dataclass
class TemporalSignals:
    """Boolean flags for which calendar features a column can yield."""

    has_year: bool = False
    has_month: bool = False
    has_day: bool = False
    has_day_of_week: bool = False
    has_hour: bool = False
    has_is_weekend: bool = False
    has_is_month_end: bool = False

    def extractable_features(self) -> list[str]:
        """Return the feature names whose signal flag is set, in fixed order."""
        candidates = (
            (self.has_year, "year"),
            (self.has_month, "month"),
            (self.has_day, "day_of_month"),
            (self.has_day_of_week, "day_of_week"),
            (self.has_hour, "hour"),
            (self.has_is_weekend, "is_weekend"),
            (self.has_is_month_end, "is_month_end"),
        )
        return [name for present, name in candidates if present]
60
+
61
+
62
@dataclass
class DatetimeStats:
    """Per-column datetime statistics, temporal signals, and quality flags."""

    min_date: Optional[str] = None
    max_date: Optional[str] = None
    date_range_days: Optional[float] = None
    future_date_count: int = 0
    inferred_granularity: Optional[InferredGranularity] = None
    median_gap_seconds: Optional[float] = None
    gap_cv: Optional[float] = None
    signals: TemporalSignals = field(default_factory=TemporalSignals)
    flags: list[DatetimeFlag] = field(default_factory=list)

    def has_flag(self, flag: DatetimeFlag) -> bool:
        """True when *flag* was raised for this column."""
        return any(existing == flag for existing in self.flags)
76
+
77
+
78
@dataclass
class DatetimeProfileResult:
    """
    Datetime profile for all opted-in columns.

    Attributes
    ----------
    columns : dict[str, DatetimeStats]
        Per-column profiles, keyed by column name.
    analysed_columns : list[str]
        Columns that were actually profiled (after schema intersection).
    """

    columns: dict[str, DatetimeStats] = field(default_factory=dict)
    analysed_columns: list[str] = field(default_factory=list)

    def __str__(self) -> str:  # pragma: no cover
        header = "=== Datetime Profile ==="
        per_column = [str(stats) for stats in self.columns.values()]
        return "\n".join([header, *per_column])
+ return "\n".join(lines)