dataforge-ml 2.0.7__tar.gz → 2.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-2.0.7/src/dataforge_ml.egg-info → dataforge_ml-2.0.8}/PKG-INFO +1 -1
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/pyproject.toml +1 -1
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_datetime_config.py +2 -70
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_datetime_profiler.py +4 -55
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/LICENSE +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/README.md +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/setup.cfg +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/__init__.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/_utils.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/imputation/orchestrator.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/orchestrator.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/_null_detection.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/_null_normalization.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -31,77 +31,12 @@ class DatetimeFlag(StrEnum):
|
|
|
31
31
|
FormatMismatch = "format_mismatch"
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
@dataclass
|
|
35
|
-
class TemporalSignals:
|
|
36
|
-
"""Which time-component features are present in a Datetime column.
|
|
37
|
-
|
|
38
|
-
Each boolean field indicates that the corresponding granularity was
|
|
39
|
-
detected as non-constant, making it a candidate for feature extraction
|
|
40
|
-
in Phase 5 Encoding.
|
|
41
|
-
"""
|
|
42
|
-
|
|
43
|
-
has_year: bool = False
|
|
44
|
-
has_month: bool = False
|
|
45
|
-
has_day: bool = False
|
|
46
|
-
has_day_of_week: bool = False
|
|
47
|
-
has_hour: bool = False
|
|
48
|
-
has_is_weekend: bool = False
|
|
49
|
-
has_is_month_end: bool = False
|
|
50
|
-
|
|
51
|
-
def extractable_features(self) -> list[str]:
|
|
52
|
-
"""Return the names of all time-component features that can be extracted.
|
|
53
|
-
|
|
54
|
-
Returns
|
|
55
|
-
-------
|
|
56
|
-
list[str]
|
|
57
|
-
Feature names corresponding to every ``has_*`` field that is
|
|
58
|
-
``True``. An empty list means no temporal variation was detected.
|
|
59
|
-
"""
|
|
60
|
-
features = []
|
|
61
|
-
if self.has_year:
|
|
62
|
-
features.append("year")
|
|
63
|
-
if self.has_month:
|
|
64
|
-
features.append("month")
|
|
65
|
-
if self.has_day:
|
|
66
|
-
features.append("day_of_month")
|
|
67
|
-
if self.has_day_of_week:
|
|
68
|
-
features.append("day_of_week")
|
|
69
|
-
if self.has_hour:
|
|
70
|
-
features.append("hour")
|
|
71
|
-
if self.has_is_weekend:
|
|
72
|
-
features.append("is_weekend")
|
|
73
|
-
if self.has_is_month_end:
|
|
74
|
-
features.append("is_month_end")
|
|
75
|
-
return features
|
|
76
|
-
|
|
77
|
-
def to_dict(self) -> dict:
|
|
78
|
-
"""Serialise the temporal signals to a plain dictionary.
|
|
79
|
-
|
|
80
|
-
Returns
|
|
81
|
-
-------
|
|
82
|
-
dict
|
|
83
|
-
All ``has_*`` flags plus an ``extractable_features`` key
|
|
84
|
-
containing the result of :meth:`extractable_features`.
|
|
85
|
-
"""
|
|
86
|
-
return {
|
|
87
|
-
"has_year": self.has_year,
|
|
88
|
-
"has_month": self.has_month,
|
|
89
|
-
"has_day": self.has_day,
|
|
90
|
-
"has_day_of_week": self.has_day_of_week,
|
|
91
|
-
"has_hour": self.has_hour,
|
|
92
|
-
"has_is_weekend": self.has_is_weekend,
|
|
93
|
-
"has_is_month_end": self.has_is_month_end,
|
|
94
|
-
"extractable_features": self.extractable_features(),
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
|
|
98
34
|
@dataclass
|
|
99
35
|
class DatetimeStats:
|
|
100
36
|
"""Statistical summary of a single Datetime column.
|
|
101
37
|
|
|
102
38
|
Produced by ``DatetimeProfiler`` for each opted-in column. Stores
|
|
103
|
-
range, gap regularity, inferred granularity, and temporal signals
|
|
104
|
-
indicating which time components are available for feature extraction.
|
|
39
|
+
range, gap regularity, and inferred granularity.
|
|
105
40
|
"""
|
|
106
41
|
|
|
107
42
|
min_date: Optional[str] = None
|
|
@@ -111,7 +46,6 @@ class DatetimeStats:
|
|
|
111
46
|
inferred_granularity: Optional[InferredGranularity] = None
|
|
112
47
|
median_gap_seconds: Optional[float] = None
|
|
113
48
|
gap_cv: Optional[float] = None
|
|
114
|
-
signals: TemporalSignals = field(default_factory=TemporalSignals)
|
|
115
49
|
flags: list[DatetimeFlag] = field(default_factory=list)
|
|
116
50
|
|
|
117
51
|
def has_flag(self, flag: DatetimeFlag) -> bool:
|
|
@@ -137,8 +71,7 @@ class DatetimeStats:
|
|
|
137
71
|
-------
|
|
138
72
|
dict
|
|
139
73
|
All fields keyed by field name. ``inferred_granularity`` is
|
|
140
|
-
serialised as its string value; ``signals`` is serialised via
|
|
141
|
-
:meth:`TemporalSignals.to_dict`; ``flags`` are serialised as
|
|
74
|
+
serialised as its string value; ``flags`` are serialised as
|
|
142
75
|
their string values.
|
|
143
76
|
"""
|
|
144
77
|
return {
|
|
@@ -149,7 +82,6 @@ class DatetimeStats:
|
|
|
149
82
|
"inferred_granularity": str(self.inferred_granularity) if self.inferred_granularity else None,
|
|
150
83
|
"median_gap_seconds": self.median_gap_seconds,
|
|
151
84
|
"gap_cv": self.gap_cv,
|
|
152
|
-
"signals": self.signals.to_dict(),
|
|
153
85
|
"flags": [str(f) for f in self.flags],
|
|
154
86
|
}
|
|
155
87
|
|
|
@@ -7,9 +7,10 @@ Per-column metrics (opt-in via ProfileConfig.datetime_columns):
|
|
|
7
7
|
3. Future dates – count of values > now, with context note
|
|
8
8
|
4. Granularity – inferred periodicity from median consecutive gap;
|
|
9
9
|
high gap-CV flagged as irregular
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
|
|
11
|
+
Temporal-component variance (whether year, month, day-of-week, etc. vary) is
|
|
12
|
+
intentionally not profiled here; Phase 5 Encoding derives it on demand from
|
|
13
|
+
the column.
|
|
13
14
|
|
|
14
15
|
Granularity inference bands (median gap in seconds):
|
|
15
16
|
< 90 s → secondly
|
|
@@ -34,7 +35,6 @@ from ._datetime_config import (
|
|
|
34
35
|
DatetimeStats,
|
|
35
36
|
InferredGranularity,
|
|
36
37
|
DatetimeFlag,
|
|
37
|
-
TemporalSignals,
|
|
38
38
|
)
|
|
39
39
|
|
|
40
40
|
# Granularity bands — upper bound (exclusive) in seconds for each label.
|
|
@@ -226,9 +226,6 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
226
226
|
# 6. Granularity
|
|
227
227
|
self._infer_granularity(clean, profile)
|
|
228
228
|
|
|
229
|
-
# 7. Temporal signals
|
|
230
|
-
self._audit_temporal_signals(clean, profile)
|
|
231
|
-
|
|
232
229
|
return profile
|
|
233
230
|
|
|
234
231
|
# ------------------------------------------------------------------
|
|
@@ -377,51 +374,3 @@ class DatetimeProfiler(ColumnBatchProfiler[DatetimeProfileResult]):
|
|
|
377
374
|
break
|
|
378
375
|
|
|
379
376
|
profile.inferred_granularity = granularity
|
|
380
|
-
|
|
381
|
-
# ------------------------------------------------------------------
|
|
382
|
-
# Step 7: Temporal signal audit
|
|
383
|
-
# ------------------------------------------------------------------
|
|
384
|
-
|
|
385
|
-
@staticmethod
|
|
386
|
-
def _audit_temporal_signals(
|
|
387
|
-
clean: pl.Series,
|
|
388
|
-
profile: DatetimeStats,
|
|
389
|
-
) -> None:
|
|
390
|
-
"""
|
|
391
|
-
Check which temporal features vary across rows.
|
|
392
|
-
|
|
393
|
-
All checks are done via Polars expressions on the full clean series,
|
|
394
|
-
so no Python-level loops are required.
|
|
395
|
-
"""
|
|
396
|
-
signals = TemporalSignals()
|
|
397
|
-
|
|
398
|
-
years = clean.dt.year()
|
|
399
|
-
months = clean.dt.month()
|
|
400
|
-
days = clean.dt.day()
|
|
401
|
-
dow = clean.dt.weekday() # 0=Monday … 6=Sunday
|
|
402
|
-
hours = clean.dt.hour()
|
|
403
|
-
|
|
404
|
-
signals.has_year = years.n_unique() > 1
|
|
405
|
-
signals.has_month = months.n_unique() > 1
|
|
406
|
-
signals.has_day = days.n_unique() > 1
|
|
407
|
-
signals.has_day_of_week = dow.n_unique() > 1
|
|
408
|
-
signals.has_hour = int(hours.max()) > 0 # type: ignore[arg-type]
|
|
409
|
-
|
|
410
|
-
# Weekend signal is only meaningful when day-of-week varies
|
|
411
|
-
if signals.has_day_of_week:
|
|
412
|
-
weekend_mask = dow >= 5 # Saturday=5, Sunday=6
|
|
413
|
-
signals.has_is_weekend = bool(weekend_mask.any())
|
|
414
|
-
|
|
415
|
-
# Month-end: day == last day of the respective month
|
|
416
|
-
try:
|
|
417
|
-
month_end_ts = clean.dt.month_end()
|
|
418
|
-
is_month_end_mask = (
|
|
419
|
-
(clean.dt.year() == month_end_ts.dt.year())
|
|
420
|
-
& (clean.dt.month() == month_end_ts.dt.month())
|
|
421
|
-
& (clean.dt.day() == month_end_ts.dt.day())
|
|
422
|
-
)
|
|
423
|
-
signals.has_is_month_end = bool(is_month_end_mask.any())
|
|
424
|
-
except Exception:
|
|
425
|
-
signals.has_is_month_end = False
|
|
426
|
-
|
|
427
|
-
profile.signals = signals
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
{dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_nonlinearity_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.7 → dataforge_ml-2.0.8}/src/dataforge_ml/profiling/_type_detection_config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|