tsagentkit 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsagentkit/__init__.py +126 -0
- tsagentkit/anomaly/__init__.py +130 -0
- tsagentkit/backtest/__init__.py +48 -0
- tsagentkit/backtest/engine.py +788 -0
- tsagentkit/backtest/metrics.py +244 -0
- tsagentkit/backtest/report.py +342 -0
- tsagentkit/calibration/__init__.py +136 -0
- tsagentkit/contracts/__init__.py +133 -0
- tsagentkit/contracts/errors.py +275 -0
- tsagentkit/contracts/results.py +418 -0
- tsagentkit/contracts/schema.py +44 -0
- tsagentkit/contracts/task_spec.py +300 -0
- tsagentkit/covariates/__init__.py +340 -0
- tsagentkit/eval/__init__.py +285 -0
- tsagentkit/features/__init__.py +20 -0
- tsagentkit/features/covariates.py +328 -0
- tsagentkit/features/extra/__init__.py +5 -0
- tsagentkit/features/extra/native.py +179 -0
- tsagentkit/features/factory.py +187 -0
- tsagentkit/features/matrix.py +159 -0
- tsagentkit/features/tsfeatures_adapter.py +115 -0
- tsagentkit/features/versioning.py +203 -0
- tsagentkit/hierarchy/__init__.py +39 -0
- tsagentkit/hierarchy/aggregation.py +62 -0
- tsagentkit/hierarchy/evaluator.py +400 -0
- tsagentkit/hierarchy/reconciliation.py +232 -0
- tsagentkit/hierarchy/structure.py +453 -0
- tsagentkit/models/__init__.py +182 -0
- tsagentkit/models/adapters/__init__.py +83 -0
- tsagentkit/models/adapters/base.py +321 -0
- tsagentkit/models/adapters/chronos.py +387 -0
- tsagentkit/models/adapters/moirai.py +256 -0
- tsagentkit/models/adapters/registry.py +171 -0
- tsagentkit/models/adapters/timesfm.py +440 -0
- tsagentkit/models/baselines.py +207 -0
- tsagentkit/models/sktime.py +307 -0
- tsagentkit/monitoring/__init__.py +51 -0
- tsagentkit/monitoring/alerts.py +302 -0
- tsagentkit/monitoring/coverage.py +203 -0
- tsagentkit/monitoring/drift.py +330 -0
- tsagentkit/monitoring/report.py +214 -0
- tsagentkit/monitoring/stability.py +275 -0
- tsagentkit/monitoring/triggers.py +423 -0
- tsagentkit/qa/__init__.py +347 -0
- tsagentkit/router/__init__.py +37 -0
- tsagentkit/router/bucketing.py +489 -0
- tsagentkit/router/fallback.py +132 -0
- tsagentkit/router/plan.py +23 -0
- tsagentkit/router/router.py +271 -0
- tsagentkit/series/__init__.py +26 -0
- tsagentkit/series/alignment.py +206 -0
- tsagentkit/series/dataset.py +449 -0
- tsagentkit/series/sparsity.py +261 -0
- tsagentkit/series/validation.py +393 -0
- tsagentkit/serving/__init__.py +39 -0
- tsagentkit/serving/orchestration.py +943 -0
- tsagentkit/serving/packaging.py +73 -0
- tsagentkit/serving/provenance.py +317 -0
- tsagentkit/serving/tsfm_cache.py +214 -0
- tsagentkit/skill/README.md +135 -0
- tsagentkit/skill/__init__.py +8 -0
- tsagentkit/skill/recipes.md +429 -0
- tsagentkit/skill/tool_map.md +21 -0
- tsagentkit/time/__init__.py +134 -0
- tsagentkit/utils/__init__.py +20 -0
- tsagentkit/utils/quantiles.py +83 -0
- tsagentkit/utils/signature.py +47 -0
- tsagentkit/utils/temporal.py +41 -0
- tsagentkit-1.0.2.dist-info/METADATA +371 -0
- tsagentkit-1.0.2.dist-info/RECORD +72 -0
- tsagentkit-1.0.2.dist-info/WHEEL +4 -0
- tsagentkit-1.0.2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""Deterministic routing logic aligned to the PRD PlanSpec."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from tsagentkit.contracts import PlanSpec, RouteDecision, RouterConfig, RouterThresholds, TaskSpec
|
|
11
|
+
from tsagentkit.time import normalize_pandas_freq
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from tsagentkit.qa import QAReport
|
|
15
|
+
from tsagentkit.series import TSDataset
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def make_plan(
    dataset: TSDataset,
    task_spec: TaskSpec,
    qa: QAReport | None = None,
    router_config: RouterConfig | None = None,
    use_tsfm: bool = True,
    tsfm_preference: list[str] | None = None,
) -> tuple[PlanSpec, RouteDecision]:
    """Create a deterministic PlanSpec and RouteDecision for a dataset.

    Routing is rule-based: panel statistics are bucketed (short history,
    sparsity, intermittency, seasonality) and the candidate model list is
    derived from those buckets, optionally led by available TSFM adapters.

    Returns:
        Tuple of (PlanSpec, RouteDecision) containing the execution plan
        and detailed routing decision information.
    """
    active_config = router_config or RouterConfig()
    thresholds = active_config.thresholds
    stats, buckets = _compute_router_stats(dataset, task_spec, thresholds)

    # Probe TSFM adapter availability, honoring the caller's preference order.
    available_tsfms: list[str] = []
    if use_tsfm and _tsfm_allowed(dataset, thresholds):
        from tsagentkit.models.adapters import AdapterRegistry

        preference = tsfm_preference or ["chronos", "moirai", "timesfm"]
        available_tsfms = [
            name for name in preference if AdapterRegistry.check_availability(name)[0]
        ]

    # Bucket-driven baseline candidates.
    if "intermittent" in buckets:
        candidates = ["Croston", "Naive"]
    elif "short_history" in buckets:
        candidates = ["HistoricAverage", "Naive"]
    else:
        candidates = ["SeasonalNaive", "HistoricAverage", "Naive"]

    # TSFMs lead the candidate list; intermittent panels stay baseline-only.
    if available_tsfms and "intermittent" not in buckets:
        candidates = [f"tsfm-{name}" for name in available_tsfms] + candidates

    contract = task_spec.forecast_contract
    plan = PlanSpec(
        plan_name="default",
        candidate_models=candidates,
        use_static=True,
        use_past=True,
        use_future_known=True,
        min_train_size=thresholds.min_train_size,
        max_train_size=thresholds.max_points_per_series_for_tsfm,
        interval_mode=contract.interval_mode,
        levels=contract.levels,
        quantiles=contract.quantiles,
        allow_drop_covariates=True,
        allow_baseline=True,
    )

    # Human-readable audit trail explaining the routing outcome.
    reasons = [
        f"selected_models: {candidates}",
        f"buckets: {buckets}",
        f"tsfm_available: {bool(available_tsfms)}",
    ]
    if available_tsfms:
        reasons.append(f"tsfm_models: {available_tsfms}")

    decision = RouteDecision(
        stats=stats,
        buckets=buckets,
        selected_plan=plan,
        reasons=reasons,
    )
    return plan, decision
|
|
91
|
+
def get_model_for_series(
    unique_id: str,
    dataset: TSDataset,
    task_spec: TaskSpec,
    thresholds: RouterThresholds | None = None,
) -> str:
    """Get recommended model for a specific series."""
    active = thresholds or RouterThresholds()
    series_frame = dataset.get_series(unique_id)
    _, buckets = _compute_series_stats(series_frame, task_spec, active)

    # Bucket priority mirrors make_plan: intermittent beats short history.
    for bucket, model in (("intermittent", "Croston"), ("short_history", "HistoricAverage")):
        if bucket in buckets:
            return model
    return "SeasonalNaive"
|
|
108
|
+
|
|
109
|
+
def _compute_router_stats(
    dataset: TSDataset,
    task_spec: TaskSpec,
    thresholds: RouterThresholds,
) -> tuple[dict[str, float], list[str]]:
    """Compute panel-level routing statistics and the buckets they trigger."""
    df = dataset.df
    stats: dict[str, float] = {}
    buckets: list[str] = []

    # The shortest series in the panel gates the short-history bucket.
    per_series_len = df.groupby("unique_id").size()
    shortest = 0 if per_series_len.empty else int(per_series_len.min())
    stats["min_series_length"] = float(shortest)
    if shortest < thresholds.min_train_size:
        buckets.append("short_history")

    gap_ratio = _compute_missing_ratio(df, task_spec)
    stats["missing_ratio"] = float(gap_ratio)
    if gap_ratio > thresholds.max_missing_ratio:
        buckets.append("sparse")

    panel = task_spec.panel_contract
    uid_col, ds_col, y_col = panel.unique_id_col, panel.ds_col, panel.y_col

    # Any intermittent series flips the whole panel into the intermittent bucket.
    adi_stats = _compute_intermittency(df, thresholds, uid_col, ds_col, y_col)
    stats.update(adi_stats)
    if adi_stats.get("intermittent_series_ratio", 0.0) > 0:
        buckets.append("intermittent")

    seasonal = _seasonality_confidence(df, task_spec, uid_col, y_col)
    stats["seasonality_confidence"] = float(seasonal)
    if seasonal >= thresholds.min_seasonality_conf:
        buckets.append("seasonal_candidate")

    return stats, buckets
|
|
145
|
+
|
|
146
|
+
def _compute_series_stats(
    series_df: pd.DataFrame,
    task_spec: TaskSpec,
    thresholds: RouterThresholds,
) -> tuple[dict[str, float], list[str]]:
    """Single-series analogue of the panel-level router statistics."""
    stats: dict[str, float] = {}
    buckets: list[str] = []

    n_points = len(series_df)
    stats["series_length"] = float(n_points)
    if n_points < thresholds.min_train_size:
        buckets.append("short_history")

    gap_ratio = _compute_missing_ratio(series_df, task_spec)
    stats["missing_ratio"] = float(gap_ratio)
    if gap_ratio > thresholds.max_missing_ratio:
        buckets.append("sparse")

    panel = task_spec.panel_contract
    uid_col, ds_col, y_col = panel.unique_id_col, panel.ds_col, panel.y_col

    adi_stats = _compute_intermittency(series_df, thresholds, uid_col, ds_col, y_col)
    stats.update(adi_stats)
    if adi_stats.get("intermittent_series_ratio", 0.0) > 0:
        buckets.append("intermittent")

    seasonal = _seasonality_confidence(series_df, task_spec, uid_col, y_col)
    stats["seasonality_confidence"] = float(seasonal)
    if seasonal >= thresholds.min_seasonality_conf:
        buckets.append("seasonal_candidate")

    return stats, buckets
|
|
180
|
+
|
|
181
|
+
def _compute_missing_ratio(df: pd.DataFrame, task_spec: TaskSpec) -> float:
    """Mean per-series fraction of timestamps missing vs. the expected frequency.

    For each series, the expected grid spans its own min..max timestamp at
    the task frequency; the ratio is averaged over all series.
    """
    if df.empty:
        return 0.0
    uid_col = task_spec.panel_contract.unique_id_col
    ds_col = task_spec.panel_contract.ds_col
    # Frequency normalization is invariant across series; compute it once.
    freq = normalize_pandas_freq(task_spec.freq)

    ratios: list[float] = []
    for _, grp in df.groupby(uid_col):
        expected = pd.date_range(start=grp[ds_col].min(), end=grp[ds_col].max(), freq=freq)
        absent = len(expected) - len(grp)
        ratios.append(absent / max(len(expected), 1))
    return float(np.mean(ratios)) if ratios else 0.0
|
|
202
|
+
|
|
203
|
+
def _compute_intermittency(
|
|
204
|
+
df: pd.DataFrame,
|
|
205
|
+
thresholds: RouterThresholds,
|
|
206
|
+
uid_col: str,
|
|
207
|
+
ds_col: str,
|
|
208
|
+
y_col: str,
|
|
209
|
+
) -> dict[str, float]:
|
|
210
|
+
intermittent = 0
|
|
211
|
+
total = 0
|
|
212
|
+
|
|
213
|
+
for uid in df[uid_col].unique():
|
|
214
|
+
series = df[df[uid_col] == uid].sort_values(ds_col)
|
|
215
|
+
y = series[y_col].values
|
|
216
|
+
total += 1
|
|
217
|
+
|
|
218
|
+
non_zero_idx = np.where(y > 0)[0]
|
|
219
|
+
if len(non_zero_idx) <= 1:
|
|
220
|
+
adi = float("inf")
|
|
221
|
+
cv2 = float("inf")
|
|
222
|
+
else:
|
|
223
|
+
intervals = np.diff(non_zero_idx)
|
|
224
|
+
adi = float(np.mean(intervals)) if len(intervals) > 0 else float("inf")
|
|
225
|
+
non_zero_vals = y[non_zero_idx]
|
|
226
|
+
mean = np.mean(non_zero_vals) if len(non_zero_vals) > 0 else 0.0
|
|
227
|
+
std = np.std(non_zero_vals) if len(non_zero_vals) > 0 else 0.0
|
|
228
|
+
cv2 = float((std / mean) ** 2) if mean != 0 else float("inf")
|
|
229
|
+
|
|
230
|
+
if adi >= thresholds.max_intermittency_adi and cv2 >= thresholds.max_intermittency_cv2:
|
|
231
|
+
intermittent += 1
|
|
232
|
+
|
|
233
|
+
ratio = intermittent / total if total > 0 else 0.0
|
|
234
|
+
return {
|
|
235
|
+
"intermittent_series_ratio": ratio,
|
|
236
|
+
"intermittent_series_count": float(intermittent),
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _seasonality_confidence(
|
|
241
|
+
df: pd.DataFrame,
|
|
242
|
+
task_spec: TaskSpec,
|
|
243
|
+
uid_col: str,
|
|
244
|
+
y_col: str,
|
|
245
|
+
) -> float:
|
|
246
|
+
season_length = task_spec.season_length
|
|
247
|
+
if season_length is None or season_length <= 1:
|
|
248
|
+
return 0.0
|
|
249
|
+
confs: list[float] = []
|
|
250
|
+
for uid in df[uid_col].unique():
|
|
251
|
+
series = df[df[uid_col] == uid][y_col].values
|
|
252
|
+
if len(series) <= season_length:
|
|
253
|
+
continue
|
|
254
|
+
series = series - np.mean(series)
|
|
255
|
+
denom = np.dot(series, series)
|
|
256
|
+
if denom == 0:
|
|
257
|
+
continue
|
|
258
|
+
lagged = np.roll(series, season_length)
|
|
259
|
+
corr = np.dot(series[season_length:], lagged[season_length:]) / denom
|
|
260
|
+
confs.append(abs(float(corr)))
|
|
261
|
+
return float(np.mean(confs)) if confs else 0.0
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _tsfm_allowed(dataset: TSDataset, thresholds: RouterThresholds) -> bool:
|
|
265
|
+
if dataset.n_series > thresholds.max_series_count_for_tsfm:
|
|
266
|
+
return False
|
|
267
|
+
max_points = dataset.df.groupby("unique_id").size().max()
|
|
268
|
+
return max_points <= thresholds.max_points_per_series_for_tsfm
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
# Public API of this module; RouteDecision is re-exported from tsagentkit.contracts.
__all__ = ["make_plan", "get_model_for_series", "RouteDecision"]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Series module for tsagentkit.
|
|
2
|
+
|
|
3
|
+
Provides time series data structures and operations.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .alignment import align_timezone, fill_gaps, resample_series
|
|
7
|
+
from .dataset import TSDataset, build_dataset
|
|
8
|
+
from .sparsity import SparsityClass, SparsityProfile, compute_sparsity_profile
|
|
9
|
+
from .validation import normalize_panel_columns, validate_contract
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
# Dataset
|
|
13
|
+
"TSDataset",
|
|
14
|
+
"build_dataset",
|
|
15
|
+
# Sparsity
|
|
16
|
+
"SparsityProfile",
|
|
17
|
+
"SparsityClass",
|
|
18
|
+
"compute_sparsity_profile",
|
|
19
|
+
# Alignment
|
|
20
|
+
"align_timezone",
|
|
21
|
+
"resample_series",
|
|
22
|
+
"fill_gaps",
|
|
23
|
+
# Validation helpers (series layer)
|
|
24
|
+
"validate_contract",
|
|
25
|
+
"normalize_panel_columns",
|
|
26
|
+
]
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""Time alignment and resampling utilities.
|
|
2
|
+
|
|
3
|
+
Provides timezone unification and resampling for time series data.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Literal
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def align_timezone(
    df: pd.DataFrame,
    target_tz: str | None = "UTC",
    ds_col: str = "ds",
) -> pd.DataFrame:
    """Unify timezones across a dataset.

    Converts all datetime values to the target timezone. Timezone-naive
    input is interpreted as UTC before conversion; ``target_tz=None``
    strips timezone information instead.

    Args:
        df: DataFrame with datetime column
        target_tz: Target timezone (default: "UTC", None for naive)
        ds_col: Name of datetime column (default: "ds")

    Returns:
        DataFrame with unified timezone

    Raises:
        ValueError: If ds_col is not found or not datetime
    """
    if ds_col not in df.columns:
        raise ValueError(f"Column '{ds_col}' not found in DataFrame")
    if not pd.api.types.is_datetime64_any_dtype(df[ds_col]):
        raise ValueError(f"Column '{ds_col}' must be datetime type")

    out = df.copy()
    stamps = out[ds_col]
    is_aware = stamps.dt.tz is not None

    if target_tz is None:
        # Naive output requested: drop tz info from aware stamps, keep naive as-is.
        if is_aware:
            out[ds_col] = stamps.dt.tz_localize(None)
        return out

    if is_aware:
        out[ds_col] = stamps.dt.tz_convert(target_tz)
    else:
        # Naive stamps are assumed to be UTC before converting to the target.
        out[ds_col] = stamps.dt.tz_localize("UTC").dt.tz_convert(target_tz)
    return out
|
|
57
|
+
|
|
58
|
+
def resample_series(
    df: pd.DataFrame,
    freq: str,
    agg_func: Literal["sum", "mean", "last", "first", "max", "min"] = "sum",
    ds_col: str = "ds",
    unique_id_col: str = "unique_id",
    y_col: str = "y",
) -> pd.DataFrame:
    """Resample time series to a new frequency.

    Resamples each series independently to the target frequency using
    the specified aggregation function. Non-numeric columns are dropped;
    series with no numeric columns are skipped.

    Args:
        df: DataFrame with time series data
        freq: Target frequency (pandas freq string, e.g., 'D', 'H', 'M')
        agg_func: Aggregation function (default: "sum")
        ds_col: Name of datetime column (default: "ds")
        unique_id_col: Name of series ID column (default: "unique_id")
        y_col: Name of target column (default: "y")

    Returns:
        Resampled DataFrame

    Raises:
        ValueError: If required columns not found, ds_col is not datetime,
            or agg_func is not a supported aggregation.
    """
    required_cols = {ds_col, unique_id_col, y_col}
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Ensure datetime
    if not pd.api.types.is_datetime64_any_dtype(df[ds_col]):
        raise ValueError(f"Column '{ds_col}' must be datetime type")

    # Fail fast on a bad aggregation name. Previously this was only checked
    # inside the per-series loop, so an invalid agg_func on an empty or
    # all-non-numeric panel silently returned instead of raising.
    valid_aggs = {"sum", "mean", "last", "first", "max", "min"}
    if agg_func not in valid_aggs:
        raise ValueError(f"Unknown aggregation function: {agg_func}")

    resampled_frames: list[pd.DataFrame] = []

    for uid in df[unique_id_col].unique():
        series = df[df[unique_id_col] == uid].set_index(ds_col).sort_index()

        # Only numeric columns can be aggregated.
        numeric_cols = series.select_dtypes(include=["number"]).columns.tolist()
        if not numeric_cols:
            continue

        # Dispatch to the pandas Resampler method named by agg_func
        # (sum/mean/last/first/max/min), validated above.
        resampled = getattr(series[numeric_cols].resample(freq), agg_func)()

        resampled[unique_id_col] = uid
        resampled_frames.append(resampled.reset_index())

    if not resampled_frames:
        # Return empty DataFrame with correct structure
        return pd.DataFrame(columns=[unique_id_col, ds_col, y_col])

    # Combine all series
    result = pd.concat(resampled_frames, ignore_index=True)

    # Reorder columns to standard order: identifier, timestamp, then the rest.
    cols = [unique_id_col, ds_col] + [c for c in result.columns if c not in {unique_id_col, ds_col}]
    return result[cols]
|
|
143
|
+
|
|
144
|
+
def fill_gaps(
    df: pd.DataFrame,
    freq: str,
    method: Literal["interpolate", "forward", "backward", "zero"] = "interpolate",
    ds_col: str = "ds",
    unique_id_col: str = "unique_id",
) -> pd.DataFrame:
    """Fill gaps in time series data.

    Identifies missing timestamps (per series, within that series' own
    min..max span at the given frequency) and fills numeric columns using
    the specified method.

    Args:
        df: DataFrame with time series data
        freq: Expected frequency (pandas freq string)
        method: Fill method (default: "interpolate")
        ds_col: Name of datetime column (default: "ds")
        unique_id_col: Name of series ID column (default: "unique_id")

    Returns:
        DataFrame with gaps filled

    Raises:
        ValueError: If method is not one of the supported fill strategies.
    """
    # Supported fill strategies, dispatched by name. Previously an unknown
    # method silently left the reindexed NaNs in place; now it raises.
    fillers = {
        "interpolate": lambda block: block.interpolate(method="linear"),
        "forward": lambda block: block.ffill(),
        "backward": lambda block: block.bfill(),
        "zero": lambda block: block.fillna(0),
    }
    if method not in fillers:
        raise ValueError(f"Unknown fill method: {method}")

    filled_frames: list[pd.DataFrame] = []

    for uid in df[unique_id_col].unique():
        series = df[df[unique_id_col] == uid].set_index(ds_col).sort_index()

        # Create the complete date range for this series' span.
        full_range = pd.date_range(start=series.index.min(), end=series.index.max(), freq=freq)

        # Reindex to expose gaps as NaN rows.
        series_filled = series.reindex(full_range)

        # Fill missing values in numeric columns only; others stay NaN.
        numeric_cols = series_filled.select_dtypes(include=["number"]).columns.tolist()
        series_filled[numeric_cols] = fillers[method](series_filled[numeric_cols])

        # Add back unique_id (constant per series, including new gap rows).
        series_filled[unique_id_col] = uid
        series_filled = series_filled.reset_index()
        # reindex() uses an unnamed DatetimeIndex, so reset_index() emits "index".
        if "index" in series_filled.columns:
            series_filled = series_filled.rename(columns={"index": ds_col})

        filled_frames.append(series_filled)

    if not filled_frames:
        return df.copy()

    result = pd.concat(filled_frames, ignore_index=True)

    # Reorder columns to standard order: identifier, timestamp, then the rest.
    cols = [unique_id_col, ds_col] + [c for c in result.columns if c not in {unique_id_col, ds_col}]
    return result[cols]
|