tsagentkit 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsagentkit/__init__.py +126 -0
- tsagentkit/anomaly/__init__.py +130 -0
- tsagentkit/backtest/__init__.py +48 -0
- tsagentkit/backtest/engine.py +788 -0
- tsagentkit/backtest/metrics.py +244 -0
- tsagentkit/backtest/report.py +342 -0
- tsagentkit/calibration/__init__.py +136 -0
- tsagentkit/contracts/__init__.py +133 -0
- tsagentkit/contracts/errors.py +275 -0
- tsagentkit/contracts/results.py +418 -0
- tsagentkit/contracts/schema.py +44 -0
- tsagentkit/contracts/task_spec.py +300 -0
- tsagentkit/covariates/__init__.py +340 -0
- tsagentkit/eval/__init__.py +285 -0
- tsagentkit/features/__init__.py +20 -0
- tsagentkit/features/covariates.py +328 -0
- tsagentkit/features/extra/__init__.py +5 -0
- tsagentkit/features/extra/native.py +179 -0
- tsagentkit/features/factory.py +187 -0
- tsagentkit/features/matrix.py +159 -0
- tsagentkit/features/tsfeatures_adapter.py +115 -0
- tsagentkit/features/versioning.py +203 -0
- tsagentkit/hierarchy/__init__.py +39 -0
- tsagentkit/hierarchy/aggregation.py +62 -0
- tsagentkit/hierarchy/evaluator.py +400 -0
- tsagentkit/hierarchy/reconciliation.py +232 -0
- tsagentkit/hierarchy/structure.py +453 -0
- tsagentkit/models/__init__.py +182 -0
- tsagentkit/models/adapters/__init__.py +83 -0
- tsagentkit/models/adapters/base.py +321 -0
- tsagentkit/models/adapters/chronos.py +387 -0
- tsagentkit/models/adapters/moirai.py +256 -0
- tsagentkit/models/adapters/registry.py +171 -0
- tsagentkit/models/adapters/timesfm.py +440 -0
- tsagentkit/models/baselines.py +207 -0
- tsagentkit/models/sktime.py +307 -0
- tsagentkit/monitoring/__init__.py +51 -0
- tsagentkit/monitoring/alerts.py +302 -0
- tsagentkit/monitoring/coverage.py +203 -0
- tsagentkit/monitoring/drift.py +330 -0
- tsagentkit/monitoring/report.py +214 -0
- tsagentkit/monitoring/stability.py +275 -0
- tsagentkit/monitoring/triggers.py +423 -0
- tsagentkit/qa/__init__.py +347 -0
- tsagentkit/router/__init__.py +37 -0
- tsagentkit/router/bucketing.py +489 -0
- tsagentkit/router/fallback.py +132 -0
- tsagentkit/router/plan.py +23 -0
- tsagentkit/router/router.py +271 -0
- tsagentkit/series/__init__.py +26 -0
- tsagentkit/series/alignment.py +206 -0
- tsagentkit/series/dataset.py +449 -0
- tsagentkit/series/sparsity.py +261 -0
- tsagentkit/series/validation.py +393 -0
- tsagentkit/serving/__init__.py +39 -0
- tsagentkit/serving/orchestration.py +943 -0
- tsagentkit/serving/packaging.py +73 -0
- tsagentkit/serving/provenance.py +317 -0
- tsagentkit/serving/tsfm_cache.py +214 -0
- tsagentkit/skill/README.md +135 -0
- tsagentkit/skill/__init__.py +8 -0
- tsagentkit/skill/recipes.md +429 -0
- tsagentkit/skill/tool_map.md +21 -0
- tsagentkit/time/__init__.py +134 -0
- tsagentkit/utils/__init__.py +20 -0
- tsagentkit/utils/quantiles.py +83 -0
- tsagentkit/utils/signature.py +47 -0
- tsagentkit/utils/temporal.py +41 -0
- tsagentkit-1.0.2.dist-info/METADATA +371 -0
- tsagentkit-1.0.2.dist-info/RECORD +72 -0
- tsagentkit-1.0.2.dist-info/WHEEL +4 -0
- tsagentkit-1.0.2.dist-info/licenses/LICENSE +201 -0
tsagentkit/series/dataset.py
@@ -0,0 +1,449 @@
"""TSDataset implementation.

Immutable wrapper around DataFrame for time series data with
guaranteed schema and temporal integrity.
"""

from __future__ import annotations

from dataclasses import dataclass, field, replace
from typing import TYPE_CHECKING, Any

import pandas as pd

from tsagentkit.contracts import PanelContract, TaskSpec, validate_contract
from tsagentkit.series.validation import normalize_panel_columns

if TYPE_CHECKING:
    from tsagentkit.covariates import AlignedDataset, CovariateBundle
    from tsagentkit.hierarchy import HierarchyStructure

from .sparsity import SparsityProfile, compute_sparsity_profile


@dataclass(frozen=True)
class TSDataset:
    """Immutable time series dataset container.

    Wraps a DataFrame with guaranteed schema and provides methods
    for time series operations. All operations return new instances.

    Attributes:
        df: The underlying DataFrame (guaranteed sorted by unique_id, ds)
        task_spec: Task specification for this dataset
        sparsity_profile: Computed sparsity profile
        metadata: Additional dataset metadata

    Examples:
        >>> from tsagentkit import TaskSpec
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     "unique_id": ["A", "A", "B", "B"],
        ...     "ds": pd.date_range("2024-01-01", periods=4, freq="D"),
        ...     "y": [1.0, 2.0, 3.0, 4.0],
        ... })
        >>> spec = TaskSpec(h=7, freq="D")
        >>> dataset = TSDataset.from_dataframe(df, spec)
    """

    df: pd.DataFrame
    task_spec: TaskSpec
    sparsity_profile: SparsityProfile | None = field(default=None)
    metadata: dict[str, Any] = field(default_factory=dict)
    hierarchy: HierarchyStructure | None = field(default=None)
    static_x: pd.DataFrame | None = field(default=None)
    past_x: pd.DataFrame | None = field(default=None)
    future_x: pd.DataFrame | None = field(default=None)
    future_index: pd.DataFrame | None = field(default=None)
    covariate_spec: Any | None = field(default=None)
    covariate_bundle: CovariateBundle | None = field(default=None)
    panel_with_covariates: pd.DataFrame | None = field(default=None)

    def __post_init__(self) -> None:
        """Validate the dataset after creation."""
        # Since dataclass is frozen, we can't modify, but we can validate
        required_cols = {"unique_id", "ds", "y"}
        missing = required_cols - set(self.df.columns)
        if missing:
            raise ValueError(f"TSDataset missing required columns: {missing}")

        # Ensure datetime type
        if not pd.api.types.is_datetime64_any_dtype(self.df["ds"]):
            raise ValueError("Column 'ds' must be datetime type")

    @staticmethod
    def _normalize_panel_columns(
        df: pd.DataFrame,
        contract: PanelContract,
    ) -> tuple[pd.DataFrame, dict[str, str] | None]:
        return normalize_panel_columns(df, contract)

    @classmethod
    def from_dataframe(
        cls,
        data: pd.DataFrame,
        task_spec: TaskSpec,
        validate: bool = True,
        compute_sparsity: bool = True,
    ) -> TSDataset:
        """Create TSDataset from DataFrame.

        Args:
            data: Input DataFrame
            task_spec: Task specification
            validate: Whether to validate input (default: True)
            compute_sparsity: Whether to compute sparsity profile (default: True)

        Returns:
            New TSDataset instance

        Raises:
            ValueError: If validation fails
        """
        df = data.copy()
        contract = task_spec.panel_contract

        # Validate if requested
        if validate:
            report, df = validate_contract(
                df,
                panel_contract=contract,
                apply_aggregation=True,
                return_data=True,
            )
            if not report.valid:
                report.raise_if_errors()

        # Normalize to canonical column names
        df, column_map = cls._normalize_panel_columns(df, contract)
        if column_map:
            task_spec = task_spec.model_copy(update={"panel_contract": PanelContract()})

        # Ensure datetime
        if not pd.api.types.is_datetime64_any_dtype(df["ds"]):
            df["ds"] = pd.to_datetime(df["ds"])

        # Sort by unique_id, ds
        df = df.sort_values(["unique_id", "ds"]).reset_index(drop=True)

        # Compute sparsity profile
        sparsity = None
        if compute_sparsity:
            sparsity = compute_sparsity_profile(df)

        return cls(
            df=df,
            task_spec=task_spec,
            sparsity_profile=sparsity,
            metadata={
                "panel_contract": contract.model_dump() if hasattr(contract, "model_dump") else {},
                "column_map": column_map or {},
            },
        )

    @property
    def n_series(self) -> int:
        """Number of unique series."""
        return self.df["unique_id"].nunique()

    @property
    def n_observations(self) -> int:
        """Total number of observations."""
        return len(self.df)

    @property
    def date_range(self) -> tuple[pd.Timestamp, pd.Timestamp]:
        """Date range of the dataset."""
        return (self.df["ds"].min(), self.df["ds"].max())

    @property
    def series_ids(self) -> list[str]:
        """List of unique series IDs."""
        return sorted(self.df["unique_id"].unique().tolist())

    @property
    def freq(self) -> str:
        """Frequency from task spec."""
        return self.task_spec.freq

    def get_series(self, unique_id: str) -> pd.DataFrame:
        """Get data for a single series.

        Args:
            unique_id: Series identifier

        Returns:
            DataFrame with series data
        """
        return self.df[self.df["unique_id"] == unique_id].copy()

    def filter_series(self, series_ids: list[str]) -> TSDataset:
        """Create new dataset with only specified series.

        Args:
            series_ids: List of series IDs to keep

        Returns:
            New TSDataset instance
        """
        mask = self.df["unique_id"].isin(series_ids)
        new_df = self.df[mask].copy()

        return TSDataset(
            df=new_df,
            task_spec=self.task_spec,
            sparsity_profile=self.sparsity_profile,  # Keep original profile
            metadata=self.metadata.copy(),
        )

    def filter_dates(
        self,
        start: str | pd.Timestamp | None = None,
        end: str | pd.Timestamp | None = None,
    ) -> TSDataset:
        """Create new dataset filtered by date range.

        Args:
            start: Start date (inclusive)
            end: End date (inclusive)

        Returns:
            New TSDataset instance
        """
        mask = pd.Series(True, index=self.df.index)

        if start is not None:
            start_ts = pd.to_datetime(start)
            mask &= self.df["ds"] >= start_ts

        if end is not None:
            end_ts = pd.to_datetime(end)
            mask &= self.df["ds"] <= end_ts

        new_df = self.df[mask].copy()

        return TSDataset(
            df=new_df,
            task_spec=self.task_spec,
            sparsity_profile=None,  # Need to recompute
            metadata=self.metadata.copy(),
        )

    def split_train_test(
        self,
        test_size: int | None = None,
        test_start: str | pd.Timestamp | None = None,
    ) -> tuple[TSDataset, TSDataset]:
        """Split dataset into train and test sets.

        Temporal split - uses cutoff date or last N observations per series.

        Args:
            test_size: Number of observations for test (per series)
            test_start: Start date for test set

        Returns:
            Tuple of (train_dataset, test_dataset)

        Raises:
            ValueError: If neither test_size nor test_start provided
        """
        if test_start is not None:
            # Use date-based split
            cutoff = pd.to_datetime(test_start)
            train_mask = self.df["ds"] < cutoff
        elif test_size is not None:
            # Use last N observations per series
            train_mask = pd.Series(False, index=self.df.index)

            for uid in self.df["unique_id"].unique():
                series_idx = self.df[self.df["unique_id"] == uid].index
                if len(series_idx) > test_size:
                    train_idx = series_idx[:-test_size]
                    train_mask.loc[train_idx] = True
        else:
            raise ValueError("Must provide either test_size or test_start")

        train_df = self.df[train_mask].copy()
        test_df = self.df[~train_mask].copy()

        train_ds = TSDataset(
            df=train_df,
            task_spec=self.task_spec,
            sparsity_profile=None,  # Need to recompute
            metadata=self.metadata.copy(),
        )

        test_ds = TSDataset(
            df=test_df,
            task_spec=self.task_spec,
            sparsity_profile=None,
            metadata=self.metadata.copy(),
        )

        return train_ds, test_ds

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization.

        Note: DataFrame is converted to records format.
        """
        return {
            "df": self.df.to_dict("records"),
            "task_spec": self.task_spec.model_dump(),
            "sparsity_profile": self.sparsity_profile.series_profiles if self.sparsity_profile else None,
            "metadata": self.metadata,
            "hierarchy": self.hierarchy is not None,
            "covariates": {
                "static_x_rows": int(len(self.static_x)) if self.static_x is not None else 0,
                "past_x_rows": int(len(self.past_x)) if self.past_x is not None else 0,
                "future_x_rows": int(len(self.future_x)) if self.future_x is not None else 0,
                "future_index_rows": int(len(self.future_index)) if self.future_index is not None else 0,
                "covariate_spec": (
                    self.covariate_spec.model_dump()
                    if hasattr(self.covariate_spec, "model_dump")
                    else self.covariate_spec
                ),
            },
        }

    def with_hierarchy(self, hierarchy: HierarchyStructure) -> TSDataset:
        """Return new TSDataset with hierarchy attached.

        Args:
            hierarchy: Hierarchy structure to attach

        Returns:
            New TSDataset instance with hierarchy
        """
        return replace(self, hierarchy=hierarchy)

    def with_covariates(
        self,
        aligned: AlignedDataset | None,
        panel_with_covariates: pd.DataFrame | None = None,
        covariate_bundle: CovariateBundle | None = None,
    ) -> TSDataset:
        """Return new TSDataset with covariates attached."""
        if aligned is None:
            return replace(
                self,
                static_x=None,
                past_x=None,
                future_x=None,
                future_index=None,
                covariate_spec=None,
                covariate_bundle=covariate_bundle,
                panel_with_covariates=panel_with_covariates,
            )

        return replace(
            self,
            static_x=aligned.static_x,
            past_x=aligned.past_x,
            future_x=aligned.future_x,
            future_index=aligned.future_index,
            covariate_spec=aligned.covariate_spec,
            covariate_bundle=covariate_bundle,
            panel_with_covariates=panel_with_covariates,
        )

    def is_hierarchical(self) -> bool:
        """Check if dataset has hierarchy.

        Returns:
            True if hierarchy is attached
        """
        return self.hierarchy is not None

    def get_level_series(self, level: int) -> list[str]:
        """Get all series IDs at a specific hierarchy level.

        Args:
            level: Hierarchy level (0 = root)

        Returns:
            List of series IDs at that level
        """
        if not self.hierarchy:
            return self.series_ids

        return [
            node for node in self.hierarchy.all_nodes
            if self.hierarchy.get_level(node) == level
        ]

    def aggregate_to_level(self, target_level: int) -> TSDataset:
        """Aggregate bottom-level data to target hierarchy level.

        Args:
            target_level: Target hierarchy level

        Returns:
            New TSDataset with aggregated data
        """
        if not self.hierarchy:
            raise ValueError("Dataset does not have hierarchy")

        target_nodes = self.hierarchy.get_nodes_at_level(target_level)

        # Aggregate data for target nodes
        aggregated_rows = []
        for node in target_nodes:
            # Find all bottom nodes that contribute to this node
            bottom_contributors = []
            for bottom_node in self.hierarchy.bottom_nodes:
                bottom_idx = self.hierarchy.all_nodes.index(bottom_node)
                node_idx = self.hierarchy.all_nodes.index(node)
                if self.hierarchy.s_matrix[node_idx, bottom_idx] == 1:
                    bottom_contributors.append(bottom_node)

            # Get data for contributors
            node_data = self.df[self.df["unique_id"].isin(bottom_contributors)]

            # Aggregate by date
            if not node_data.empty:
                aggregated = node_data.groupby("ds")["y"].sum().reset_index()
                aggregated["unique_id"] = node
                aggregated_rows.append(aggregated)

        if not aggregated_rows:
            raise ValueError(f"No data found for level {target_level}")

        new_df = pd.concat(aggregated_rows, ignore_index=True)

        # Reorder columns to match expected format
        new_df = new_df[["unique_id", "ds", "y"]]

        return replace(
            self,
            df=new_df,
            sparsity_profile=None,  # Need to recompute
        )


def build_dataset(
    data: pd.DataFrame,
    task_spec: TaskSpec,
    validate: bool = True,
    compute_sparsity: bool = True,
) -> TSDataset:
    """Build a TSDataset from raw data.

    Convenience function for creating TSDataset.

    Args:
        data: Input DataFrame
        task_spec: Task specification
        validate: Whether to validate input (default: True)
        compute_sparsity: Whether to compute sparsity profile (default: True)

    Returns:
        New TSDataset instance
    """
    return TSDataset.from_dataframe(
        data=data,
        task_spec=task_spec,
        validate=validate,
        compute_sparsity=compute_sparsity,
    )
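For orientation, a minimal usage sketch of this module follows. It is not part of the package; it assumes the module path tsagentkit/series/dataset.py shown in the file list, the TaskSpec(h=..., freq=...) constructor used in the class docstring above, and that the default panel contract accepts the canonical unique_id/ds/y columns. The data is illustrative.

import pandas as pd

from tsagentkit import TaskSpec
from tsagentkit.series.dataset import TSDataset

# Two daily series, 30 points each (illustrative data).
df = pd.DataFrame({
    "unique_id": ["A"] * 30 + ["B"] * 30,
    "ds": list(pd.date_range("2024-01-01", periods=30, freq="D")) * 2,
    "y": [float(i) for i in range(60)],
})

spec = TaskSpec(h=7, freq="D")
dataset = TSDataset.from_dataframe(df, spec)  # validates, sorts, profiles sparsity

print(dataset.n_series)        # 2
print(dataset.n_observations)  # 60
print(dataset.date_range)      # (Timestamp('2024-01-01'), Timestamp('2024-01-30'))

# Temporal split: the last 7 observations of each series become the test set.
train, test = dataset.split_train_test(test_size=7)
print(train.n_observations, test.n_observations)  # 46 14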
tsagentkit/series/sparsity.py
@@ -0,0 +1,261 @@
"""Sparsity profiling for time series.

Identifies intermittent, cold-start, and sparse series for the router.
"""

from __future__ import annotations

from dataclasses import dataclass
from enum import StrEnum

import numpy as np
import pandas as pd


class SparsityClass(StrEnum):
    """Classification of series sparsity patterns."""

    REGULAR = "regular"
    """Regular series with consistent observations."""

    INTERMITTENT = "intermittent"
    """Series with many zero values (intermittent demand)."""

    SPARSE = "sparse"
    """Series with irregular gaps in observations."""

    COLD_START = "cold_start"
    """Series with very few observations (new series)."""


@dataclass(frozen=True)
class SparsityProfile:
    """Sparsity profile for a time series dataset.

    Contains classification and metrics for each series in the dataset,
    used by the router for model selection.

    Attributes:
        series_profiles: Dict mapping unique_id to profile metrics
        dataset_metrics: Aggregate metrics across all series
    """

    series_profiles: dict[str, dict]
    dataset_metrics: dict

    def get_classification(self, unique_id: str) -> SparsityClass:
        """Get the sparsity classification for a series.

        Args:
            unique_id: The series identifier

        Returns:
            SparsityClass enum value
        """
        profile = self.series_profiles.get(unique_id, {})
        return SparsityClass(profile.get("classification", "regular"))

    def get_series_by_class(self, cls: SparsityClass) -> list[str]:
        """Get all series IDs with a given classification.

        Args:
            cls: Sparsity class to filter by

        Returns:
            List of unique_id values
        """
        return [
            uid
            for uid, profile in self.series_profiles.items()
            if profile.get("classification") == cls.value
        ]

    def has_intermittent(self) -> bool:
        """Check if dataset has any intermittent series."""
        return any(
            p.get("classification") == SparsityClass.INTERMITTENT.value
            for p in self.series_profiles.values()
        )

    def has_cold_start(self) -> bool:
        """Check if dataset has any cold-start series."""
        return any(
            p.get("classification") == SparsityClass.COLD_START.value
            for p in self.series_profiles.values()
        )


def compute_sparsity_profile(
    df: pd.DataFrame,
    min_observations: int = 10,
    zero_threshold: float = 0.3,
    gap_threshold: float = 0.2,
) -> SparsityProfile:
    """Compute sparsity profile for a dataset.

    Analyzes each series to classify as regular, intermittent, sparse,
    or cold-start based on observation patterns.

    Args:
        df: DataFrame with columns [unique_id, ds, y]
        min_observations: Minimum observations for non-cold-start (default: 10)
        zero_threshold: Threshold for zero ratio to be intermittent (default: 0.3)
        gap_threshold: Threshold for gap ratio to be sparse (default: 0.2)

    Returns:
        SparsityProfile with classifications and metrics
    """
    series_profiles: dict[str, dict] = {}

    for uid in df["unique_id"].unique():
        series = df[df["unique_id"] == uid].sort_values("ds")
        profile = _analyze_series(
            series,
            min_observations=min_observations,
            zero_threshold=zero_threshold,
            gap_threshold=gap_threshold,
        )
        series_profiles[uid] = profile

    # Compute dataset-level metrics
    dataset_metrics = _compute_dataset_metrics(series_profiles)

    return SparsityProfile(
        series_profiles=series_profiles,
        dataset_metrics=dataset_metrics,
    )


def _analyze_series(
    series: pd.DataFrame,
    min_observations: int,
    zero_threshold: float,
    gap_threshold: float,
) -> dict:
    """Analyze a single series for sparsity patterns.

    Args:
        series: DataFrame for a single series
        min_observations: Minimum observations threshold
        zero_threshold: Zero ratio threshold for intermittent
        gap_threshold: Gap ratio threshold for sparse

    Returns:
        Dictionary with classification and metrics
    """
    y = series["y"].values
    n = len(y)

    # Basic metrics
    metrics = {
        "n_observations": n,
        "zero_ratio": float(np.mean(y == 0)) if len(y) > 0 else 0.0,
        "missing_ratio": float(series["y"].isna().mean()),
    }

    # Compute gap metrics if we have datetime
    if n > 1 and pd.api.types.is_datetime64_any_dtype(series["ds"]):
        ds = pd.to_datetime(series["ds"])
        time_diffs = ds.diff().dropna()

        if len(time_diffs) > 0:
            # Most common interval
            mode_diff = time_diffs.mode()
            if len(mode_diff) > 0:
                expected_interval = mode_diff.iloc[0]
                # Count gaps (intervals significantly larger than expected)
                gap_count = int((time_diffs > expected_interval * 1.5).sum())
                metrics["gap_count"] = gap_count
                metrics["gap_ratio"] = gap_count / len(time_diffs) if len(time_diffs) > 0 else 0.0
            else:
                metrics["gap_count"] = 0
                metrics["gap_ratio"] = 0.0
        else:
            metrics["gap_count"] = 0
            metrics["gap_ratio"] = 0.0
    else:
        metrics["gap_count"] = 0
        metrics["gap_ratio"] = 0.0

    # Classification logic
    classification = _classify_series(
        metrics, n, min_observations, zero_threshold, gap_threshold
    )
    metrics["classification"] = classification

    return metrics


def _classify_series(
    metrics: dict,
    n: int,
    min_observations: int,
    zero_threshold: float,
    gap_threshold: float,
) -> str:
    """Classify a series based on its metrics.

    Classification order matters:
    1. Cold start (too few observations)
    2. Intermittent (many zeros)
    3. Sparse (many gaps)
    4. Regular (default)

    Args:
        metrics: Series metrics dictionary
        n: Number of observations
        min_observations: Threshold for cold-start
        zero_threshold: Threshold for intermittent
        gap_threshold: Threshold for sparse

    Returns:
        Classification string
    """
    # Cold start: too few observations
    if n < min_observations:
        return SparsityClass.COLD_START.value

    # Intermittent: many zero values
    if metrics.get("zero_ratio", 0) > zero_threshold:
        return SparsityClass.INTERMITTENT.value

    # Sparse: significant gaps
    if metrics.get("gap_ratio", 0) > gap_threshold:
        return SparsityClass.SPARSE.value

    # Default: regular
    return SparsityClass.REGULAR.value


def _compute_dataset_metrics(series_profiles: dict[str, dict]) -> dict:
    """Compute aggregate metrics across all series.

    Args:
        series_profiles: Dict of series profiles

    Returns:
        Dictionary of dataset-level metrics
    """
    if not series_profiles:
        return {
            "total_series": 0,
            "classification_counts": {},
            "avg_observations": 0.0,
        }

    # Count classifications
    class_counts: dict[str, int] = {}
    for profile in series_profiles.values():
        cls = profile.get("classification", "unknown")
        class_counts[cls] = class_counts.get(cls, 0) + 1

    # Average observations
    avg_obs = sum(p.get("n_observations", 0) for p in series_profiles.values()) / len(
        series_profiles
    )

    return {
        "total_series": len(series_profiles),
        "classification_counts": class_counts,
        "avg_observations": avg_obs,
    }