tsagentkit-1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsagentkit/__init__.py +126 -0
- tsagentkit/anomaly/__init__.py +130 -0
- tsagentkit/backtest/__init__.py +48 -0
- tsagentkit/backtest/engine.py +788 -0
- tsagentkit/backtest/metrics.py +244 -0
- tsagentkit/backtest/report.py +342 -0
- tsagentkit/calibration/__init__.py +136 -0
- tsagentkit/contracts/__init__.py +133 -0
- tsagentkit/contracts/errors.py +275 -0
- tsagentkit/contracts/results.py +418 -0
- tsagentkit/contracts/schema.py +44 -0
- tsagentkit/contracts/task_spec.py +300 -0
- tsagentkit/covariates/__init__.py +340 -0
- tsagentkit/eval/__init__.py +285 -0
- tsagentkit/features/__init__.py +20 -0
- tsagentkit/features/covariates.py +328 -0
- tsagentkit/features/extra/__init__.py +5 -0
- tsagentkit/features/extra/native.py +179 -0
- tsagentkit/features/factory.py +187 -0
- tsagentkit/features/matrix.py +159 -0
- tsagentkit/features/tsfeatures_adapter.py +115 -0
- tsagentkit/features/versioning.py +203 -0
- tsagentkit/hierarchy/__init__.py +39 -0
- tsagentkit/hierarchy/aggregation.py +62 -0
- tsagentkit/hierarchy/evaluator.py +400 -0
- tsagentkit/hierarchy/reconciliation.py +232 -0
- tsagentkit/hierarchy/structure.py +453 -0
- tsagentkit/models/__init__.py +182 -0
- tsagentkit/models/adapters/__init__.py +83 -0
- tsagentkit/models/adapters/base.py +321 -0
- tsagentkit/models/adapters/chronos.py +387 -0
- tsagentkit/models/adapters/moirai.py +256 -0
- tsagentkit/models/adapters/registry.py +171 -0
- tsagentkit/models/adapters/timesfm.py +440 -0
- tsagentkit/models/baselines.py +207 -0
- tsagentkit/models/sktime.py +307 -0
- tsagentkit/monitoring/__init__.py +51 -0
- tsagentkit/monitoring/alerts.py +302 -0
- tsagentkit/monitoring/coverage.py +203 -0
- tsagentkit/monitoring/drift.py +330 -0
- tsagentkit/monitoring/report.py +214 -0
- tsagentkit/monitoring/stability.py +275 -0
- tsagentkit/monitoring/triggers.py +423 -0
- tsagentkit/qa/__init__.py +347 -0
- tsagentkit/router/__init__.py +37 -0
- tsagentkit/router/bucketing.py +489 -0
- tsagentkit/router/fallback.py +132 -0
- tsagentkit/router/plan.py +23 -0
- tsagentkit/router/router.py +271 -0
- tsagentkit/series/__init__.py +26 -0
- tsagentkit/series/alignment.py +206 -0
- tsagentkit/series/dataset.py +449 -0
- tsagentkit/series/sparsity.py +261 -0
- tsagentkit/series/validation.py +393 -0
- tsagentkit/serving/__init__.py +39 -0
- tsagentkit/serving/orchestration.py +943 -0
- tsagentkit/serving/packaging.py +73 -0
- tsagentkit/serving/provenance.py +317 -0
- tsagentkit/serving/tsfm_cache.py +214 -0
- tsagentkit/skill/README.md +135 -0
- tsagentkit/skill/__init__.py +8 -0
- tsagentkit/skill/recipes.md +429 -0
- tsagentkit/skill/tool_map.md +21 -0
- tsagentkit/time/__init__.py +134 -0
- tsagentkit/utils/__init__.py +20 -0
- tsagentkit/utils/quantiles.py +83 -0
- tsagentkit/utils/signature.py +47 -0
- tsagentkit/utils/temporal.py +41 -0
- tsagentkit-1.0.2.dist-info/METADATA +371 -0
- tsagentkit-1.0.2.dist-info/RECORD +72 -0
- tsagentkit-1.0.2.dist-info/WHEEL +4 -0
- tsagentkit-1.0.2.dist-info/licenses/LICENSE +201 -0
tsagentkit/eval/__init__.py
@@ -0,0 +1,285 @@
"""Evaluation utilities for forecast metrics and summaries."""

from __future__ import annotations

from dataclasses import dataclass
from functools import partial
from typing import Any

import numpy as np
import pandas as pd

from tsagentkit.utils import parse_quantile_column


@dataclass(frozen=True)
class MetricFrame:
    """Container for metric results."""

    df: pd.DataFrame


@dataclass(frozen=True)
class ScoreSummary:
    """Aggregate metric summary."""

    df: pd.DataFrame


def _maybe_import_utilsforecast():
    try:
        from utilsforecast import evaluation as ufeval
        from utilsforecast import losses as uflosses
    except Exception:
        return None, None
    return ufeval, uflosses


def _wide_predictions(
    df: pd.DataFrame,
    id_col: str,
    ds_col: str,
    target_col: str,
    model_col: str,
    pred_col: str,
    cutoff_col: str | None,
) -> tuple[pd.DataFrame, list[str], dict[str, dict[float, str]]]:
    index_cols = [id_col, ds_col]
    if cutoff_col and cutoff_col in df.columns:
        index_cols.append(cutoff_col)
    pivot = df.pivot_table(
        index=index_cols,
        columns=model_col,
        values=pred_col,
        aggfunc="mean",
    )
    wide = pivot.reset_index()
    actuals = df[index_cols + [target_col]].drop_duplicates(subset=index_cols)
    wide = wide.merge(actuals, on=index_cols, how="left")
    model_cols = [c for c in wide.columns if c not in index_cols + [target_col]]
    quantile_cols = [c for c in df.columns if parse_quantile_column(c) is not None]
    quantile_map: dict[str, dict[float, str]] = {}

    if quantile_cols:
        for q_col in quantile_cols:
            q = parse_quantile_column(q_col)
            if q is None:
                continue
            q_pivot = df.pivot_table(
                index=index_cols,
                columns=model_col,
                values=q_col,
                aggfunc="mean",
            ).reset_index()
            rename_map: dict[str, str] = {}
            for col in q_pivot.columns:
                if col in index_cols:
                    continue
                new_col = f"{col}__{q_col}"
                rename_map[col] = new_col
                quantile_map.setdefault(col, {})[q] = new_col
            if rename_map:
                q_pivot = q_pivot.rename(columns=rename_map)
                wide = wide.merge(q_pivot, on=index_cols, how="left")

    return wide, model_cols, quantile_map


def _wrap_metric_name(func: Any, name: str) -> Any:
    func.__name__ = name
    return func


def _make_wape_metric(uflosses: Any, cutoff_col: str) -> Any:
    def _metric(
        df: pd.DataFrame,
        models: list[str],
        id_col: str = "unique_id",
        target_col: str = "y",
        **_: Any,
    ) -> pd.DataFrame:
        return uflosses.nd(
            df=df,
            models=models,
            id_col=id_col,
            target_col=target_col,
            cutoff_col=cutoff_col,
        )

    return _wrap_metric_name(_metric, "wape")


def _make_quantile_loss_metric(
    uflosses: Any,
    q: float,
    quantile_models: dict[str, str],
    cutoff_col: str,
) -> Any:
    def _metric(
        df: pd.DataFrame,
        models: list[str],
        id_col: str = "unique_id",
        target_col: str = "y",
        **_: Any,
    ) -> pd.DataFrame:
        return uflosses.quantile_loss(
            df=df,
            models=quantile_models,
            q=q,
            id_col=id_col,
            target_col=target_col,
            cutoff_col=cutoff_col,
        )

    return _wrap_metric_name(_metric, f"pinball_{q:.3f}")


def _make_wql_metric(
    uflosses: Any,
    quantile_models: dict[str, list[str]],
    quantiles: np.ndarray,
    cutoff_col: str,
) -> Any:
    def _metric(
        df: pd.DataFrame,
        models: list[str],
        id_col: str = "unique_id",
        target_col: str = "y",
        **_: Any,
    ) -> pd.DataFrame:
        return uflosses.mqloss(
            df=df,
            models=quantile_models,
            quantiles=quantiles,
            id_col=id_col,
            target_col=target_col,
            cutoff_col=cutoff_col,
        )

    return _wrap_metric_name(_metric, "wql")


def evaluate_forecasts(
    df: pd.DataFrame,
    train_df: pd.DataFrame | None = None,
    season_length: int | None = None,
    id_col: str = "unique_id",
    ds_col: str = "ds",
    target_col: str = "y",
    model_col: str = "model",
    pred_col: str = "yhat",
    cutoff_col: str | None = "cutoff",
) -> tuple[MetricFrame, ScoreSummary]:
    """Compute point + quantile metrics in a stable long schema."""
    if df.empty:
        return MetricFrame(pd.DataFrame()), ScoreSummary(pd.DataFrame())

    if model_col not in df.columns:
        df = df.copy()
        df[model_col] = "model"

    wide, model_cols, quantile_map = _wide_predictions(
        df,
        id_col=id_col,
        ds_col=ds_col,
        target_col=target_col,
        model_col=model_col,
        pred_col=pred_col,
        cutoff_col=cutoff_col,
    )

    ufeval, uflosses = _maybe_import_utilsforecast()
    if ufeval is None or uflosses is None or not model_cols:
        return MetricFrame(pd.DataFrame()), ScoreSummary(pd.DataFrame())

    cutoff_present = cutoff_col is not None and cutoff_col in wide.columns
    cutoff_name = cutoff_col if cutoff_col is not None else "cutoff"
    wide_eval = wide

    metrics: list[Any] = [uflosses.mae, uflosses.rmse, uflosses.smape]
    if hasattr(uflosses, "nd"):
        metrics.append(_make_wape_metric(uflosses, cutoff_name))

    if train_df is not None and season_length and hasattr(uflosses, "mase"):
        mase_metric = partial(uflosses.mase, seasonality=season_length)
        metrics.append(_wrap_metric_name(mase_metric, "mase"))

    if quantile_map and hasattr(uflosses, "quantile_loss"):
        available_models = [model for model in model_cols if model in quantile_map]
        if available_models:
            common_quantiles = set.intersection(
                *[
                    set(quantile_map[model].keys())
                    for model in available_models
                ]
            )
        else:
            common_quantiles = set()

        if common_quantiles:
            quantiles_sorted = sorted(common_quantiles)
            for q in quantiles_sorted:
                per_q_models = {
                    model: quantile_map[model][q]
                    for model in available_models
                    if q in quantile_map[model]
                }
                if per_q_models:
                    metrics.append(
                        _make_quantile_loss_metric(
                            uflosses=uflosses,
                            q=q,
                            quantile_models=per_q_models,
                            cutoff_col=cutoff_name,
                        )
                    )

            per_model_quantiles = {
                model: [quantile_map[model][q] for q in quantiles_sorted]
                for model in available_models
                if all(q in quantile_map[model] for q in quantiles_sorted)
            }
            if per_model_quantiles and hasattr(uflosses, "mqloss"):
                metrics.append(
                    _make_wql_metric(
                        uflosses=uflosses,
                        quantile_models=per_model_quantiles,
                        quantiles=np.asarray(quantiles_sorted, dtype=float),
                        cutoff_col=cutoff_name,
                    )
                )

    metric_df = ufeval.evaluate(
        wide_eval,
        metrics=metrics,
        models=model_cols,
        train_df=train_df,
        id_col=id_col,
        time_col=ds_col,
        target_col=target_col,
        cutoff_col=cutoff_name,
    )

    metrics_long = metric_df.copy()
    index_cols = [id_col]
    if cutoff_present and cutoff_col and cutoff_col in metrics_long.columns:
        index_cols.append(cutoff_col)

    metric_cols = [c for c in metrics_long.columns if c not in index_cols + ["metric"]]
    metrics_long = metrics_long.melt(
        id_vars=index_cols + ["metric"],
        value_vars=metric_cols,
        var_name="model",
        value_name="value",
    )

    summary = (
        metrics_long.groupby(["model", "metric"])["value"]
        .mean()
        .reset_index()
    )

    return MetricFrame(metrics_long), ScoreSummary(summary)


__all__ = ["MetricFrame", "ScoreSummary", "evaluate_forecasts"]
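The long-to-wide-to-long round trip above is easiest to see on a toy frame. A minimal usage sketch (column values are illustrative; it assumes `utilsforecast` is installed and version-compatible with the `evaluate` call above — if the import fails, the function deliberately returns empty frames):

```python
import pandas as pd

from tsagentkit.eval import evaluate_forecasts

# Two models' point forecasts for one series, long format, one cutoff.
df = pd.DataFrame({
    "unique_id": ["a"] * 4,
    "ds": pd.to_datetime(["2024-01-01", "2024-01-02"] * 2),
    "cutoff": pd.to_datetime(["2023-12-31"] * 4),
    "model": ["naive", "naive", "tsfm", "tsfm"],
    "yhat": [10.0, 11.0, 9.5, 10.5],
    "y": [10.0, 10.0, 10.0, 10.0],
})

metric_frame, summary = evaluate_forecasts(df)

# metric_frame.df is long: one row per (unique_id, cutoff, metric, model).
# summary.df averages `value` over series and cutoffs per (model, metric).
print(summary.df.sort_values(["model", "metric"]))
```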
tsagentkit/features/__init__.py
@@ -0,0 +1,20 @@
"""Feature engineering module for time series forecasting.

Provides point-in-time safe feature engineering with full versioning support.
"""

from __future__ import annotations

from tsagentkit.features.covariates import CovariateManager, CovariatePolicy
from tsagentkit.features.factory import FeatureConfig, FeatureFactory
from tsagentkit.features.matrix import FeatureMatrix
from tsagentkit.features.versioning import compute_feature_hash

__all__ = [
    "FeatureMatrix",
    "FeatureFactory",
    "FeatureConfig",
    "CovariateManager",
    "CovariatePolicy",
    "compute_feature_hash",
]
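Since this `__init__` is a re-export façade, callers can stay off the deep module paths. A quick illustrative check that both import paths resolve to the same objects:

```python
# The facade and the deep module path expose identical objects.
from tsagentkit.features import CovariateManager, CovariatePolicy
from tsagentkit.features.covariates import CovariateManager as _Deep

assert CovariateManager is _Deep
assert CovariatePolicy("known") is CovariatePolicy.KNOWN
```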
tsagentkit/features/covariates.py
@@ -0,0 +1,328 @@
"""Covariate management for known vs observed covariates with leakage protection."""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import TYPE_CHECKING

import pandas as pd

from tsagentkit.contracts.errors import ECovariateLeakage

if TYPE_CHECKING:
    pass


class CovariatePolicy(Enum):
    """Policy for handling different covariate types.

    - KNOWN: Covariates known for all time steps (e.g., holidays, promotions planned in advance)
    - OBSERVED: Covariates only observed up to current time (e.g., actual sales, weather)
    """

    KNOWN = "known"
    OBSERVED = "observed"


@dataclass(frozen=True)
class CovariateConfig:
    """Configuration specifying covariate types.

    Attributes:
        known: List of column names for known covariates
        observed: List of column names for observed covariates

    Example:
        >>> config = CovariateConfig(
        ...     known=["holiday", "promotion_planned"],
        ...     observed=["competitor_price", "weather"],
        ... )
    """

    known: list[str] = field(default_factory=list)
    observed: list[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Validate no overlap between known and observed."""
        overlap = set(self.known) & set(self.observed)
        if overlap:
            raise ValueError(f"Covariates cannot be both known and observed: {overlap}")

    def get_policy(self, column: str) -> CovariatePolicy | None:
        """Get the policy for a specific column.

        Args:
            column: Column name to check

        Returns:
            CovariatePolicy or None if column is not a covariate
        """
        if column in self.known:
            return CovariatePolicy.KNOWN
        elif column in self.observed:
            return CovariatePolicy.OBSERVED
        return None

    def all_covariates(self) -> list[str]:
        """Return all covariate column names."""
        return self.known + self.observed


def infer_covariate_config(
    df: pd.DataFrame,
    policy: str,
    id_col: str = "unique_id",
    ds_col: str = "ds",
    target_col: str = "y",
) -> CovariateConfig:
    """Infer covariate configuration based on policy and data."""
    if policy == "ignore":
        return CovariateConfig()

    covariate_cols = [
        c for c in df.columns
        if c not in {id_col, ds_col, target_col}
    ]

    if not covariate_cols:
        return CovariateConfig()

    if policy == "known":
        return CovariateConfig(known=covariate_cols, observed=[])
    if policy == "observed":
        return CovariateConfig(known=[], observed=covariate_cols)

    # Auto policy: infer based on future rows (y is null)
    future_mask = df[target_col].isna() if target_col in df.columns else None
    known: list[str] = []
    observed: list[str] = []

    for col in covariate_cols:
        if future_mask is not None and future_mask.any():
            has_future_values = df.loc[future_mask, col].notna().any()
            if has_future_values:
                known.append(col)
            else:
                observed.append(col)
        else:
            # Default to observed if we can't see future values
            observed.append(col)

    return CovariateConfig(known=known, observed=observed)


class CovariateManager:
    """Manage known vs observed covariates with leakage protection.

    This class ensures that observed covariates are properly handled to prevent
    future information from leaking into training or predictions.

    Example:
        >>> manager = CovariateManager(
        ...     known_covariates=["holiday"],
        ...     observed_covariates=["promotion"],
        ... )
        >>>
        >>> # Validate no leakage
        >>> manager.validate_for_prediction(
        ...     df, forecast_start=datetime(2024, 1, 1), horizon=7
        ... )
        >>>
        >>> # Mask observed covariates for training
        >>> train_df = manager.mask_observed_for_training(df, target_col="y")
    """

    def __init__(
        self,
        known_covariates: list[str] | None = None,
        observed_covariates: list[str] | None = None,
    ):
        """Initialize the covariate manager.

        Args:
            known_covariates: Columns known for all time steps
            observed_covariates: Columns only observed up to current time
        """
        self.known_covariates = known_covariates or []
        self.observed_covariates = observed_covariates or []

        # Check for overlap
        overlap = set(self.known_covariates) & set(self.observed_covariates)
        if overlap:
            raise ValueError(f"Covariates cannot be both known and observed: {overlap}")

    def validate_for_prediction(
        self,
        df: pd.DataFrame,
        forecast_start: datetime,
        horizon: int,
        ds_col: str = "ds",
    ) -> None:
        """Validate that observed covariates don't leak future information.

        This checks that observed covariates do not have values beyond the
        forecast start time, which would indicate future information leakage.

        Args:
            df: DataFrame with covariates
            forecast_start: Start time of the forecast period
            horizon: Forecast horizon
            ds_col: Name of the timestamp column

        Raises:
            ECovariateLeakage: If observed covariates extend beyond forecast_start

        Example:
            >>> manager = CovariateManager(observed_covariates=["promo"])
            >>> # This will raise if promo has values after 2024-01-01
            >>> manager.validate_for_prediction(df, datetime(2024, 1, 1), horizon=7)
        """
        if not self.observed_covariates:
            return

        # Check each observed covariate
        for col in self.observed_covariates:
            if col not in df.columns:
                continue

            # Find rows where observed covariate has non-null values beyond forecast_start
            future_mask = (df[ds_col] >= forecast_start) & df[col].notna()
            future_count = future_mask.sum()

            if future_count > 0:
                raise ECovariateLeakage(
                    f"Observed covariate '{col}' has {future_count} values "
                    f"at or after forecast start time {forecast_start}. "
                    "Observed covariates cannot be known in advance.",
                    context={
                        "covariate": col,
                        "forecast_start": forecast_start.isoformat(),
                        "future_values_count": int(future_count),
                    },
                )

    def mask_observed_for_training(
        self,
        df: pd.DataFrame,
        target_col: str = "y",
        ds_col: str = "ds",
        unique_id_col: str = "unique_id",
    ) -> pd.DataFrame:
        """Mask observed covariates at time t to prevent leakage during training.

        For observed covariates, we should only use values that would be available
        at prediction time. This means observed covariates at time t should be
        lagged (using values from before t) to prevent leakage.

        By default, this sets observed covariates to null for the target timestamp
        to ensure proper training. The caller is responsible for creating lagged
        versions of observed covariates before calling this method.

        Args:
            df: DataFrame with covariates
            target_col: Name of target column
            ds_col: Name of timestamp column
            unique_id_col: Name of unique_id column

        Returns:
            DataFrame with observed covariates masked at target time
        """
        if not self.observed_covariates:
            return df.copy()

        df = df.copy()

        # For training, we mask observed covariates at the prediction time
        # since they wouldn't be known yet. The model should use lagged versions.
        for col in self.observed_covariates:
            if col in df.columns:
                # Set to null - caller should create lagged features
                df[col] = None

        return df

    def create_lagged_observed_features(
        self,
        df: pd.DataFrame,
        lags: list[int],
        ds_col: str = "ds",
        unique_id_col: str = "unique_id",
    ) -> pd.DataFrame:
        """Create lagged versions of observed covariates.

        This creates lagged features for observed covariates to ensure
        point-in-time correctness. For a horizon h, observed covariates
        should be lagged by at least h to prevent leakage.

        Args:
            df: DataFrame with covariates
            lags: List of lag periods to create
            ds_col: Name of timestamp column
            unique_id_col: Name of unique_id column

        Returns:
            DataFrame with added lagged observed covariate columns
        """
        if not self.observed_covariates or not lags:
            return df.copy()

        df = df.copy()

        for col in self.observed_covariates:
            if col not in df.columns:
                continue

            for lag in lags:
                lag_col = f"{col}_lag_{lag}"
                df[lag_col] = (
                    df.groupby(unique_id_col)[col]
                    .shift(lag)
                    .values
                )

        return df

    def separate_covariates_for_prediction(
        self,
        df: pd.DataFrame,
        forecast_start: datetime,
        ds_col: str = "ds",
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Separate known and observed covariates for prediction setup.

        Returns two DataFrames:
        1. Known covariates: Can be used directly (values known for all time steps)
        2. Observed covariates: Should be masked/handled carefully

        Args:
            df: DataFrame with covariates
            forecast_start: Start of forecast period
            ds_col: Name of timestamp column

        Returns:
            Tuple of (known_covariates_df, observed_covariates_df)
        """
        all_cols = ["unique_id", ds_col]

        known_cols = all_cols + [
            col for col in self.known_covariates if col in df.columns
        ]
        observed_cols = all_cols + [
            col for col in self.observed_covariates if col in df.columns
        ]

        known_df = df[known_cols].copy() if len(known_cols) > 2 else pd.DataFrame()
        observed_df = (
            df[observed_cols].copy() if len(observed_cols) > 2 else pd.DataFrame()
        )

        return known_df, observed_df

    def get_config(self) -> CovariateConfig:
        """Get covariate configuration."""
        return CovariateConfig(
            known=self.known_covariates,
            observed=self.observed_covariates,
        )
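The manager reads as a three-step workflow: validate, lag, mask. A minimal sketch with hypothetical columns (`holiday` as a known covariate, `promo` as an observed one):

```python
from datetime import datetime

import pandas as pd

from tsagentkit.features.covariates import CovariateManager

df = pd.DataFrame({
    "unique_id": ["a"] * 5,
    "ds": pd.date_range("2024-01-01", periods=5, freq="D"),
    "y": [1.0, 2.0, 3.0, None, None],      # last two rows form the forecast window
    "holiday": [0, 0, 1, 0, 1],            # known: future values are legitimate
    "promo": [1.0, 0.0, 1.0, None, None],  # observed: must be null in the window
})

manager = CovariateManager(
    known_covariates=["holiday"],
    observed_covariates=["promo"],
)

# Step 1: validate. Passes here because `promo` is null from the forecast start
# onward; a non-null `promo` on or after 2024-01-04 would raise ECovariateLeakage.
manager.validate_for_prediction(df, forecast_start=datetime(2024, 1, 4), horizon=2)

# Step 2: lag observed covariates by at least the horizon for point-in-time safety.
train_df = manager.create_lagged_observed_features(df, lags=[2, 3])

# Step 3: null the contemporaneous observed values so training sees only the lags.
train_df = manager.mask_observed_for_training(train_df)
assert train_df["promo"].isna().all()
assert "promo_lag_2" in train_df.columns
```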