autogluon.timeseries 1.4.1b20250907__py3-none-any.whl → 1.5.1b20260122__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/timeseries/configs/hyperparameter_presets.py +13 -28
- autogluon/timeseries/configs/predictor_presets.py +23 -39
- autogluon/timeseries/dataset/ts_dataframe.py +97 -86
- autogluon/timeseries/learner.py +70 -35
- autogluon/timeseries/metrics/__init__.py +4 -4
- autogluon/timeseries/metrics/abstract.py +8 -8
- autogluon/timeseries/metrics/point.py +9 -9
- autogluon/timeseries/metrics/quantile.py +5 -5
- autogluon/timeseries/metrics/utils.py +4 -4
- autogluon/timeseries/models/__init__.py +4 -1
- autogluon/timeseries/models/abstract/abstract_timeseries_model.py +52 -50
- autogluon/timeseries/models/abstract/model_trial.py +2 -1
- autogluon/timeseries/models/abstract/tunable.py +8 -8
- autogluon/timeseries/models/autogluon_tabular/mlforecast.py +58 -62
- autogluon/timeseries/models/autogluon_tabular/per_step.py +27 -16
- autogluon/timeseries/models/autogluon_tabular/transforms.py +11 -9
- autogluon/timeseries/models/chronos/__init__.py +2 -1
- autogluon/timeseries/models/chronos/chronos2.py +395 -0
- autogluon/timeseries/models/chronos/model.py +127 -89
- autogluon/timeseries/models/chronos/{pipeline/utils.py → utils.py} +69 -37
- autogluon/timeseries/models/ensemble/__init__.py +36 -2
- autogluon/timeseries/models/ensemble/abstract.py +14 -46
- autogluon/timeseries/models/ensemble/array_based/__init__.py +3 -0
- autogluon/timeseries/models/ensemble/array_based/abstract.py +240 -0
- autogluon/timeseries/models/ensemble/array_based/models.py +185 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/__init__.py +12 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/abstract.py +88 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/linear_stacker.py +186 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/per_quantile_tabular.py +94 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/tabular.py +107 -0
- autogluon/timeseries/models/ensemble/{greedy.py → ensemble_selection.py} +41 -61
- autogluon/timeseries/models/ensemble/per_item_greedy.py +172 -0
- autogluon/timeseries/models/ensemble/weighted/__init__.py +8 -0
- autogluon/timeseries/models/ensemble/weighted/abstract.py +45 -0
- autogluon/timeseries/models/ensemble/{basic.py → weighted/basic.py} +25 -22
- autogluon/timeseries/models/ensemble/weighted/greedy.py +64 -0
- autogluon/timeseries/models/gluonts/abstract.py +32 -31
- autogluon/timeseries/models/gluonts/dataset.py +11 -11
- autogluon/timeseries/models/gluonts/models.py +0 -7
- autogluon/timeseries/models/local/__init__.py +0 -7
- autogluon/timeseries/models/local/abstract_local_model.py +15 -18
- autogluon/timeseries/models/local/naive.py +2 -2
- autogluon/timeseries/models/local/npts.py +7 -1
- autogluon/timeseries/models/local/statsforecast.py +13 -13
- autogluon/timeseries/models/multi_window/multi_window_model.py +39 -24
- autogluon/timeseries/models/registry.py +3 -4
- autogluon/timeseries/models/toto/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/__init__.py +9 -0
- autogluon/timeseries/models/toto/_internal/backbone/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/backbone/attention.py +196 -0
- autogluon/timeseries/models/toto/_internal/backbone/backbone.py +262 -0
- autogluon/timeseries/models/toto/_internal/backbone/distribution.py +70 -0
- autogluon/timeseries/models/toto/_internal/backbone/kvcache.py +136 -0
- autogluon/timeseries/models/toto/_internal/backbone/rope.py +89 -0
- autogluon/timeseries/models/toto/_internal/backbone/rotary_embedding_torch.py +342 -0
- autogluon/timeseries/models/toto/_internal/backbone/scaler.py +305 -0
- autogluon/timeseries/models/toto/_internal/backbone/transformer.py +333 -0
- autogluon/timeseries/models/toto/_internal/dataset.py +165 -0
- autogluon/timeseries/models/toto/_internal/forecaster.py +423 -0
- autogluon/timeseries/models/toto/dataloader.py +108 -0
- autogluon/timeseries/models/toto/hf_pretrained_model.py +200 -0
- autogluon/timeseries/models/toto/model.py +249 -0
- autogluon/timeseries/predictor.py +541 -162
- autogluon/timeseries/regressor.py +27 -30
- autogluon/timeseries/splitter.py +3 -27
- autogluon/timeseries/trainer/ensemble_composer.py +444 -0
- autogluon/timeseries/trainer/model_set_builder.py +9 -9
- autogluon/timeseries/trainer/prediction_cache.py +16 -16
- autogluon/timeseries/trainer/trainer.py +300 -279
- autogluon/timeseries/trainer/utils.py +17 -0
- autogluon/timeseries/transforms/covariate_scaler.py +8 -8
- autogluon/timeseries/transforms/target_scaler.py +15 -15
- autogluon/timeseries/utils/constants.py +10 -0
- autogluon/timeseries/utils/datetime/lags.py +1 -3
- autogluon/timeseries/utils/datetime/seasonality.py +1 -3
- autogluon/timeseries/utils/features.py +31 -14
- autogluon/timeseries/utils/forecast.py +6 -7
- autogluon/timeseries/utils/timer.py +173 -0
- autogluon/timeseries/version.py +1 -1
- autogluon.timeseries-1.5.1b20260122-py3.11-nspkg.pth +1 -0
- {autogluon.timeseries-1.4.1b20250907.dist-info → autogluon_timeseries-1.5.1b20260122.dist-info}/METADATA +39 -22
- autogluon_timeseries-1.5.1b20260122.dist-info/RECORD +103 -0
- {autogluon.timeseries-1.4.1b20250907.dist-info → autogluon_timeseries-1.5.1b20260122.dist-info}/WHEEL +1 -1
- autogluon/timeseries/evaluator.py +0 -6
- autogluon/timeseries/models/chronos/pipeline/__init__.py +0 -10
- autogluon/timeseries/models/chronos/pipeline/base.py +0 -160
- autogluon/timeseries/models/chronos/pipeline/chronos.py +0 -544
- autogluon/timeseries/models/chronos/pipeline/chronos_bolt.py +0 -580
- autogluon.timeseries-1.4.1b20250907-py3.9-nspkg.pth +0 -1
- autogluon.timeseries-1.4.1b20250907.dist-info/RECORD +0 -75
- {autogluon.timeseries-1.4.1b20250907.dist-info → autogluon_timeseries-1.5.1b20260122.dist-info/licenses}/LICENSE +0 -0
- {autogluon.timeseries-1.4.1b20250907.dist-info → autogluon_timeseries-1.5.1b20260122.dist-info/licenses}/NOTICE +0 -0
- {autogluon.timeseries-1.4.1b20250907.dist-info → autogluon_timeseries-1.5.1b20260122.dist-info}/namespace_packages.txt +0 -0
- {autogluon.timeseries-1.4.1b20250907.dist-info → autogluon_timeseries-1.5.1b20260122.dist-info}/top_level.txt +0 -0
- {autogluon.timeseries-1.4.1b20250907.dist-info → autogluon_timeseries-1.5.1b20260122.dist-info}/zip-safe +0 -0
autogluon/timeseries/models/local/abstract_local_model.py

@@ -1,7 +1,7 @@
 import logging
 import time
 from multiprocessing import TimeoutError
-from typing import Any, Callable
+from typing import Any, Callable
 
 import numpy as np
 import pandas as pd
@@ -9,19 +9,16 @@ from joblib import Parallel, cpu_count, delayed
 from scipy.stats import norm
 
 from autogluon.core.utils.exceptions import TimeLimitExceeded
-from autogluon.timeseries.dataset
+from autogluon.timeseries.dataset import TimeSeriesDataFrame
 from autogluon.timeseries.metrics import TimeSeriesScorer
 from autogluon.timeseries.models.abstract import AbstractTimeSeriesModel
+from autogluon.timeseries.utils.constants import AG_DEFAULT_N_JOBS
 from autogluon.timeseries.utils.datetime import get_seasonality
 from autogluon.timeseries.utils.warning_filters import warning_filter
 
 logger = logging.getLogger(__name__)
 
 
-# We use the same default n_jobs across AG-TS to ensure that Joblib reuses the process pool
-AG_DEFAULT_N_JOBS = max(cpu_count(only_physical_cores=True), 1)
-
-
 class AbstractLocalModel(AbstractTimeSeriesModel):
     """Abstract class for local forecasting models that are trained separately for each time series.
 
@@ -40,18 +37,18 @@ class AbstractLocalModel(AbstractTimeSeriesModel):
     """
 
     allowed_local_model_args: list[str] = []
-    default_max_ts_length:
+    default_max_ts_length: int | None = 2500
     default_max_time_limit_ratio = 1.0
     init_time_in_seconds: int = 0
 
     def __init__(
         self,
-        freq:
+        freq: str | None = None,
         prediction_length: int = 1,
-        path:
-        name:
-        eval_metric:
-        hyperparameters:
+        path: str | None = None,
+        name: str | None = None,
+        eval_metric: str | TimeSeriesScorer | None = None,
+        hyperparameters: dict[str, Any] | None = None,
         **kwargs,  # noqa
     ):
         super().__init__(
@@ -79,10 +76,10 @@ class AbstractLocalModel(AbstractTimeSeriesModel):
     def preprocess(
         self,
         data: TimeSeriesDataFrame,
-        known_covariates:
+        known_covariates: TimeSeriesDataFrame | None = None,
         is_train: bool = False,
         **kwargs,
-    ) -> tuple[TimeSeriesDataFrame,
+    ) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame | None]:
         if not self._get_tags()["allow_nan"]:
             data = data.fill_missing_values()
         return data, known_covariates
@@ -95,7 +92,7 @@ class AbstractLocalModel(AbstractTimeSeriesModel):
         }
 
     @staticmethod
-    def _compute_n_jobs(n_jobs:
+    def _compute_n_jobs(n_jobs: int | float) -> int:
         if isinstance(n_jobs, float) and 0 < n_jobs <= 1:
             return max(int(cpu_count() * n_jobs), 1)
         elif isinstance(n_jobs, int):
@@ -103,7 +100,7 @@ class AbstractLocalModel(AbstractTimeSeriesModel):
         else:
             raise ValueError(f"n_jobs must be a float between 0 and 1 or an integer (received n_jobs = {n_jobs})")
 
-    def _fit(self, train_data: TimeSeriesDataFrame, time_limit:
+    def _fit(self, train_data: TimeSeriesDataFrame, time_limit: int | None = None, **kwargs):
         self._check_fit_params()
 
         if time_limit is not None and time_limit < self.init_time_in_seconds:
@@ -145,7 +142,7 @@ class AbstractLocalModel(AbstractTimeSeriesModel):
         data = data.slice_by_timestep(-max_ts_length, None)
 
         indptr = data.get_indptr()
-        target_series = data[self.target].droplevel(level=ITEMID)
+        target_series = data[self.target].droplevel(level=TimeSeriesDataFrame.ITEMID)
         all_series = (target_series[indptr[i] : indptr[i + 1]] for i in range(len(indptr) - 1))
 
         # timeout ensures that no individual job takes longer than time_limit
@@ -184,7 +181,7 @@ class AbstractLocalModel(AbstractTimeSeriesModel):
         self,
         time_series: pd.Series,
         use_fallback_model: bool,
-        end_time:
+        end_time: float | None = None,
     ) -> tuple[pd.DataFrame, bool]:
         if end_time is not None and time.time() >= end_time:
             raise TimeLimitExceeded
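For reference, the _compute_n_jobs helper changed above maps the user-facing n_jobs value to a concrete worker count. Below is a minimal standalone sketch of that mapping; the positive-integer and -1 branches are cut off in the hunk and are filled in here from the behavior described in the local models' n_jobs docstrings further down, so treat them as an approximation rather than the exact implementation.

from joblib import cpu_count

def resolve_n_jobs(n_jobs):
    # Fraction of available cores (0 < n_jobs <= 1): scale the core count, use at least one worker.
    if isinstance(n_jobs, float) and 0 < n_jobs <= 1:
        return max(int(cpu_count() * n_jobs), 1)
    # Positive integer: use exactly that many workers (assumed branch, not visible in the hunk).
    if isinstance(n_jobs, int) and n_jobs > 0:
        return n_jobs
    # -1: use all available cores (assumed branch, mirroring the docstring).
    if n_jobs == -1:
        return cpu_count()
    raise ValueError(f"n_jobs must be a float between 0 and 1 or an integer (received n_jobs = {n_jobs})")

print(resolve_n_jobs(0.5), resolve_n_jobs(2), resolve_n_jobs(-1))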
autogluon/timeseries/models/local/naive.py

@@ -96,7 +96,7 @@ class AverageModel(AbstractLocalModel):
         When set to a float between 0.0 and 1.0, that fraction of available CPU cores is used.
         When set to a positive integer, that many cores are used.
         When set to -1, all CPU cores are used.
-    max_ts_length :
+    max_ts_length : int | None, default = None
         If not None, only the last ``max_ts_length`` time steps of each time series will be used to train the model.
         This significantly speeds up fitting and usually leads to no change in accuracy.
     """
@@ -136,7 +136,7 @@ class SeasonalAverageModel(AbstractLocalModel):
         When set to a float between 0.0 and 1.0, that fraction of available CPU cores is used.
         When set to a positive integer, that many cores are used.
         When set to -1, all CPU cores are used.
-    max_ts_length :
+    max_ts_length : int | None, default = None
         If not None, only the last ``max_ts_length`` time steps of each time series will be used to train the model.
         This significantly speeds up fitting and usually leads to no change in accuracy.
     """
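The max_ts_length option documented above is applied in the abstract local model via data.slice_by_timestep(-max_ts_length, None) (see the earlier hunk): only the most recent time steps of each series are kept before fitting. A rough pandas equivalent of that truncation, using an illustrative two-item frame:

import pandas as pd

max_ts_length = 3
df = pd.DataFrame(
    {"target": range(10)},
    index=pd.MultiIndex.from_product(
        [["A", "B"], pd.date_range("2024-01-01", periods=5)], names=["item_id", "timestamp"]
    ),
)
# Keep only the last `max_ts_length` timestamps of each item, mirroring slice_by_timestep(-max_ts_length, None).
truncated = df.groupby(level="item_id", group_keys=False).tail(max_ts_length)
print(truncated)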
autogluon/timeseries/models/local/npts.py

@@ -31,7 +31,7 @@ class NPTSModel(AbstractLocalModel):
         When set to a float between 0.0 and 1.0, that fraction of available CPU cores is used.
         When set to a positive integer, that many cores are used.
         When set to -1, all CPU cores are used.
-    max_ts_length :
+    max_ts_length : int | None, default = 2500
         If not None, only the last ``max_ts_length`` time steps of each time series will be used to train the model.
         This significantly speeds up fitting and usually leads to no change in accuracy.
     """
@@ -59,6 +59,11 @@
     ) -> pd.DataFrame:
         from gluonts.model.npts import NPTSPredictor
 
+        # NPTS model is non-deterministic due to sampling. Set seed for reproducibility in parallel processes
+        # and restore original state to avoid side effects when running with n_jobs=1
+        original_random_state = np.random.get_state()
+        np.random.seed(123)
+
         local_model_args.pop("seasonal_period")
         num_samples = local_model_args.pop("num_samples")
         num_default_time_features = local_model_args.pop("num_default_time_features")
@@ -88,6 +93,7 @@
         forecast_dict = {"mean": forecast.mean}
         for q in self.quantile_levels:
             forecast_dict[str(q)] = forecast.quantile(q)
+        np.random.set_state(original_random_state)
         return pd.DataFrame(forecast_dict)
 
     def _more_tags(self) -> dict:
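The NPTS change above pins NumPy's global seed for the duration of a single forecast and then restores the caller's random state, so repeated fits are reproducible without leaking the fixed seed into the rest of the process. The same pattern in isolation (with_fixed_seed is a hypothetical helper written for illustration, not part of the diff):

import numpy as np

def with_fixed_seed(fn, seed=123):
    # Save the caller's global RNG state, run fn under a fixed seed, then restore the state.
    original_state = np.random.get_state()
    np.random.seed(seed)
    try:
        return fn()
    finally:
        np.random.set_state(original_state)

pos_before = np.random.get_state()[2]
with_fixed_seed(lambda: np.random.rand(3))
pos_after = np.random.get_state()[2]
assert pos_before == pos_after  # the surrounding code sees an untouched random stream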
autogluon/timeseries/models/local/statsforecast.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Any,
+from typing import Any, Type
 
 import numpy as np
 import pandas as pd
@@ -19,7 +19,7 @@ class AbstractStatsForecastModel(AbstractLocalModel):
         local_model_args["season_length"] = seasonal_period
         return local_model_args
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None) -> Type:
         raise NotImplementedError
 
     def _get_local_model(self, local_model_args: dict):
@@ -162,7 +162,7 @@ class AutoARIMAModel(AbstractProbabilisticStatsForecastModel):
         local_model_args.setdefault("allowmean", True)
         return local_model_args
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import AutoARIMA
 
         return AutoARIMA
@@ -232,7 +232,7 @@ class ARIMAModel(AbstractProbabilisticStatsForecastModel):
         local_model_args.setdefault("order", (1, 1, 1))
         return local_model_args
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import ARIMA
 
         return ARIMA
@@ -269,7 +269,7 @@ class AutoETSModel(AbstractProbabilisticStatsForecastModel):
         This significantly speeds up fitting and usually leads to no change in accuracy.
     """
 
-    ag_priority =
+    ag_priority = 60
     init_time_in_seconds = 0  # C++ models require no compilation
     allowed_local_model_args = [
         "damped",
@@ -277,7 +277,7 @@ class AutoETSModel(AbstractProbabilisticStatsForecastModel):
         "seasonal_period",
     ]
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import AutoETS
 
         return AutoETS
@@ -380,7 +380,7 @@ class DynamicOptimizedThetaModel(AbstractProbabilisticStatsForecastModel):
         "seasonal_period",
     ]
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import DynamicOptimizedTheta
 
         return DynamicOptimizedTheta
@@ -425,7 +425,7 @@ class ThetaModel(AbstractProbabilisticStatsForecastModel):
         "seasonal_period",
     ]
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import Theta
 
         return Theta
@@ -546,7 +546,7 @@ class AutoCESModel(AbstractProbabilisticStatsForecastModel):
         "seasonal_period",
     ]
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import AutoCES
 
         return AutoCES
@@ -610,7 +610,7 @@ class ADIDAModel(AbstractStatsForecastIntermittentDemandModel):
 
     ag_priority = 10
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import ADIDA
 
         return ADIDA
@@ -652,7 +652,7 @@ class CrostonModel(AbstractStatsForecastIntermittentDemandModel):
         "variant",
     ]
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import CrostonClassic, CrostonOptimized, CrostonSBA
 
         model_variants = {
@@ -702,7 +702,7 @@ class IMAPAModel(AbstractStatsForecastIntermittentDemandModel):
 
     ag_priority = 10
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         from statsforecast.models import IMAPA
 
         return IMAPA
@@ -726,7 +726,7 @@ class ZeroModel(AbstractStatsForecastIntermittentDemandModel):
 
     ag_priority = 100
 
-    def _get_model_type(self, variant:
+    def _get_model_type(self, variant: str | None = None):
         # ZeroModel does not depend on a StatsForecast implementation
         raise NotImplementedError
 
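All of the StatsForecast-backed models follow the same pattern: _get_model_type lazily imports the underlying statsforecast class so the dependency is only pulled in when the model is actually used, and CrostonModel additionally dispatches on a "variant" hyperparameter. A schematic of that dispatch, with the model classes replaced by strings so the snippet runs without statsforecast installed (the default variant shown here is an assumption, not taken from the diff):

def get_croston_variant(variant=None):
    model_variants = {"classic": "CrostonClassic", "optimized": "CrostonOptimized", "sba": "CrostonSBA"}
    key = (variant or "SBA").lower()
    if key not in model_variants:
        raise ValueError(f"Invalid Croston variant: {variant}. Expected one of {list(model_variants)}")
    return model_variants[key]

print(get_croston_variant())           # CrostonSBA (assumed default)
print(get_croston_variant("classic"))  # CrostonClassic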
autogluon/timeseries/models/multi_window/multi_window_model.py

@@ -4,13 +4,13 @@ import logging
 import math
 import os
 import time
-from typing import Any,
+from typing import Any, Type
 
 import numpy as np
 from typing_extensions import Self
 
 import autogluon.core as ag
-from autogluon.timeseries.dataset
+from autogluon.timeseries.dataset import TimeSeriesDataFrame
 from autogluon.timeseries.models.abstract import AbstractTimeSeriesModel
 from autogluon.timeseries.models.local.abstract_local_model import AbstractLocalModel
 from autogluon.timeseries.splitter import AbstractWindowSplitter, ExpandingWindowSplitter
@@ -38,8 +38,8 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
 
     def __init__(
         self,
-        model_base:
-        model_base_kwargs:
+        model_base: AbstractTimeSeriesModel | Type[AbstractTimeSeriesModel],
+        model_base_kwargs: dict[str, Any] | None = None,
         **kwargs,
     ):
         if inspect.isclass(model_base) and issubclass(model_base, AbstractTimeSeriesModel):
@@ -58,8 +58,8 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         self.model_base_type = type(self.model_base)
         self.info_per_val_window = []
 
-        self.most_recent_model:
-        self.most_recent_model_folder:
+        self.most_recent_model: AbstractTimeSeriesModel | None = None
+        self.most_recent_model_folder: str | None = None
         super().__init__(**kwargs)
 
     @property
@@ -83,19 +83,19 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
     def _is_gpu_available(self) -> bool:
         return self._get_model_base()._is_gpu_available()
 
-    def get_minimum_resources(self, is_gpu_available: bool = False) -> dict[str,
+    def get_minimum_resources(self, is_gpu_available: bool = False) -> dict[str, int | float]:
         return self._get_model_base().get_minimum_resources(is_gpu_available)
 
     def _fit(
         self,
         train_data: TimeSeriesDataFrame,
-        val_data:
-        time_limit:
-        num_cpus:
-        num_gpus:
+        val_data: TimeSeriesDataFrame | None = None,
+        time_limit: float | None = None,
+        num_cpus: int | None = None,
+        num_gpus: int | None = None,
         verbosity: int = 2,
-        val_splitter:
-        refit_every_n_windows:
+        val_splitter: AbstractWindowSplitter | None = None,
+        refit_every_n_windows: int | None = 1,
         **kwargs,
     ):
         # TODO: use incremental training for GluonTS models?
@@ -109,9 +109,9 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         if refit_every_n_windows is None:
             refit_every_n_windows = val_splitter.num_val_windows + 1  # only fit model for the first window
 
-        oof_predictions_per_window = []
+        oof_predictions_per_window: list[TimeSeriesDataFrame] = []
         global_fit_start_time = time.time()
-        model:
+        model: AbstractTimeSeriesModel | None = None
 
         for window_index, (train_fold, val_fold) in enumerate(val_splitter.split(train_data)):
             logger.debug(f"\tWindow {window_index}")
@@ -142,6 +142,7 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
                 train_data=train_fold,
                 val_data=val_fold,
                 time_limit=time_left_for_window,
+                verbosity=verbosity,
                 **kwargs,
             )
             model.fit_time = time.time() - model_fit_start_time
@@ -182,8 +183,9 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         self.most_recent_model_folder = most_recent_refit_window  # type: ignore
         self.predict_time = self.most_recent_model.predict_time
         self.fit_time = time.time() - global_fit_start_time - self.predict_time  # type: ignore
-        self.
-
+        self.cache_oof_predictions(oof_predictions_per_window)
+
+        self.val_score = float(np.mean([info["val_score"] for info in self.info_per_val_window]))
 
     def get_info(self) -> dict:
         info = super().get_info()
@@ -198,7 +200,7 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
     def _predict(
         self,
         data: TimeSeriesDataFrame,
-        known_covariates:
+        known_covariates: TimeSeriesDataFrame | None = None,
         **kwargs,
     ) -> TimeSeriesDataFrame:
         if self.most_recent_model is None:
@@ -212,12 +214,25 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         store_predict_time: bool = False,
         **predict_kwargs,
     ) -> None:
-
-
-
-
+        if self._oof_predictions is None or self.most_recent_model is None:
+            raise ValueError(f"{self.name} must be fit before calling score_and_cache_oof")
+
+        # Score on val_data using the most recent model
+        past_data, known_covariates = val_data.get_model_inputs_for_scoring(
+            prediction_length=self.prediction_length, known_covariates_names=self.covariate_metadata.known_covariates
+        )
+        predict_start_time = time.time()
+        val_predictions = self.most_recent_model.predict(
+            past_data, known_covariates=known_covariates, **predict_kwargs
+        )
+
+        self._oof_predictions.append(val_predictions)
+
         if store_predict_time:
-
+            self.predict_time = time.time() - predict_start_time
+
+        if store_val_score:
+            self.val_score = self._score_with_predictions(val_data, val_predictions)
 
     def _get_search_space(self):
         return self.model_base._get_search_space()
@@ -234,7 +249,7 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         train_fn_kwargs["init_params"]["model_base_kwargs"] = self.get_params()
         return train_fn_kwargs
 
-    def save(self, path:
+    def save(self, path: str | None = None, verbose: bool = True) -> str:
         most_recent_model = self.most_recent_model
         self.most_recent_model = None
         save_path = super().save(path, verbose)
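MultiWindowBacktestingModel fits one copy of the base model per validation window produced by val_splitter (typically an ExpandingWindowSplitter, judging by the import above) and averages the per-window validation scores. A plain-NumPy schematic of expanding-window splitting, independent of the actual splitter implementation:

import numpy as np

def expanding_window_splits(n_timesteps, prediction_length, num_val_windows):
    # Yield (train_end, val_end) index pairs; each later window sees a longer training history.
    for window in reversed(range(num_val_windows)):
        val_end = n_timesteps - window * prediction_length
        yield val_end - prediction_length, val_end

y = np.arange(20)
for train_end, val_end in expanding_window_splits(len(y), prediction_length=3, num_val_windows=2):
    print(len(y[:train_end]), "past steps ->", y[train_end:val_end], "held out")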
autogluon/timeseries/models/registry.py

@@ -1,7 +1,6 @@
 from abc import ABCMeta
 from dataclasses import dataclass
 from inspect import isabstract
-from typing import Union
 
 
 @dataclass
@@ -44,7 +43,7 @@ class ModelRegistry(ABCMeta):
         cls.REGISTRY[alias] = record
 
     @classmethod
-    def _get_model_record(cls, alias:
+    def _get_model_record(cls, alias: str | type) -> ModelRecord:
         if isinstance(alias, type):
             alias = alias.__name__
         alias = alias.removesuffix("Model")
@@ -53,11 +52,11 @@ class ModelRegistry(ABCMeta):
         return cls.REGISTRY[alias]
 
     @classmethod
-    def get_model_class(cls, alias:
+    def get_model_class(cls, alias: str | type) -> type:
         return cls._get_model_record(alias).model_class
 
     @classmethod
-    def get_model_priority(cls, alias:
+    def get_model_priority(cls, alias: str | type) -> int:
         return cls._get_model_record(alias).ag_priority
 
     @classmethod
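The registry hunks above show that lookups accept either a model class or a string alias and normalize both to the same key by stripping the "Model" suffix. The normalization step in isolation:

def normalize_alias(alias):
    # A class resolves via its __name__; both forms then drop the trailing "Model".
    if isinstance(alias, type):
        alias = alias.__name__
    return alias.removesuffix("Model")

class AutoETSModel:
    pass

print(normalize_alias(AutoETSModel))    # AutoETS
print(normalize_alias("AutoETSModel"))  # AutoETS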
autogluon/timeseries/models/toto/_internal/backbone/attention.py (new file)

@@ -0,0 +1,196 @@
+# Unless explicitly stated otherwise all files in this repository are licensed under the Apache-2.0 License.
+#
+# This product includes software developed at Datadog (https://www.datadoghq.com/)
+# Copyright 2025 Datadog, Inc.
+
+import logging
+from enum import Enum
+
+import torch
+from einops import rearrange
+from torch.nn.functional import scaled_dot_product_attention
+
+from .rope import TimeAwareRotaryEmbedding
+
+log = logging.getLogger(__name__)
+
+
+class AttentionAxis(Enum):
+    TIME = 1
+    SPACE = 2
+
+
+class BaseMultiheadAttention(torch.nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float,
+        rotary_emb: TimeAwareRotaryEmbedding | None,
+        use_memory_efficient_attention: bool,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads."
+        self.head_dim = embed_dim // num_heads
+        self.rotary_emb = rotary_emb
+
+        # We allocate a single tensor for the q, k, and v projection matrices,
+        # multiply them with the inputs, and then split the projected tensors into q, k, and v using unbind.
+        # This reduces overhead a bit vs. having multiple separate Linear layers,
+        # which need to be initialized, tracked by the optimizer, etc.
+        self.wQKV = torch.nn.Linear(embed_dim, embed_dim * 3)
+        self.dropout = dropout
+        self.use_memory_efficient_attention = use_memory_efficient_attention
+        self.wO = torch.nn.Linear(embed_dim, embed_dim)
+
+        assert not self.use_memory_efficient_attention, (
+            "xformers is not available, so use_memory_efficient_attention must be False"
+        )
+
+        if not hasattr(self, "attention_axis") or self.attention_axis not in (AttentionAxis.TIME, AttentionAxis.SPACE):
+            raise ValueError("Child class must define attention_axis as AttentionAxis.TIME or AttentionAxis.SPACE.")
+
+    def rearrange_inputs(self, inputs: torch.Tensor) -> torch.Tensor:
+        pattern = (
+            "batch variate seq_len embed_dim -> (batch variate) seq_len embed_dim"
+            if self.attention_axis == AttentionAxis.TIME
+            else "batch variate seq_len embed_dim -> (batch seq_len) variate embed_dim"
+        )
+
+        return rearrange(inputs, pattern)
+
+    def get_qkv(
+        self,
+        inputs: torch.Tensor,
+    ) -> tuple[torch.Tensor, ...]:
+        pattern: str = ""
+        if self.attention_axis == AttentionAxis.TIME and self.use_memory_efficient_attention:
+            pattern = "batch_X_variate seq_len (qkv head_dim n_heads) -> qkv batch_X_variate seq_len n_heads head_dim"
+        elif self.attention_axis == AttentionAxis.TIME and not self.use_memory_efficient_attention:
+            pattern = "batch_X_variate seq_len (qkv head_dim n_heads) -> qkv batch_X_variate n_heads seq_len head_dim"
+        elif self.attention_axis == AttentionAxis.SPACE and self.use_memory_efficient_attention:
+            pattern = "batch_X_seq_len variate (qkv head_dim n_heads) -> qkv batch_X_seq_len variate n_heads head_dim"
+        elif self.attention_axis == AttentionAxis.SPACE and not self.use_memory_efficient_attention:
+            pattern = "batch_X_seq_len variate (qkv head_dim n_heads) -> qkv batch_X_seq_len n_heads variate head_dim"
+
+        assert pattern
+        qkv = self.wQKV(inputs.contiguous())
+        return rearrange(qkv, pattern, qkv=3, head_dim=self.head_dim, n_heads=self.num_heads).unbind(dim=0)
+
+    def positional_embedding(self, q, k, v, kv_cache, layer_idx):
+        # Apply the rotary embeddings
+        seq_pos_offset = 0
+        if self.rotary_emb is not None and self.attention_axis == AttentionAxis.TIME:
+            if kv_cache is not None:
+                seq_pos_offset = kv_cache.seq_len(layer_idx)
+
+            # We need to permute because rotary embeddings expect the sequence dimension to be the second-to-last dimension
+            q, k = self.rotary_emb.rotate_queries_and_keys(q, k, seq_pos_offset=seq_pos_offset)
+
+        if kv_cache is not None and self.attention_axis == AttentionAxis.TIME:
+            # First, we append the current input key and value tensors to the cache.
+            # This concatenates the current key and value tensors to the existing key and value tensors
+            kv_cache.append(layer_idx, (k, v))
+            # Then, we retrieve the key and value tensors from the cache.
+            # This includes all the key and value tensors from previous time steps
+            # as well as the current time step.
+            k, v = kv_cache[layer_idx]
+
+        q = q.contiguous()
+        k = k.contiguous().to(q.dtype)  # Ensure k is the same dtype as q; this is necessary when using mixed precision
+        v = v.contiguous().to(q.dtype)  # Ensure v is the same dtype as q; this is necessary when using mixed precision
+
+        return q, k, v, seq_pos_offset
+
+    def rearrange_output(self, output: torch.Tensor, batch: int, variate: int, seq_len: int) -> torch.Tensor:
+        if self.attention_axis == AttentionAxis.TIME and self.use_memory_efficient_attention:
+            pattern = "(batch variate) seq_len n_heads head_dim -> batch variate seq_len (n_heads head_dim)"
+        elif self.attention_axis == AttentionAxis.TIME and not self.use_memory_efficient_attention:
+            pattern = "(batch variate) n_heads seq_len head_dim -> batch variate seq_len (n_heads head_dim)"
+        elif self.attention_axis == AttentionAxis.SPACE and self.use_memory_efficient_attention:
+            pattern = "(batch seq_len) variate n_heads head_dim -> batch variate seq_len (n_heads head_dim)"
+        elif self.attention_axis == AttentionAxis.SPACE and not self.use_memory_efficient_attention:
+            pattern = "(batch seq_len) n_heads variate head_dim -> batch variate seq_len (n_heads head_dim)"
+
+        return rearrange(output, pattern, batch=batch, variate=variate, seq_len=seq_len)  # type: ignore
+
+    def run_attention(self, attention_mask, q, k, v, seq_pos_offset, dropout, seq_len, variate):
+        # Determine dimension ranges for attention
+        # Ensure the last query vector index is used from the cache
+        q_dim_start, q_dim_end = seq_pos_offset, seq_pos_offset + seq_len
+        kv_dim_start, kv_dim_end = 0, v.shape[1] if self.use_memory_efficient_attention else v.shape[2]
+        if self.attention_axis == AttentionAxis.TIME:
+            attention_mask = (
+                attention_mask[..., q_dim_start:q_dim_end, kv_dim_start:kv_dim_end]
+                if torch.is_tensor(attention_mask)
+                else None
+            )
+            return scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=attention_mask,
+                dropout_p=dropout,
+                is_causal=(attention_mask is None and seq_pos_offset == 0),
+            )
+        elif self.attention_axis == AttentionAxis.SPACE:
+            # We don't use causal masking for space-wise attention
+            attention_mask = (
+                attention_mask[..., kv_dim_start:kv_dim_end, kv_dim_start:kv_dim_end]
+                if torch.is_tensor(attention_mask)
+                else None
+            )
+            return scaled_dot_product_attention(q, k, v, attn_mask=attention_mask, dropout_p=dropout, is_causal=False)
+        else:
+            raise ValueError("Invalid attention axis")
+
+    def forward(
+        self,
+        layer_idx: int,
+        inputs: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        kv_cache=None,
+    ) -> torch.Tensor:
+        batch_size, variate, seq_len, _ = inputs.shape
+        dropout = self.dropout if self.training else 0.0
+
+        rearranged_inputs = self.rearrange_inputs(inputs)
+        q, k, v = self.get_qkv(rearranged_inputs)
+
+        q, k, v, seq_pos_offset = self.positional_embedding(q, k, v, kv_cache, layer_idx)
+
+        output = self.run_attention(attention_mask, q, k, v, seq_pos_offset, dropout, seq_len, variate)
+
+        output = self.rearrange_output(output, batch_size, variate, seq_len)
+        return self.wO(output)
+
+
+class TimeWiseMultiheadAttention(BaseMultiheadAttention):
+    """
+    Computes standard multihead causal attention over the time axis.
+    It does this by flattening out the variates along the batch dimension.
+    It also applies rotary position embeddings to the query and key matrices
+    in order to incorporate relative positional information.
+    """
+
+    attention_axis = AttentionAxis.TIME
+
+
+class SpaceWiseMultiheadAttention(BaseMultiheadAttention):
+    """
+    Computes bidirectional multihead attention over the space axis (i.e. across variates within
+    a multi-variate time series). This is done by flattening out the time axis along the batch dimension.
+    This allows the model to attend to different variates at the same time point. By alternating
+    between time-wise and space-wise attention, the model can learn both temporal and cross-variate
+    dependencies in the data.
+
+    Unlike with time-wise attention, don't apply rotary embeddings here
+    because we want cross-variate attention to be invariant to the order of the variates.
+    """
+
+    attention_axis = AttentionAxis.SPACE
+
+
+MultiHeadAttention = TimeWiseMultiheadAttention | SpaceWiseMultiheadAttention
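To make the tensor layout concrete, here is a minimal usage sketch of the new time-wise attention block. The sizes are illustrative, and the class is assumed to be importable from the new module path; in the actual Toto backbone (not shown in this excerpt) the rotary embedding and KV cache are wired in by the transformer module.

import torch
# from autogluon.timeseries.models.toto._internal.backbone.attention import TimeWiseMultiheadAttention

attn = TimeWiseMultiheadAttention(
    embed_dim=64,
    num_heads=4,
    dropout=0.0,
    rotary_emb=None,                       # rotary embeddings are optional; omitted in this sketch
    use_memory_efficient_attention=False,  # the xformers path is disabled in this port
)
x = torch.randn(2, 3, 16, 64)      # (batch, variate, seq_len, embed_dim)
out = attn(layer_idx=0, inputs=x)  # causal self-attention over the time axis
assert out.shape == x.shape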