autogluon.timeseries 1.2.1b20250424__py3-none-any.whl → 1.2.1b20250426__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/timeseries/dataset/ts_dataframe.py +9 -2
- autogluon/timeseries/learner.py +1 -4
- autogluon/timeseries/metrics/__init__.py +36 -8
- autogluon/timeseries/metrics/abstract.py +77 -7
- autogluon/timeseries/metrics/point.py +136 -47
- autogluon/timeseries/metrics/quantile.py +42 -17
- autogluon/timeseries/models/abstract/abstract_timeseries_model.py +7 -20
- autogluon/timeseries/models/autogluon_tabular/mlforecast.py +106 -66
- autogluon/timeseries/models/autogluon_tabular/transforms.py +15 -10
- autogluon/timeseries/models/ensemble/greedy.py +8 -7
- autogluon/timeseries/models/local/abstract_local_model.py +43 -36
- autogluon/timeseries/models/multi_window/multi_window_model.py +1 -1
- autogluon/timeseries/models/presets.py +0 -2
- autogluon/timeseries/predictor.py +37 -29
- autogluon/timeseries/trainer.py +23 -16
- autogluon/timeseries/version.py +1 -1
- {autogluon.timeseries-1.2.1b20250424.dist-info → autogluon.timeseries-1.2.1b20250426.dist-info}/METADATA +5 -5
- {autogluon.timeseries-1.2.1b20250424.dist-info → autogluon.timeseries-1.2.1b20250426.dist-info}/RECORD +25 -25
- /autogluon.timeseries-1.2.1b20250424-py3.9-nspkg.pth → /autogluon.timeseries-1.2.1b20250426-py3.9-nspkg.pth +0 -0
- {autogluon.timeseries-1.2.1b20250424.dist-info → autogluon.timeseries-1.2.1b20250426.dist-info}/LICENSE +0 -0
- {autogluon.timeseries-1.2.1b20250424.dist-info → autogluon.timeseries-1.2.1b20250426.dist-info}/NOTICE +0 -0
- {autogluon.timeseries-1.2.1b20250424.dist-info → autogluon.timeseries-1.2.1b20250426.dist-info}/WHEEL +0 -0
- {autogluon.timeseries-1.2.1b20250424.dist-info → autogluon.timeseries-1.2.1b20250426.dist-info}/namespace_packages.txt +0 -0
- {autogluon.timeseries-1.2.1b20250424.dist-info → autogluon.timeseries-1.2.1b20250426.dist-info}/top_level.txt +0 -0
- {autogluon.timeseries-1.2.1b20250424.dist-info → autogluon.timeseries-1.2.1b20250426.dist-info}/zip-safe +0 -0
autogluon/timeseries/metrics/quantile.py:

```diff
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Sequence

 import numpy as np
 import pandas as pd
@@ -25,6 +25,7 @@ class WQL(TimeSeriesScorer):
     - scale-dependent (time series with large absolute value contribute more to the loss)
     - equivalent to WAPE if ``quantile_levels = [0.5]``

+    If `horizon_weight` is provided, both the errors and the target time series in the denominator will be re-weighted.

     References
     ----------
@@ -34,16 +35,25 @@ class WQL(TimeSeriesScorer):
     needs_quantile = True

     def compute_metric(
-        self, data_future: TimeSeriesDataFrame, predictions: TimeSeriesDataFrame, target: str = "target", **kwargs
+        self,
+        data_future: TimeSeriesDataFrame,
+        predictions: TimeSeriesDataFrame,
+        target: str = "target",
+        **kwargs,
     ) -> float:
         y_true, q_pred, quantile_levels = self._get_quantile_forecast_score_inputs(data_future, predictions, target)
-        values_true = y_true.values[:, None]  # shape [N, 1]
-        values_pred = q_pred.values  # shape [N, len(quantile_levels)]
+        y_true = y_true.to_numpy()[:, None]  # shape [N, 1]
+        q_pred = q_pred.to_numpy()  # shape [N, len(quantile_levels)]

-        return 2 * np.mean(
-            np.abs((values_pred - values_true) * ((values_true <= values_pred) - quantile_levels)).sum(axis=0)
-            / np.abs(values_true).sum()
+        errors = (
+            np.abs((q_pred - y_true) * ((y_true <= q_pred) - quantile_levels))
+            .mean(axis=1)
+            .reshape([-1, self.prediction_length])
         )
+        if self.horizon_weight is not None:
+            errors *= self.horizon_weight
+            y_true = y_true.reshape([-1, self.prediction_length]) * self.horizon_weight
+        return 2 * np.nansum(errors) / np.nansum(np.abs(y_true))


 class SQL(TimeSeriesScorer):
```
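To make the new WQL body easy to verify outside the library, here is a minimal standalone sketch of the same computation. It assumes the inputs are flattened as `num_items * prediction_length` rows (which is what the reshape in the diff implies) and that `horizon_weight` holds one weight per forecast step; the function name and signature are illustrative, not part of the package API.

```python
import numpy as np

def weighted_quantile_loss(y_true, q_pred, quantile_levels, prediction_length, horizon_weight=None):
    """Standalone sketch of the WQL computation from the diff above."""
    y = np.asarray(y_true, dtype=float)[:, None]  # shape [N, 1]
    q = np.asarray(q_pred, dtype=float)           # shape [N, num_quantiles]
    levels = np.asarray(quantile_levels, dtype=float)

    # Pinball loss averaged over quantile levels, reshaped to one row per item
    errors = np.abs((q - y) * ((y <= q) - levels)).mean(axis=1).reshape([-1, prediction_length])
    denom = y.reshape([-1, prediction_length])
    if horizon_weight is not None:
        w = np.asarray(horizon_weight, dtype=float)  # one weight per horizon step
        errors = errors * w                          # re-weight the errors ...
        denom = denom * w                            # ... and the normalization term
    return 2 * np.nansum(errors) / np.nansum(np.abs(denom))
```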
autogluon/timeseries/metrics/quantile.py (continued):

```diff
@@ -79,7 +89,15 @@ class SQL(TimeSeriesScorer):

     needs_quantile = True

-    def __init__(self):
+    def __init__(
+        self,
+        prediction_length: int = 1,
+        seasonal_period: Optional[int] = None,
+        horizon_weight: Optional[Sequence[float]] = None,
+    ):
+        super().__init__(
+            prediction_length=prediction_length, seasonal_period=seasonal_period, horizon_weight=horizon_weight
+        )
         self._past_abs_seasonal_error: Optional[pd.Series] = None

     def save_past_metrics(
@@ -93,17 +111,24 @@ class SQL(TimeSeriesScorer):
         self._past_abs_seasonal_error = None

     def compute_metric(
-        self, data_future: TimeSeriesDataFrame, predictions: TimeSeriesDataFrame, target: str = "target", **kwargs
+        self,
+        data_future: TimeSeriesDataFrame,
+        predictions: TimeSeriesDataFrame,
+        target: str = "target",
+        **kwargs,
     ) -> float:
         if self._past_abs_seasonal_error is None:
             raise AssertionError("Call `save_past_metrics` before `compute_metric`")

         y_true, q_pred, quantile_levels = self._get_quantile_forecast_score_inputs(data_future, predictions, target)
-        q_pred = q_pred.values
-        values_true = y_true.values[:, None]  # shape [N, 1]
-
-        num_items = len(self._past_abs_seasonal_error)
-        # Reshape quantile losses to compute one score per item
-        quantile_losses = np.abs((q_pred - values_true) * ((values_true <= q_pred) - quantile_levels))
-        ql_per_item = quantile_losses.mean(axis=1).reshape([num_items, -1])
-        return 2 * self._safemean(ql_per_item / self._past_abs_seasonal_error.values[:, None])
+        q_pred = q_pred.to_numpy()
+        y_true = y_true.to_numpy()[:, None]  # shape [N, 1]
+
+        errors = (
+            np.abs((q_pred - y_true) * ((y_true <= q_pred) - quantile_levels))
+            .mean(axis=1)
+            .reshape([-1, self.prediction_length])
+        )
+        if self.horizon_weight is not None:
+            errors *= self.horizon_weight
+        return 2 * self._safemean(errors / self._past_abs_seasonal_error.to_numpy()[:, None])
```
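Given the constructor shown above (and assuming the other scorers expose the same `horizon_weight` parameter through the shared `TimeSeriesScorer.__init__` that this release reworks in `metrics/abstract.py`), a weighted metric would plausibly be configured like this; treat the snippet as an assumption rather than documented API:

```python
from autogluon.timeseries.metrics import SQL

# Assumed usage based on the __init__ signature in the diff: weight early
# forecast steps more heavily than later, harder-to-predict ones.
metric = SQL(
    prediction_length=4,
    horizon_weight=[2.0, 1.0, 0.5, 0.5],  # one weight per step of the forecast horizon
)
```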
autogluon/timeseries/models/abstract/abstract_timeseries_model.py:

```diff
@@ -57,9 +57,6 @@ class TimeSeriesModelBase(ModelBase, ABC):
         Metric by which predictions will be ultimately evaluated on future test data. This only impacts
         ``model.score()``, as eval_metric is not used during training. Available metrics can be found in
         ``autogluon.timeseries.metrics``.
-    eval_metric_seasonal_period : int, optional
-        Seasonal period used to compute some evaluation metrics such as mean absolute scaled error (MASE). Defaults to
-        ``None``, in which case the seasonal period is computed based on the data frequency.
     hyperparameters : dict, default = None
         Hyperparameters that will be used by the model (can be search spaces instead of fixed values).
         If None, model defaults are used. This is identical to passing an empty dictionary.
@@ -88,7 +85,6 @@ class TimeSeriesModelBase(ModelBase, ABC):
         target: str = "target",
         quantile_levels: Sequence[float] = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
         eval_metric: Union[str, TimeSeriesScorer, None] = None,
-        eval_metric_seasonal_period: Optional[int] = None,
     ):
         self.name = name or re.sub(r"Model$", "", self.__class__.__name__)

@@ -103,8 +99,7 @@ class TimeSeriesModelBase(ModelBase, ABC):

         self.path = os.path.join(self.path_root, self.name)

-        self.eval_metric = check_get_evaluation_metric(eval_metric)
-        self.eval_metric_seasonal_period = eval_metric_seasonal_period
+        self.eval_metric = check_get_evaluation_metric(eval_metric, prediction_length=prediction_length)
         self.target: str = target
         self.covariate_metadata = covariate_metadata or CovariateMetadata()

@@ -187,7 +182,7 @@ class TimeSeriesModelBase(ModelBase, ABC):
         )
         return hyperparameters, extra_ag_args

-    def save(self, path: Optional[str] = None, verbose=True) -> str:
+    def save(self, path: Optional[str] = None, verbose: bool = True) -> str:
         if path is None:
             path = self.path

@@ -393,8 +388,8 @@ class AbstractTimeSeriesModel(TimeSeriesModelBase, TimeSeriesTunable, ABC):
         target: str = "target",
         quantile_levels: Sequence[float] = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
         eval_metric: Union[str, TimeSeriesScorer, None] = None,
-        eval_metric_seasonal_period: Optional[int] = None,
     ):
+        # TODO: make freq a required argument in AbstractTimeSeriesModel
         super().__init__(
             path=path,
             name=name,
@@ -405,12 +400,10 @@ class AbstractTimeSeriesModel(TimeSeriesModelBase, TimeSeriesTunable, ABC):
             target=target,
             quantile_levels=quantile_levels,
             eval_metric=eval_metric,
-            eval_metric_seasonal_period=eval_metric_seasonal_period,
         )
         self.target_scaler: Optional[TargetScaler]
         self.covariate_scaler: Optional[CovariateScaler]
         self.covariate_regressor: Optional[CovariateRegressor]
-        self._initialize_transforms_and_regressor()

     def _initialize_transforms_and_regressor(self) -> None:
         self.target_scaler = get_target_scaler(self.get_hyperparameters().get("target_scaler"), target=self.target)
@@ -474,6 +467,7 @@ class AbstractTimeSeriesModel(TimeSeriesModelBase, TimeSeriesTunable, ABC):
             The fitted model object
         """
         start_time = time.monotonic()
+        self._initialize_transforms_and_regressor()

         if self.target_scaler is not None:
             train_data = self.target_scaler.fit_transform(train_data)
@@ -699,19 +693,15 @@ class AbstractTimeSeriesModel(TimeSeriesModelBase, TimeSeriesTunable, ABC):
         self,
         data: TimeSeriesDataFrame,
         predictions: TimeSeriesDataFrame,
-        metric: Optional[str] = None,
     ) -> float:
         """Compute the score measuring how well the predictions align with the data."""
-        eval_metric = self.eval_metric if metric is None else check_get_evaluation_metric(metric)
-        return eval_metric.score(
+        return self.eval_metric.score(
             data=data,
             predictions=predictions,
-            prediction_length=self.prediction_length,
             target=self.target,
-            seasonal_period=self.eval_metric_seasonal_period,
         )

-    def score(self, data: TimeSeriesDataFrame, metric: Optional[str] = None) -> float:
+    def score(self, data: TimeSeriesDataFrame) -> float:
         """Return the evaluation scores for given metric and dataset. The last
         `self.prediction_length` time steps of each time series in the input data set
         will be held out and used for computing the evaluation score. Time series
@@ -721,9 +711,6 @@ class AbstractTimeSeriesModel(TimeSeriesModelBase, TimeSeriesTunable, ABC):
         ----------
         data: TimeSeriesDataFrame
             Dataset used for scoring.
-        metric: str
-            String identifier of evaluation metric to use, from one of
-            `autogluon.timeseries.utils.metric_utils.AVAILABLE_METRICS`.

         Returns
         -------
@@ -735,7 +722,7 @@ class AbstractTimeSeriesModel(TimeSeriesModelBase, TimeSeriesTunable, ABC):
             prediction_length=self.prediction_length, known_covariates_names=self.covariate_metadata.known_covariates
         )
         predictions = self.predict(past_data, known_covariates=known_covariates)
-        return self._score_with_predictions(data=data, predictions=predictions, metric=metric)
+        return self._score_with_predictions(data=data, predictions=predictions)

     def score_and_cache_oof(
         self,
```
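The common thread in these removals is that the forecast horizon and seasonality now travel with the scorer object (`check_get_evaluation_metric(eval_metric, prediction_length=prediction_length)`) instead of being re-supplied at every `score` call. A generic sketch of that configuration-carrying-object pattern, with hypothetical names:

```python
from typing import Optional
import numpy as np

class HorizonAwareScorer:
    """Illustrative pattern only: the scorer carries its own horizon configuration."""

    def __init__(self, prediction_length: int, seasonal_period: Optional[int] = None):
        self.prediction_length = prediction_length
        self.seasonal_period = seasonal_period

    def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        # Call sites no longer pass prediction_length; it was configured once, up front.
        per_step_error = np.abs(y_true - y_pred).reshape([-1, self.prediction_length])
        return float(per_step_error.mean())
```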
autogluon/timeseries/models/autogluon_tabular/mlforecast.py:

```diff
@@ -2,15 +2,17 @@ import logging
 import math
 import os
 import time
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Collection, Dict, List, Optional, Tuple, Union

 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator
+from typing_extensions import Self

 import autogluon.core as ag
 from autogluon.tabular import TabularPredictor
 from autogluon.timeseries.dataset.ts_dataframe import ITEMID, TIMESTAMP, TimeSeriesDataFrame
+from autogluon.timeseries.metrics.abstract import TimeSeriesScorer
 from autogluon.timeseries.metrics.utils import in_sample_squared_seasonal_error
 from autogluon.timeseries.models.abstract import AbstractTimeSeriesModel
 from autogluon.timeseries.models.local import SeasonalNaiveModel
@@ -29,17 +31,21 @@ logger = logging.getLogger(__name__)
 class TabularEstimator(BaseEstimator):
     """Scikit-learn compatible interface for TabularPredictor."""

-    def __init__(self, predictor_init_kwargs: Optional[dict] = None, predictor_fit_kwargs: Optional[dict] = None):
+    def __init__(
+        self,
+        predictor_init_kwargs: Optional[Dict[str, Any]] = None,
+        predictor_fit_kwargs: Optional[Dict[str, Any]] = None,
+    ):
         self.predictor_init_kwargs = predictor_init_kwargs if predictor_init_kwargs is not None else {}
         self.predictor_fit_kwargs = predictor_fit_kwargs if predictor_fit_kwargs is not None else {}

-    def get_params(self, deep: bool = True) -> dict:
+    def get_params(self, deep: bool = True) -> Dict[str, Any]:
         return {
             "predictor_init_kwargs": self.predictor_init_kwargs,
             "predictor_fit_kwargs": self.predictor_fit_kwargs,
         }

-    def fit(self, X: pd.DataFrame, y: pd.Series) -> "TabularEstimator":
+    def fit(self, X: pd.DataFrame, y: pd.Series) -> Self:
         assert isinstance(X, pd.DataFrame) and isinstance(y, pd.Series)
         df = pd.concat([X, y.rename(MLF_TARGET).to_frame()], axis=1)
         self.predictor = TabularPredictor(**self.predictor_init_kwargs)
@@ -49,7 +55,7 @@ class TabularEstimator(BaseEstimator):

     def predict(self, X: pd.DataFrame) -> np.ndarray:
         assert isinstance(X, pd.DataFrame)
-        return self.predictor.predict(X).values
+        return self.predictor.predict(X).values  # type: ignore


 class AbstractMLForecastModel(AbstractTimeSeriesModel):
```
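`TabularEstimator` exists because MLForecast drives its models through the scikit-learn `fit(X, y)` / `predict(X)` protocol, while AutoGluon's `TabularPredictor` has its own interface. The minimal shape of such an adapter, sketched with a trivial stand-in model (illustrative, not the package's code):

```python
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator

class SketchEstimator(BaseEstimator):
    """Adapter pattern sketch: wrap a non-sklearn model behind fit/predict."""

    def __init__(self, init_kwargs=None, fit_kwargs=None):
        # Stored verbatim so that BaseEstimator's get_params/set_params
        # (and therefore cloning) keep working.
        self.init_kwargs = init_kwargs if init_kwargs is not None else {}
        self.fit_kwargs = fit_kwargs if fit_kwargs is not None else {}

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "SketchEstimator":
        self.mean_ = float(y.mean())  # stand-in for TabularPredictor.fit(...)
        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        return np.full(len(X), self.mean_)  # stand-in for TabularPredictor.predict(...)
```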
autogluon/timeseries/models/autogluon_tabular/mlforecast.py (continued):

```diff
@@ -62,9 +68,9 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
         prediction_length: int = 1,
         path: Optional[str] = None,
         name: Optional[str] = None,
-        eval_metric: str = None,
-        hyperparameters: Dict[str, Any] = None,
-        **kwargs,
+        eval_metric: Optional[Union[str, TimeSeriesScorer]] = None,
+        hyperparameters: Optional[Dict[str, Any]] = None,
+        **kwargs,
     ):
         super().__init__(
             path=path,
@@ -80,14 +86,16 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):

         self._sum_of_differences: int = 0  # number of time steps removed from each series by differencing
         self._max_ts_length: Optional[int] = None
-        self._target_lags: Optional[np.ndarray] = None
-        self._date_features: Optional[List[Callable]] = None
-        self._mlf: Optional[MLForecast] = None
+        self._target_lags: np.ndarray
+        self._date_features: List[Callable]
+        self._mlf: MLForecast
         self._scaler: Optional[BaseTargetTransform] = None
-        self._residuals_std_per_item: Optional[pd.Series] = None
+        self._residuals_std_per_item: pd.Series
         self._train_target_median: Optional[float] = None
         self._non_boolean_real_covariates: List[str] = []

+    def _initialize_transforms_and_regressor(self):
+        super()._initialize_transforms_and_regressor()
         # Do not create a scaler in the model, scaler will be passed to MLForecast
         self.target_scaler = None

@@ -95,20 +103,23 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
     def tabular_predictor_path(self) -> str:
         return os.path.join(self.path, "tabular_predictor")

-    def save(self, path: str = None, verbose: bool = True) -> str:
+    def save(self, path: Optional[str] = None, verbose: bool = True) -> str:
         assert "mean" in self._mlf.models_, "TabularPredictor must be trained before saving"
-        tabular_predictor = self._mlf.models_["mean"].predictor
-        self._mlf.models_["mean"].predictor = None
+
+        mean_estimator = self._mlf.models_["mean"]
+        assert isinstance(mean_estimator, TabularEstimator)
+
+        tabular_predictor = mean_estimator.predictor
+        mean_estimator.predictor = None  # type: ignore
         save_path = super().save(path=path, verbose=verbose)
-        self._mlf.models_["mean"].predictor = tabular_predictor
+        mean_estimator.predictor = tabular_predictor
         return save_path

     @classmethod
-    def load(
-        cls, path: str, reset_paths: bool = True, load_oof: bool = False, verbose: bool = True
-    ) -> "AbstractTimeSeriesModel":
+    def load(cls, path: str, reset_paths: bool = True, load_oof: bool = False, verbose: bool = True) -> Self:
         model = super().load(path=path, reset_paths=reset_paths, load_oof=load_oof, verbose=verbose)
         assert "mean" in model._mlf.models_, "Loaded model doesn't have a trained TabularPredictor"
+        assert isinstance(model._mlf.models_["mean"], TabularEstimator)
         model._mlf.models_["mean"].predictor = TabularPredictor.load(model.tabular_predictor_path)
         return model

```
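The `save`/`load` pair above uses a detach-and-restore pattern: the trained `TabularPredictor` is removed from the estimator before the wrapper is pickled (it is persisted separately and re-attached via `TabularPredictor.load`), then put back so the in-memory object remains usable. A generic sketch of the idea, with a hypothetical wrapper class:

```python
import pickle

class ModelWrapper:
    def __init__(self, heavy_component):
        self.heavy = heavy_component  # persisted separately, e.g. via its own save()

    def save(self, path: str) -> None:
        heavy, self.heavy = self.heavy, None  # detach before pickling
        try:
            with open(path, "wb") as f:
                pickle.dump(self, f)
        finally:
            self.heavy = heavy  # restore so the live object stays usable
```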
autogluon/timeseries/models/autogluon_tabular/mlforecast.py (continued):

```diff
@@ -131,24 +142,27 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
             data[self.target] = data[self.target].fillna(value=self._train_target_median)
         return data, known_covariates

-    def _get_extra_tabular_init_kwargs(self) -> dict:
+    def _get_extra_tabular_init_kwargs(self) -> Dict[str, Any]:
         raise NotImplementedError

-    def get_hyperparameters(self) -> dict:
-        model_params = super().get_hyperparameters()
-        model_params.setdefault("max_num_items", 20_000)
-        model_params.setdefault("max_num_samples", 1_000_000)
-        model_params.setdefault("tabular_hyperparameters", {"GBM": {}})
-        model_params.setdefault("tabular_fit_kwargs", {})
-        return model_params
+    def _get_default_hyperparameters(self) -> Dict[str, Any]:
+        return {
+            "max_num_items": 20_000,
+            "max_num_samples": 1_000_000,
+            "tabular_hyperparameters": {"GBM": {}},
+            "tabular_fit_kwargs": {},
+        }

-    def _get_mlforecast_init_args(self, train_data: TimeSeriesDataFrame, model_params: dict) -> dict:
+    def _get_mlforecast_init_args(
+        self, train_data: TimeSeriesDataFrame, model_params: Dict[str, Any]
+    ) -> Dict[str, Any]:
         from mlforecast.target_transforms import Differences

         from .transforms import MLForecastScaler

         lags = model_params.get("lags")
         if lags is None:
+            assert self.freq is not None
             lags = get_lags_for_frequency(self.freq)
         self._target_lags = np.array(sorted(set(lags)), dtype=np.int64)

@@ -159,6 +173,7 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):

         target_transforms = []
         differences = model_params.get("differences")
+        assert isinstance(differences, Collection)

         ts_lengths = train_data.num_timesteps_per_item()
         required_ts_length = sum(differences) + 1
@@ -196,7 +211,7 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
         return df

     @staticmethod
-    def _shorten_all_series(mlforecast_df: pd.DataFrame, max_length: int):
+    def _shorten_all_series(mlforecast_df: pd.DataFrame, max_length: int) -> pd.DataFrame:
         logger.debug(f"Shortening all series to at most {max_length}")
         return mlforecast_df.groupby(MLF_ITEMID, as_index=False, sort=False).tail(max_length)

@@ -231,7 +246,7 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
         # Unless we set static_features=[], MLForecast interprets all known covariates as static features
         df = self._mlf.preprocess(mlforecast_df, dropna=False, static_features=[])
         # df.query results in 2x memory saving compared to df.dropna(subset="y")
-        df = df.query("y.notnull()")
+        df = df.query("y.notnull()")  # type: ignore

         df = self._mask_df(df)

@@ -250,12 +265,12 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
         val_df = grouped_df.tail(val_rows_per_item)
         logger.debug(f"train_df shape: {train_df.shape}, val_df shape: {val_df.shape}")

-        return train_df.drop(columns=[MLF_TIMESTAMP]), val_df.drop(columns=[MLF_TIMESTAMP])
+        return train_df.drop(columns=[MLF_TIMESTAMP]), val_df.drop(columns=[MLF_TIMESTAMP])  # type: ignore

     def _to_mlforecast_df(
         self,
         data: TimeSeriesDataFrame,
-        static_features: pd.DataFrame,
+        static_features: Optional[pd.DataFrame],
         include_target: bool = True,
     ) -> pd.DataFrame:
         """Convert TimeSeriesDataFrame to a format expected by MLForecast methods `predict` and `preprocess`.
@@ -288,7 +303,9 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
         self,
         train_data: TimeSeriesDataFrame,
         val_data: Optional[TimeSeriesDataFrame] = None,
-        time_limit: Optional[int] = None,
+        time_limit: Optional[float] = None,
+        num_cpus: Optional[int] = None,
+        num_gpus: Optional[int] = None,
         verbosity: int = 2,
         **kwargs,
     ) -> None:
@@ -304,6 +321,7 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
         model_params = self.get_hyperparameters()

         mlforecast_init_args = self._get_mlforecast_init_args(train_data, model_params)
+        assert self.freq is not None
         self._mlf = MLForecast(models={}, freq=self.freq, **mlforecast_init_args)

         # We generate train/val splits from train_data and ignore val_data to avoid overfitting
@@ -327,10 +345,10 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
                 **model_params["tabular_fit_kwargs"],
             },
         )
-        self._mlf.models = {"mean": estimator}
+        self._mlf.models = {"mean": estimator}  # type: ignore

         with warning_filter():
-            self._mlf.fit_models(X=train_df.drop(columns=[MLF_TARGET, MLF_ITEMID]), y=train_df[MLF_TARGET])
+            self._mlf.fit_models(X=train_df.drop(columns=[MLF_TARGET, MLF_ITEMID]), y=train_df[MLF_TARGET])  # type: ignore

         self._save_residuals_std(val_df)

```
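The `required_ts_length = sum(differences) + 1` check in the hunk above follows from how differencing consumes history: each difference of lag `d` removes the first `d` observations of a series. A quick numeric confirmation in plain numpy (hypothetical helper, not the mlforecast implementation):

```python
import numpy as np

def apply_differences(y: np.ndarray, differences) -> np.ndarray:
    # Each difference of lag d shortens the series by d observations, so a series
    # needs at least sum(differences) + 1 points for one value to survive.
    for d in differences:
        y = y[d:] - y[:-d]
    return y

y = np.arange(30.0)
print(len(apply_differences(y, [1, 7])))  # 30 - (1 + 7) = 22 values remain
```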
autogluon/timeseries/models/autogluon_tabular/mlforecast.py (continued):

```diff
@@ -340,14 +358,19 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
         Saves per-item residuals to `self.residuals_std_per_item`.
         """
         residuals_df = val_df[[MLF_ITEMID, MLF_TARGET]]
-        residuals_df = residuals_df.assign(y_pred=self._mlf.models_["mean"].predict(val_df))
+        mean_estimator = self._mlf.models_["mean"]
+        assert isinstance(mean_estimator, TabularEstimator)
+
+        residuals_df = residuals_df.assign(y_pred=mean_estimator.predict(val_df))
         if self._scaler is not None:
             # Scaler expects to find column MLF_TIMESTAMP even though it's not used - fill with dummy
-            residuals_df = residuals_df.assign(**{MLF_TIMESTAMP: pd.Timestamp("2010-01-01")})
+            residuals_df = residuals_df.assign(**{MLF_TIMESTAMP: np.datetime64("2010-01-01")})
             residuals_df = self._scaler.inverse_transform(residuals_df)
+
+        assert isinstance(residuals_df, pd.DataFrame)
         residuals = residuals_df[MLF_TARGET] - residuals_df["y_pred"]
         self._residuals_std_per_item = (
-            residuals.pow(2.0).groupby(val_df[MLF_ITEMID].values, sort=False).mean().pow(0.5)
+            residuals.pow(2.0).groupby(val_df[MLF_ITEMID].values, sort=False).mean().pow(0.5)  # type: ignore
         )

     def _remove_short_ts_and_generate_fallback_forecast(
@@ -395,7 +418,9 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
             forecast_for_short_series = None
         return data_long, known_covariates_long, forecast_for_short_series

-    def _add_gaussian_quantiles(self, predictions: pd.DataFrame, repeated_item_ids: pd.Series, past_target: pd.Series):
+    def _add_gaussian_quantiles(
+        self, predictions: pd.DataFrame, repeated_item_ids: pd.Series, past_target: pd.Series
+    ) -> pd.DataFrame:
         """
         Add quantile levels assuming that residuals follow normal distribution
         """
@@ -410,9 +435,9 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
         # Use in-sample seasonal error in for items not seen during fit
         items_not_seen_during_fit = residuals_std_per_timestep.index[residuals_std_per_timestep.isna()].unique()
         if len(items_not_seen_during_fit) > 0:
-            scale_for_new_items: pd.Series = np.sqrt(
-                in_sample_squared_seasonal_error(y_past=past_target.loc[items_not_seen_during_fit])
-            )
+            scale_for_new_items: pd.Series = in_sample_squared_seasonal_error(
+                y_past=past_target.loc[items_not_seen_during_fit]
+            ).pow(0.5)
             residuals_std_per_timestep = residuals_std_per_timestep.fillna(scale_for_new_items)

         std_per_timestep = residuals_std_per_timestep * normal_scale_per_timestep
@@ -420,7 +445,7 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
             predictions[str(q)] = predictions["mean"] + norm.ppf(q) * std_per_timestep.to_numpy()
         return predictions

-    def _more_tags(self) -> dict:
+    def _more_tags(self) -> Dict[str, Any]:
         return {"allow_nan": True, "can_refit_full": True}


```
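`_add_gaussian_quantiles` turns a point forecast into quantile forecasts by assuming the residuals are normally distributed: quantile `q` of N(mean, std²) is `mean + norm.ppf(q) * std`, exactly the expression in the context line above. A standalone sketch of the construction (illustrative names):

```python
import numpy as np
from scipy.stats import norm

def add_gaussian_quantiles(mean: np.ndarray, residuals_std: float, quantile_levels):
    # Quantile q of a normal distribution centered on the point forecast.
    return {str(q): mean + norm.ppf(q) * residuals_std for q in quantile_levels}

forecast = add_gaussian_quantiles(np.array([10.0, 11.0, 12.0]), 2.0, [0.1, 0.5, 0.9])
print(forecast["0.5"])  # the median equals the mean under the Gaussian assumption
```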
autogluon/timeseries/models/autogluon_tabular/mlforecast.py (continued):

```diff
@@ -473,7 +498,7 @@ class DirectTabularModel(AbstractMLForecastModel):
     def is_quantile_model(self) -> bool:
         return self.eval_metric.needs_quantile

-    def get_hyperparameters(self) -> dict:
+    def get_hyperparameters(self) -> Dict[str, Any]:
         model_params = super().get_hyperparameters()
         model_params.setdefault("target_scaler", "mean_abs")
         if "differences" not in model_params or model_params["differences"] is None:
@@ -512,6 +537,7 @@ class DirectTabularModel(AbstractMLForecastModel):
         )
         if len(data) == 0:
             # All time series are too short for chosen differences
+            assert forecast_for_short_series is not None
             return forecast_for_short_series

         if known_covariates is not None:
@@ -522,15 +548,19 @@ class DirectTabularModel(AbstractMLForecastModel):
         # MLForecast raises exception of target contains NaN. We use inf as placeholder, replace them by NaN afterwards
         data_future[self.target] = float("inf")
         data_extended = pd.concat([data, data_future])
-        mlforecast_df = self._to_mlforecast_df(data_extended, data.static_features)
+        mlforecast_df = self._to_mlforecast_df(data_extended, data.static_features)  # type: ignore
         if self._max_ts_length is not None:
             # We appended `prediction_length` time steps to each series, so increase length
             mlforecast_df = self._shorten_all_series(mlforecast_df, self._max_ts_length + self.prediction_length)
         df = self._mlf.preprocess(mlforecast_df, dropna=False, static_features=[])
+        assert isinstance(df, pd.DataFrame)
+
         df = df.groupby(MLF_ITEMID, sort=False).tail(self.prediction_length)
         df = df.replace(float("inf"), float("nan"))

-        raw_predictions = self._mlf.models_["mean"].predict(df)
+        mean_estimator = self._mlf.models_["mean"]
+        assert isinstance(mean_estimator, TabularEstimator)
+        raw_predictions = mean_estimator.predict(df)
         predictions = self._postprocess_predictions(raw_predictions, repeated_item_ids=df[MLF_ITEMID])
         # Paste columns one by one to preserve dtypes
         predictions[MLF_ITEMID] = df[MLF_ITEMID].values
@@ -542,6 +572,7 @@ class DirectTabularModel(AbstractMLForecastModel):
         if self._max_ts_length is not None:
             mlforecast_df_past = self._shorten_all_series(mlforecast_df_past, self._max_ts_length)
         self._mlf.preprocess(mlforecast_df_past, static_features=[], dropna=False)
+        assert self._mlf.ts.target_transforms is not None
         for tfm in self._mlf.ts.target_transforms[::-1]:
             predictions = apply_inverse_transform(predictions, transform=tfm)

@@ -549,25 +580,30 @@ class DirectTabularModel(AbstractMLForecastModel):
         predictions = self._add_gaussian_quantiles(
             predictions, repeated_item_ids=predictions[MLF_ITEMID], past_target=data[self.target]
         )
-        predictions = TimeSeriesDataFrame(predictions.rename(columns={MLF_ITEMID: ITEMID, MLF_TIMESTAMP: TIMESTAMP}))
+        predictions_tsdf: TimeSeriesDataFrame = TimeSeriesDataFrame(
+            predictions.rename(columns={MLF_ITEMID: ITEMID, MLF_TIMESTAMP: TIMESTAMP})
+        )

         if forecast_for_short_series is not None:
-            predictions = pd.concat([predictions, forecast_for_short_series])
-            predictions = predictions.reindex(original_item_id_order, level=ITEMID)
-        return predictions
+            predictions_tsdf = pd.concat([predictions_tsdf, forecast_for_short_series])  # type: ignore
+            predictions_tsdf = predictions_tsdf.reindex(original_item_id_order, level=ITEMID)

-    def _postprocess_predictions(self, predictions: np.ndarray, repeated_item_ids: pd.Series) -> pd.DataFrame:
+        return predictions_tsdf
+
+    def _postprocess_predictions(
+        self, predictions: Union[np.ndarray, pd.Series], repeated_item_ids: pd.Series
+    ) -> pd.DataFrame:
         if self.is_quantile_model:
-            predictions = pd.DataFrame(predictions, columns=[str(q) for q in self.quantile_levels])
-            predictions.values.sort(axis=1)
-            predictions["mean"] = predictions["0.5"]
+            predictions_df = pd.DataFrame(predictions, columns=[str(q) for q in self.quantile_levels])
+            predictions_df.values.sort(axis=1)
+            predictions_df["mean"] = predictions_df["0.5"]
         else:
-            predictions = pd.DataFrame(predictions, columns=["mean"])
+            predictions_df = pd.DataFrame(predictions, columns=["mean"])

-        column_order = ["mean"] + [col for col in predictions.columns if col != "mean"]
-        return predictions[column_order]
+        column_order = ["mean"] + [col for col in predictions_df.columns if col != "mean"]
+        return predictions_df[column_order]

-    def _get_extra_tabular_init_kwargs(self) -> dict:
+    def _get_extra_tabular_init_kwargs(self) -> Dict[str, Any]:
         if self.is_quantile_model:
             return {
                 "problem_type": ag.constants.QUANTILE,
```
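A detail worth noting in `_postprocess_predictions`: `predictions_df.values.sort(axis=1)` sorts each row of the quantile matrix in place, which enforces monotonically non-decreasing quantile columns and thus prevents "quantile crossing". The trick relies on `.values` returning a view of the underlying array for a homogeneous float frame:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame([[0.9, 0.2, 0.5]], columns=["0.1", "0.5", "0.9"])
df.values.sort(axis=1)  # sorts the underlying float array row-wise, in place
print(df)  # row becomes 0.2, 0.5, 0.9 -- the quantiles no longer cross
```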
autogluon/timeseries/models/autogluon_tabular/mlforecast.py (continued):

```diff
@@ -622,7 +658,7 @@ class RecursiveTabularModel(AbstractMLForecastModel):
         end of each time series).
     """

-    def get_hyperparameters(self) -> dict:
+    def get_hyperparameters(self) -> Dict[str, Any]:
         model_params = super().get_hyperparameters()
         model_params.setdefault("target_scaler", "standard")
         if "differences" not in model_params or model_params["differences"] is None:
@@ -641,6 +677,7 @@ class RecursiveTabularModel(AbstractMLForecastModel):
         )
         if len(data) == 0:
             # All time series are too short for chosen differences
+            assert forecast_for_short_series is not None
             return forecast_for_short_series

         new_df = self._to_mlforecast_df(data, data.static_features)
@@ -648,7 +685,9 @@ class RecursiveTabularModel(AbstractMLForecastModel):
             new_df = self._shorten_all_series(new_df, self._max_ts_length)
         if known_covariates is None:
             future_index = self.get_forecast_horizon_index(data)
-            known_covariates = pd.DataFrame(columns=[self.target], index=future_index, dtype="float32")
+            known_covariates = TimeSeriesDataFrame(
+                pd.DataFrame(columns=[self.target], index=future_index, dtype="float32")
+            )
         X_df = self._to_mlforecast_df(known_covariates, data.static_features, include_target=False)
         # If both covariates & static features are missing, set X_df = None to avoid exception from MLForecast
         if len(X_df.columns.difference([MLF_ITEMID, MLF_TIMESTAMP])) == 0:
@@ -659,18 +698,19 @@ class RecursiveTabularModel(AbstractMLForecastModel):
             new_df=new_df,
             X_df=X_df,
         )
-        predictions = raw_predictions.rename(columns={MLF_ITEMID: ITEMID, MLF_TIMESTAMP: TIMESTAMP})
-        predictions = TimeSeriesDataFrame(
+        assert isinstance(raw_predictions, pd.DataFrame)
+        raw_predictions = raw_predictions.rename(columns={MLF_ITEMID: ITEMID, MLF_TIMESTAMP: TIMESTAMP})
+
+        predictions: TimeSeriesDataFrame = TimeSeriesDataFrame(
             self._add_gaussian_quantiles(
-                predictions, repeated_item_ids=predictions[ITEMID], past_target=data[self.target]
+                raw_predictions, repeated_item_ids=raw_predictions[ITEMID], past_target=data[self.target]
             )
         )
-
         if forecast_for_short_series is not None:
-            predictions = pd.concat([predictions, forecast_for_short_series])
+            predictions = pd.concat([predictions, forecast_for_short_series])  # type: ignore
         return predictions.reindex(original_item_id_order, level=ITEMID)

-    def _get_extra_tabular_init_kwargs(self) -> dict:
+    def _get_extra_tabular_init_kwargs(self) -> Dict[str, Any]:
         return {
             "problem_type": ag.constants.REGRESSION,
             "eval_metric": self.eval_metric.equivalent_tabular_regression_metric or "mean_absolute_error",
```