autogluon.timeseries 1.0.1b20240304__py3-none-any.whl → 1.4.1b20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of autogluon.timeseries might be problematic. Click here for more details.
- autogluon/timeseries/configs/__init__.py +3 -2
- autogluon/timeseries/configs/hyperparameter_presets.py +62 -0
- autogluon/timeseries/configs/predictor_presets.py +84 -0
- autogluon/timeseries/dataset/ts_dataframe.py +339 -186
- autogluon/timeseries/learner.py +192 -60
- autogluon/timeseries/metrics/__init__.py +55 -11
- autogluon/timeseries/metrics/abstract.py +96 -25
- autogluon/timeseries/metrics/point.py +186 -39
- autogluon/timeseries/metrics/quantile.py +47 -20
- autogluon/timeseries/metrics/utils.py +6 -6
- autogluon/timeseries/models/__init__.py +13 -7
- autogluon/timeseries/models/abstract/__init__.py +2 -2
- autogluon/timeseries/models/abstract/abstract_timeseries_model.py +533 -273
- autogluon/timeseries/models/abstract/model_trial.py +10 -10
- autogluon/timeseries/models/abstract/tunable.py +189 -0
- autogluon/timeseries/models/autogluon_tabular/__init__.py +2 -0
- autogluon/timeseries/models/autogluon_tabular/mlforecast.py +369 -215
- autogluon/timeseries/models/autogluon_tabular/per_step.py +513 -0
- autogluon/timeseries/models/autogluon_tabular/transforms.py +67 -0
- autogluon/timeseries/models/autogluon_tabular/utils.py +3 -51
- autogluon/timeseries/models/chronos/__init__.py +4 -0
- autogluon/timeseries/models/chronos/chronos2.py +361 -0
- autogluon/timeseries/models/chronos/model.py +738 -0
- autogluon/timeseries/models/chronos/utils.py +369 -0
- autogluon/timeseries/models/ensemble/__init__.py +35 -2
- autogluon/timeseries/models/ensemble/{abstract_timeseries_ensemble.py → abstract.py} +50 -26
- autogluon/timeseries/models/ensemble/array_based/__init__.py +3 -0
- autogluon/timeseries/models/ensemble/array_based/abstract.py +236 -0
- autogluon/timeseries/models/ensemble/array_based/models.py +73 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/__init__.py +12 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/abstract.py +88 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/linear_stacker.py +167 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/per_quantile_tabular.py +94 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/tabular.py +107 -0
- autogluon/timeseries/models/ensemble/ensemble_selection.py +167 -0
- autogluon/timeseries/models/ensemble/per_item_greedy.py +162 -0
- autogluon/timeseries/models/ensemble/weighted/__init__.py +8 -0
- autogluon/timeseries/models/ensemble/weighted/abstract.py +40 -0
- autogluon/timeseries/models/ensemble/weighted/basic.py +78 -0
- autogluon/timeseries/models/ensemble/weighted/greedy.py +57 -0
- autogluon/timeseries/models/gluonts/__init__.py +3 -1
- autogluon/timeseries/models/gluonts/abstract.py +583 -0
- autogluon/timeseries/models/gluonts/dataset.py +109 -0
- autogluon/timeseries/models/gluonts/{torch/models.py → models.py} +185 -44
- autogluon/timeseries/models/local/__init__.py +1 -10
- autogluon/timeseries/models/local/abstract_local_model.py +150 -97
- autogluon/timeseries/models/local/naive.py +31 -23
- autogluon/timeseries/models/local/npts.py +6 -2
- autogluon/timeseries/models/local/statsforecast.py +99 -112
- autogluon/timeseries/models/multi_window/multi_window_model.py +99 -40
- autogluon/timeseries/models/registry.py +64 -0
- autogluon/timeseries/models/toto/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/__init__.py +9 -0
- autogluon/timeseries/models/toto/_internal/backbone/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/backbone/attention.py +196 -0
- autogluon/timeseries/models/toto/_internal/backbone/backbone.py +262 -0
- autogluon/timeseries/models/toto/_internal/backbone/distribution.py +70 -0
- autogluon/timeseries/models/toto/_internal/backbone/kvcache.py +136 -0
- autogluon/timeseries/models/toto/_internal/backbone/rope.py +89 -0
- autogluon/timeseries/models/toto/_internal/backbone/rotary_embedding_torch.py +342 -0
- autogluon/timeseries/models/toto/_internal/backbone/scaler.py +305 -0
- autogluon/timeseries/models/toto/_internal/backbone/transformer.py +333 -0
- autogluon/timeseries/models/toto/_internal/dataset.py +165 -0
- autogluon/timeseries/models/toto/_internal/forecaster.py +423 -0
- autogluon/timeseries/models/toto/dataloader.py +108 -0
- autogluon/timeseries/models/toto/hf_pretrained_model.py +118 -0
- autogluon/timeseries/models/toto/model.py +236 -0
- autogluon/timeseries/predictor.py +826 -305
- autogluon/timeseries/regressor.py +253 -0
- autogluon/timeseries/splitter.py +10 -31
- autogluon/timeseries/trainer/__init__.py +2 -3
- autogluon/timeseries/trainer/ensemble_composer.py +439 -0
- autogluon/timeseries/trainer/model_set_builder.py +256 -0
- autogluon/timeseries/trainer/prediction_cache.py +149 -0
- autogluon/timeseries/trainer/trainer.py +1298 -0
- autogluon/timeseries/trainer/utils.py +17 -0
- autogluon/timeseries/transforms/__init__.py +2 -0
- autogluon/timeseries/transforms/covariate_scaler.py +164 -0
- autogluon/timeseries/transforms/target_scaler.py +149 -0
- autogluon/timeseries/utils/constants.py +10 -0
- autogluon/timeseries/utils/datetime/base.py +38 -20
- autogluon/timeseries/utils/datetime/lags.py +18 -16
- autogluon/timeseries/utils/datetime/seasonality.py +14 -14
- autogluon/timeseries/utils/datetime/time_features.py +17 -14
- autogluon/timeseries/utils/features.py +317 -53
- autogluon/timeseries/utils/forecast.py +31 -17
- autogluon/timeseries/utils/timer.py +173 -0
- autogluon/timeseries/utils/warning_filters.py +44 -6
- autogluon/timeseries/version.py +2 -1
- autogluon.timeseries-1.4.1b20251210-py3.11-nspkg.pth +1 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/METADATA +71 -47
- autogluon_timeseries-1.4.1b20251210.dist-info/RECORD +103 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/WHEEL +1 -1
- autogluon/timeseries/configs/presets_configs.py +0 -11
- autogluon/timeseries/evaluator.py +0 -6
- autogluon/timeseries/models/ensemble/greedy_ensemble.py +0 -170
- autogluon/timeseries/models/gluonts/abstract_gluonts.py +0 -550
- autogluon/timeseries/models/gluonts/torch/__init__.py +0 -0
- autogluon/timeseries/models/presets.py +0 -325
- autogluon/timeseries/trainer/abstract_trainer.py +0 -1144
- autogluon/timeseries/trainer/auto_trainer.py +0 -74
- autogluon.timeseries-1.0.1b20240304-py3.8-nspkg.pth +0 -1
- autogluon.timeseries-1.0.1b20240304.dist-info/RECORD +0 -58
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/LICENSE +0 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/NOTICE +0 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/namespace_packages.txt +0 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/top_level.txt +0 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/zip-safe +0 -0
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import reprlib
|
|
3
|
-
|
|
4
|
-
from
|
|
3
|
+
import time
|
|
4
|
+
from dataclasses import asdict, dataclass, field
|
|
5
|
+
from typing import Any, Literal
|
|
5
6
|
|
|
7
|
+
import numpy as np
|
|
6
8
|
import pandas as pd
|
|
7
9
|
|
|
8
10
|
from autogluon.common.features.types import R_FLOAT, R_INT
|
|
@@ -12,7 +14,8 @@ from autogluon.features.generators import (
|
|
|
12
14
|
IdentityFeatureGenerator,
|
|
13
15
|
PipelineFeatureGenerator,
|
|
14
16
|
)
|
|
15
|
-
from autogluon.timeseries import TimeSeriesDataFrame
|
|
17
|
+
from autogluon.timeseries.dataset import TimeSeriesDataFrame
|
|
18
|
+
from autogluon.timeseries.utils.warning_filters import warning_filter
|
|
16
19
|
|
|
17
20
|
logger = logging.getLogger(__name__)
|
|
18
21
|
|
|
@@ -21,18 +24,60 @@ logger = logging.getLogger(__name__)
|
|
|
21
24
|
class CovariateMetadata:
|
|
22
25
|
"""Provides mapping from different covariate types to columns in the dataset."""
|
|
23
26
|
|
|
24
|
-
static_features_cat:
|
|
25
|
-
static_features_real:
|
|
26
|
-
known_covariates_real:
|
|
27
|
-
known_covariates_cat:
|
|
28
|
-
past_covariates_real:
|
|
29
|
-
past_covariates_cat:
|
|
27
|
+
static_features_cat: list[str] = field(default_factory=list)
|
|
28
|
+
static_features_real: list[str] = field(default_factory=list)
|
|
29
|
+
known_covariates_real: list[str] = field(default_factory=list)
|
|
30
|
+
known_covariates_cat: list[str] = field(default_factory=list)
|
|
31
|
+
past_covariates_real: list[str] = field(default_factory=list)
|
|
32
|
+
past_covariates_cat: list[str] = field(default_factory=list)
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def static_features(self) -> list[str]:
|
|
36
|
+
return self.static_features_cat + self.static_features_real
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def known_covariates(self) -> list[str]:
|
|
40
|
+
return self.known_covariates_cat + self.known_covariates_real
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def past_covariates(self) -> list[str]:
|
|
44
|
+
return self.past_covariates_cat + self.past_covariates_real
|
|
45
|
+
|
|
46
|
+
@property
|
|
47
|
+
def covariates(self) -> list[str]:
|
|
48
|
+
return self.known_covariates + self.past_covariates
|
|
49
|
+
|
|
50
|
+
@property
|
|
51
|
+
def covariates_real(self) -> list[str]:
|
|
52
|
+
return self.known_covariates_real + self.past_covariates_real
|
|
53
|
+
|
|
54
|
+
@property
|
|
55
|
+
def covariates_cat(self) -> list[str]:
|
|
56
|
+
return self.known_covariates_cat + self.past_covariates_cat
|
|
57
|
+
|
|
58
|
+
@property
|
|
59
|
+
def real_features(self) -> list[str]:
|
|
60
|
+
return self.static_features_real + self.covariates_real
|
|
61
|
+
|
|
62
|
+
@property
|
|
63
|
+
def cat_features(self) -> list[str]:
|
|
64
|
+
return self.static_features_cat + self.covariates_cat
|
|
65
|
+
|
|
66
|
+
@property
|
|
67
|
+
def all_features(self) -> list[str]:
|
|
68
|
+
return self.static_features + self.covariates
|
|
69
|
+
|
|
70
|
+
def to_dict(self) -> dict[str, Any]:
|
|
71
|
+
return asdict(self)
|
|
30
72
|
|
|
31
73
|
|
|
32
74
|
class ContinuousAndCategoricalFeatureGenerator(PipelineFeatureGenerator):
|
|
33
|
-
"""Generates categorical and continuous features for time series models.
|
|
75
|
+
"""Generates categorical and continuous features for time series models.
|
|
76
|
+
|
|
77
|
+
Imputes missing categorical features with the most frequent value in the training set.
|
|
78
|
+
"""
|
|
34
79
|
|
|
35
|
-
def __init__(self, verbosity: int = 0, minimum_cat_count=2,
|
|
80
|
+
def __init__(self, verbosity: int = 0, minimum_cat_count=2, **kwargs):
|
|
36
81
|
generators = [
|
|
37
82
|
CategoryFeatureGenerator(minimum_cat_count=minimum_cat_count, fillna="mode"),
|
|
38
83
|
IdentityFeatureGenerator(infer_features_in_args={"valid_raw_types": [R_INT, R_FLOAT]}),
|
|
@@ -43,53 +88,87 @@ class ContinuousAndCategoricalFeatureGenerator(PipelineFeatureGenerator):
|
|
|
43
88
|
pre_generators=[AsTypeFeatureGenerator(convert_bool=False)],
|
|
44
89
|
pre_enforce_types=False,
|
|
45
90
|
pre_drop_useless=False,
|
|
91
|
+
post_drop_duplicates=True,
|
|
92
|
+
reset_index=False,
|
|
46
93
|
verbosity=verbosity,
|
|
47
94
|
**kwargs,
|
|
48
95
|
)
|
|
49
|
-
self.float_dtype = float_dtype
|
|
50
|
-
|
|
51
|
-
def _convert_numerical_columns_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
52
|
-
"""Convert the dtype of all numerical (float or int) columns to the given float dtype."""
|
|
53
|
-
numeric_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
|
|
54
|
-
return df.astype({col: self.float_dtype for col in numeric_columns})
|
|
55
96
|
|
|
56
97
|
def transform(self, X: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
|
|
57
|
-
|
|
58
|
-
X = pd.DataFrame(X)
|
|
59
|
-
return self._convert_numerical_columns_to_float(super().transform(X, *args, **kwargs))
|
|
98
|
+
return super().transform(X, *args, **kwargs)
|
|
60
99
|
|
|
61
100
|
def fit_transform(self, X: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
|
|
62
101
|
# PipelineFeatureGenerator does not use transform() inside fit_transform(), so we need to override both methods
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
102
|
+
transformed = super().fit_transform(X, *args, **kwargs)
|
|
103
|
+
# Ignore the '__dummy__' feature generated by PipelineFeatureGenerator if none of the features are informative
|
|
104
|
+
if "__dummy__" in transformed.columns:
|
|
105
|
+
transformed.drop(columns=["__dummy__"], inplace=True)
|
|
106
|
+
return transformed
|
|
66
107
|
|
|
67
108
|
|
|
68
109
|
class TimeSeriesFeatureGenerator:
|
|
69
110
|
"""Takes care of preprocessing for static_features and past/known covariates.
|
|
70
111
|
|
|
71
|
-
All covariates & static features are converted into either
|
|
112
|
+
All covariates & static features are converted into either float or categorical dtype.
|
|
113
|
+
|
|
114
|
+
Missing values in the target column are left as-is but missing values in static features & covariates are imputed.
|
|
115
|
+
Imputation logic is as follows:
|
|
116
|
+
1. For all categorical columns (static, past, known), we fill missing values with the mode of the training set.
|
|
117
|
+
2. For real static features, we impute missing values with the median of the training set.
|
|
118
|
+
3. For real covariates (past, known), we ffill + bfill within each time series. If for some time series all
|
|
119
|
+
covariate values are missing, we fill them with the median of the training set.
|
|
120
|
+
|
|
121
|
+
Parameters
|
|
122
|
+
----------
|
|
123
|
+
target
|
|
124
|
+
Name of the target column.
|
|
125
|
+
known_covariates_names
|
|
126
|
+
Columns that contain covariates that are known into the future.
|
|
127
|
+
float_dtype
|
|
128
|
+
Numpy float dtype to which all numeric columns (float, int, bool) will be converted both in static & dynamic dfs.
|
|
129
|
+
num_samples
|
|
130
|
+
Number of rows sampled from the training dataset to speed up computation of the median (used later for imputation).
|
|
131
|
+
If set to `None`, median will be computed using all rows.
|
|
72
132
|
"""
|
|
73
133
|
|
|
74
|
-
def __init__(
|
|
134
|
+
def __init__(
|
|
135
|
+
self,
|
|
136
|
+
target: str,
|
|
137
|
+
known_covariates_names: list[str],
|
|
138
|
+
float_dtype: str = "float32",
|
|
139
|
+
num_samples: int | None = 20_000,
|
|
140
|
+
):
|
|
75
141
|
self.target = target
|
|
76
142
|
self.float_dtype = float_dtype
|
|
143
|
+
self.num_samples = num_samples
|
|
144
|
+
|
|
77
145
|
self._is_fit = False
|
|
78
|
-
self.known_covariates_names = list(known_covariates_names)
|
|
79
|
-
self.past_covariates_names = []
|
|
146
|
+
self.known_covariates_names: list[str] = list(known_covariates_names)
|
|
147
|
+
self.past_covariates_names: list[str] = []
|
|
80
148
|
self.known_covariates_pipeline = ContinuousAndCategoricalFeatureGenerator()
|
|
81
149
|
self.past_covariates_pipeline = ContinuousAndCategoricalFeatureGenerator()
|
|
82
150
|
# Cat features with cat_count=1 are fine in static_features since they are repeated for all time steps in a TS
|
|
83
151
|
self.static_feature_pipeline = ContinuousAndCategoricalFeatureGenerator(minimum_cat_count=1)
|
|
84
|
-
self.
|
|
152
|
+
self._covariate_metadata: CovariateMetadata | None = None # type ignore
|
|
153
|
+
self._train_covariates_real_median: pd.Series | None = None
|
|
154
|
+
self._train_static_real_median: pd.Series | None = None
|
|
85
155
|
|
|
86
156
|
@property
|
|
87
|
-
def required_column_names(self) ->
|
|
157
|
+
def required_column_names(self) -> list[str]:
|
|
88
158
|
return [self.target] + list(self.known_covariates_names) + list(self.past_covariates_names)
|
|
89
159
|
|
|
160
|
+
@property
def covariate_metadata(self) -> CovariateMetadata:
    """Return the covariate metadata stored on this generator.

    Raises
    ------
    AssertionError
        If the metadata has not been set yet (it is assigned while fitting).
    """
    metadata = self._covariate_metadata
    assert metadata is not None, "covariate_metadata is not set. Did you call fit?"
    return metadata
|
|
164
|
+
|
|
90
165
|
def fit(self, data: TimeSeriesDataFrame) -> None:
    """Fit the generator on ``data``.

    Delegates to ``fit_transform`` and discards the transformed frame it returns.
    """
    _ = self.fit_transform(data)
|
|
167
|
+
|
|
168
|
+
def fit_transform(self, data: TimeSeriesDataFrame) -> TimeSeriesDataFrame:
|
|
91
169
|
assert not self._is_fit, f"{self.__class__.__name__} has already been fit"
|
|
92
170
|
|
|
171
|
+
start_time = time.monotonic()
|
|
93
172
|
self.past_covariates_names = []
|
|
94
173
|
for column in data.columns:
|
|
95
174
|
if column != self.target and column not in self.known_covariates_names:
|
|
@@ -99,23 +178,33 @@ class TimeSeriesFeatureGenerator:
|
|
|
99
178
|
data, required_column_names=self.required_column_names, data_frame_name="train_data"
|
|
100
179
|
)
|
|
101
180
|
|
|
181
|
+
# Convert to a pd.DataFrame and remove index for faster processing
|
|
182
|
+
df = pd.DataFrame(data)
|
|
183
|
+
index = df.index
|
|
184
|
+
df.reset_index(drop=True, inplace=True)
|
|
185
|
+
df = self._convert_numeric_to_float_dtype(df)
|
|
186
|
+
|
|
187
|
+
dfs_to_concat = [df[[self.target]]]
|
|
188
|
+
|
|
102
189
|
logger.info("\nProvided data contains following columns:")
|
|
103
190
|
logger.info(f"\ttarget: '{self.target}'")
|
|
104
191
|
|
|
105
192
|
if len(self.known_covariates_names) > 0:
|
|
106
|
-
known_covariates_df = self.known_covariates_pipeline.fit_transform(
|
|
193
|
+
known_covariates_df = self.known_covariates_pipeline.fit_transform(df[self.known_covariates_names])
|
|
107
194
|
logger.info("\tknown_covariates:")
|
|
108
195
|
known_covariates_cat, known_covariates_real = self._detect_and_log_column_types(known_covariates_df)
|
|
109
196
|
self.known_covariates_names = self.known_covariates_pipeline.features_in
|
|
197
|
+
dfs_to_concat.append(known_covariates_df)
|
|
110
198
|
else:
|
|
111
199
|
known_covariates_cat = []
|
|
112
200
|
known_covariates_real = []
|
|
113
201
|
|
|
114
202
|
if len(self.past_covariates_names) > 0:
|
|
115
|
-
past_covariates_df = self.past_covariates_pipeline.fit_transform(
|
|
203
|
+
past_covariates_df = self.past_covariates_pipeline.fit_transform(df[self.past_covariates_names])
|
|
116
204
|
logger.info("\tpast_covariates:")
|
|
117
205
|
past_covariates_cat, past_covariates_real = self._detect_and_log_column_types(past_covariates_df)
|
|
118
206
|
self.past_covariates_names = self.past_covariates_pipeline.features_in
|
|
207
|
+
dfs_to_concat.append(past_covariates_df)
|
|
119
208
|
else:
|
|
120
209
|
past_covariates_cat = []
|
|
121
210
|
past_covariates_real = []
|
|
@@ -125,14 +214,18 @@ class TimeSeriesFeatureGenerator:
|
|
|
125
214
|
)
|
|
126
215
|
|
|
127
216
|
if data.static_features is not None:
|
|
128
|
-
static_features_df = self.static_feature_pipeline.fit_transform(
|
|
217
|
+
static_features_df = self.static_feature_pipeline.fit_transform(
|
|
218
|
+
self._convert_numeric_to_float_dtype(data.static_features)
|
|
219
|
+
)
|
|
129
220
|
logger.info("\tstatic_features:")
|
|
130
221
|
static_features_cat, static_features_real = self._detect_and_log_column_types(static_features_df)
|
|
131
222
|
ignored_static_features = data.static_features.columns.difference(self.static_feature_pipeline.features_in)
|
|
223
|
+
self._train_static_real_median = data.static_features[static_features_real].median()
|
|
132
224
|
else:
|
|
133
225
|
static_features_cat = []
|
|
134
226
|
static_features_real = []
|
|
135
227
|
ignored_static_features = []
|
|
228
|
+
static_features_df = None
|
|
136
229
|
|
|
137
230
|
if len(ignored_covariates) > 0 or len(ignored_static_features) > 0:
|
|
138
231
|
logger.info("\nAutoGluon will ignore following non-numeric/non-informative columns:")
|
|
@@ -146,7 +239,7 @@ class TimeSeriesFeatureGenerator:
|
|
|
146
239
|
"\nTo learn how to fix incorrectly inferred types, please see documentation for TimeSeriesPredictor.fit"
|
|
147
240
|
)
|
|
148
241
|
|
|
149
|
-
self.
|
|
242
|
+
self._covariate_metadata = CovariateMetadata(
|
|
150
243
|
known_covariates_cat=known_covariates_cat,
|
|
151
244
|
known_covariates_real=known_covariates_real,
|
|
152
245
|
past_covariates_cat=past_covariates_cat,
|
|
@@ -154,8 +247,47 @@ class TimeSeriesFeatureGenerator:
|
|
|
154
247
|
static_features_cat=static_features_cat,
|
|
155
248
|
static_features_real=static_features_real,
|
|
156
249
|
)
|
|
250
|
+
|
|
251
|
+
# Median of real-valued covariates will be used for missing value imputation
|
|
252
|
+
if self.num_samples is not None and len(df) > self.num_samples:
|
|
253
|
+
df = df.sample(n=self.num_samples, replace=True)
|
|
254
|
+
self._train_covariates_real_median = df[self.covariate_metadata.covariates_real].median()
|
|
255
|
+
|
|
256
|
+
self.fit_time = time.monotonic() - start_time
|
|
157
257
|
self._is_fit = True
|
|
158
258
|
|
|
259
|
+
df_out = self._concat_dfs(dfs_to_concat)
|
|
260
|
+
df_out.index = index
|
|
261
|
+
ts_df = TimeSeriesDataFrame(df_out, static_features=self._impute_static_features(static_features_df))
|
|
262
|
+
return self._impute_covariates(ts_df, column_names=self.covariate_metadata.covariates_real)
|
|
263
|
+
|
|
264
|
+
@staticmethod
|
|
265
|
+
def _concat_dfs(dfs_to_concat: list[pd.DataFrame]) -> pd.DataFrame:
|
|
266
|
+
if len(dfs_to_concat) == 1:
|
|
267
|
+
return dfs_to_concat[0]
|
|
268
|
+
else:
|
|
269
|
+
return pd.concat(dfs_to_concat, axis=1, copy=False)
|
|
270
|
+
|
|
271
|
+
def _impute_covariates(self, ts_df: TimeSeriesDataFrame, column_names: list[str]) -> TimeSeriesDataFrame:
    """Impute missing values in the selected columns of ``ts_df`` and return ``ts_df``.

    The selected columns are first passed through ``fill_missing_values()`` (ffill + bfill, per the class
    docstring). Any NaNs that remain afterwards are replaced with the medians stored at fit time. The imputed
    columns are written back into ``ts_df`` itself.
    """
    # Nothing to impute when no columns are selected
    if len(column_names) == 0:
        return ts_df

    imputed = ts_df[column_names].fill_missing_values()
    # NaNs left at this point belong to items whose covariate is missing everywhere
    still_has_nans = np.isnan(imputed.to_numpy()).any()
    if still_has_nans:
        imputed.fillna(self._train_covariates_real_median, inplace=True)
    ts_df[column_names] = imputed
    return ts_df
|
|
281
|
+
|
|
282
|
+
def _impute_static_features(self, static_df: pd.DataFrame | None) -> pd.DataFrame | None:
|
|
283
|
+
"""Impute missing values in static features using the median."""
|
|
284
|
+
static_real_names = self.covariate_metadata.static_features_real
|
|
285
|
+
if static_df is not None and static_real_names:
|
|
286
|
+
static_real = static_df[static_real_names]
|
|
287
|
+
if np.isnan(static_real.to_numpy()).any():
|
|
288
|
+
static_df[static_real_names] = static_real.fillna(self._train_static_real_median)
|
|
289
|
+
return static_df
|
|
290
|
+
|
|
159
291
|
def transform(self, data: TimeSeriesDataFrame, data_frame_name: str = "data") -> TimeSeriesDataFrame:
|
|
160
292
|
"""Transform static features and past/known covariates.
|
|
161
293
|
|
|
@@ -168,50 +300,61 @@ class TimeSeriesFeatureGenerator:
|
|
|
168
300
|
self._check_required_columns_are_present(
|
|
169
301
|
data, required_column_names=self.required_column_names, data_frame_name=data_frame_name
|
|
170
302
|
)
|
|
171
|
-
|
|
303
|
+
# Convert to a pd.DataFrame and remove index for faster processing
|
|
304
|
+
df = pd.DataFrame(data)
|
|
305
|
+
index = df.index
|
|
306
|
+
df.reset_index(drop=True, inplace=True)
|
|
307
|
+
|
|
308
|
+
dfs_to_concat = [df[[self.target]]]
|
|
172
309
|
|
|
173
310
|
if len(self.known_covariates_names) > 0:
|
|
174
|
-
|
|
311
|
+
known_covariates_df = self.known_covariates_pipeline.transform(df[self.known_covariates_names])
|
|
312
|
+
dfs_to_concat.append(known_covariates_df)
|
|
175
313
|
|
|
176
314
|
if len(self.past_covariates_names) > 0:
|
|
177
|
-
|
|
315
|
+
past_covariates_df = self.past_covariates_pipeline.transform(df[self.past_covariates_names])
|
|
316
|
+
dfs_to_concat.append(past_covariates_df)
|
|
178
317
|
|
|
179
318
|
if self.static_feature_pipeline.is_fit():
|
|
180
319
|
if data.static_features is None:
|
|
181
320
|
raise ValueError(f"Provided {data_frame_name} must contain static_features")
|
|
182
|
-
|
|
321
|
+
static_features_df = self.static_feature_pipeline.transform(data.static_features)
|
|
183
322
|
else:
|
|
184
|
-
|
|
323
|
+
static_features_df = None
|
|
185
324
|
|
|
186
|
-
|
|
325
|
+
df_out = self._concat_dfs(dfs_to_concat)
|
|
326
|
+
df_out.index = index
|
|
327
|
+
ts_df = TimeSeriesDataFrame(df_out, static_features=self._impute_static_features(static_features_df))
|
|
328
|
+
return self._impute_covariates(ts_df, column_names=self.covariate_metadata.covariates_real)
|
|
187
329
|
|
|
188
330
|
def transform_future_known_covariates(
|
|
189
|
-
self, known_covariates:
|
|
190
|
-
) ->
|
|
331
|
+
self, known_covariates: TimeSeriesDataFrame | None
|
|
332
|
+
) -> TimeSeriesDataFrame | None:
|
|
191
333
|
assert self._is_fit, f"{self.__class__.__name__} has not been fit yet"
|
|
192
334
|
if len(self.known_covariates_names) > 0:
|
|
193
335
|
assert known_covariates is not None, "known_covariates must be provided at prediction time"
|
|
194
336
|
self._check_required_columns_are_present(
|
|
195
337
|
known_covariates, required_column_names=self.known_covariates_names, data_frame_name="known_covariates"
|
|
196
338
|
)
|
|
197
|
-
|
|
339
|
+
known_covariates = TimeSeriesDataFrame(
|
|
340
|
+
self.known_covariates_pipeline.transform(pd.DataFrame(known_covariates))
|
|
341
|
+
)
|
|
342
|
+
return self._impute_covariates(
|
|
343
|
+
known_covariates, column_names=self.covariate_metadata.known_covariates_real
|
|
344
|
+
)
|
|
198
345
|
else:
|
|
199
346
|
return None
|
|
200
347
|
|
|
201
|
-
def fit_transform(self, data: TimeSeriesDataFrame, data_frame_name: str = "data") -> TimeSeriesDataFrame:
|
|
202
|
-
self.fit(data)
|
|
203
|
-
return self.transform(data, data_frame_name=data_frame_name)
|
|
204
|
-
|
|
205
348
|
@staticmethod
|
|
206
|
-
def _detect_and_log_column_types(transformed_df: pd.DataFrame) ->
|
|
349
|
+
def _detect_and_log_column_types(transformed_df: pd.DataFrame) -> tuple[list[str], list[str]]:
|
|
207
350
|
"""Log & return names of categorical and real-valued columns in the DataFrame."""
|
|
208
|
-
cat_column_names = []
|
|
209
|
-
real_column_names = []
|
|
351
|
+
cat_column_names: list[str] = []
|
|
352
|
+
real_column_names: list[str] = []
|
|
210
353
|
for column_name, column_dtype in transformed_df.dtypes.items():
|
|
211
354
|
if isinstance(column_dtype, pd.CategoricalDtype):
|
|
212
|
-
cat_column_names.append(column_name)
|
|
355
|
+
cat_column_names.append(str(column_name))
|
|
213
356
|
elif pd.api.types.is_numeric_dtype(column_dtype):
|
|
214
|
-
real_column_names.append(column_name)
|
|
357
|
+
real_column_names.append(str(column_name))
|
|
215
358
|
|
|
216
359
|
logger.info(f"\t\tcategorical: {reprlib.repr(cat_column_names)}")
|
|
217
360
|
logger.info(f"\t\tcontinuous (float): {reprlib.repr(real_column_names)}")
|
|
@@ -219,10 +362,131 @@ class TimeSeriesFeatureGenerator:
|
|
|
219
362
|
|
|
220
363
|
@staticmethod
|
|
221
364
|
def _check_required_columns_are_present(
|
|
222
|
-
data: TimeSeriesDataFrame, required_column_names:
|
|
365
|
+
data: TimeSeriesDataFrame, required_column_names: list[str], data_frame_name: str
|
|
223
366
|
) -> None:
|
|
224
|
-
missing_columns = pd.Index(required_column_names).difference(data.columns)
|
|
367
|
+
missing_columns = pd.Index(required_column_names).difference(data.columns) # type: ignore
|
|
225
368
|
if len(missing_columns) > 0:
|
|
226
369
|
raise ValueError(
|
|
227
370
|
f"{len(missing_columns)} columns are missing from {data_frame_name}: {reprlib.repr(missing_columns.to_list())}"
|
|
228
371
|
)
|
|
372
|
+
|
|
373
|
+
def _convert_numeric_to_float_dtype(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
374
|
+
"""Convert the dtype of all numeric (float, int or bool) columns to self.float_dtype."""
|
|
375
|
+
numeric_columns = [
|
|
376
|
+
col for col, dtype in df.dtypes.items() if pd.api.types.is_numeric_dtype(dtype) and col != self.target
|
|
377
|
+
]
|
|
378
|
+
if len(numeric_columns) > 0:
|
|
379
|
+
df = df.astype({col: self.float_dtype for col in numeric_columns}, copy=False)
|
|
380
|
+
return df
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
class AbstractFeatureImportanceTransform:
    """Abstract class for transforms that replace a given feature with dummy or shuffled values,
    for use in feature importance operations.

    Subclasses implement ``_transform_series`` (used for past and known covariates) and
    ``_transform_static_series`` (used for static features).
    """

    def __init__(
        self,
        covariate_metadata: CovariateMetadata,
        prediction_length: int,
        **kwargs,
    ):
        # Extra keyword arguments are accepted but not used here
        self.covariate_metadata: CovariateMetadata = covariate_metadata
        self.prediction_length: int = prediction_length

    def _transform_static_series(self, feature_data: pd.Series, is_categorical: bool) -> Any:
        """Return replacement values for one static feature column.

        ``transform`` passes the column with its index reset and assigns the returned value to the
        corresponding column of ``data.static_features``.
        """
        raise NotImplementedError

    def _transform_series(self, feature_data: pd.Series, is_categorical: bool) -> pd.Series:
        """Return replacement values for one covariate column.

        ``feature_data`` keeps the index of the time series dataframe it was taken from; for past covariates
        it only contains the history of each item (see ``transform``).
        """
        raise NotImplementedError

    def transform(self, data: TimeSeriesDataFrame, feature_name: str, **kwargs) -> TimeSeriesDataFrame:
        """Return a copy of ``data`` in which the feature ``feature_name`` has been replaced.

        Parameters
        ----------
        data
            Time series dataframe containing the feature.
        feature_name
            Name of a static feature, past covariate or known covariate listed in the covariate metadata.

        Raises
        ------
        ValueError
            If ``feature_name`` is not among ``covariate_metadata.all_features``.
        """
        if feature_name not in self.covariate_metadata.all_features:
            raise ValueError(f"Target feature {feature_name} not found in covariate metadata")

        # feature transform works on a shallow copy of the main time series dataframe
        # but a deep copy of the static features.
        # NOTE(review): the in-place write to data.static_features below relies on this copy not sharing
        # static features with the caller - confirm against TimeSeriesDataFrame.copy.
        data = data.copy(deep=False)

        is_categorical = feature_name in self.covariate_metadata.cat_features

        if feature_name in self.covariate_metadata.past_covariates:
            # Work on a private copy of the column so that the caller's values are left untouched
            feature_column = data[feature_name].copy()
            # we'll have to work on the history of the data alone: drop the last prediction_length steps per item
            feature_data = feature_column.groupby(level=TimeSeriesDataFrame.ITEMID, sort=False).head(
                -self.prediction_length
            )
            # Silence spurious FutureWarning raised by DataFrame.update https://github.com/pandas-dev/pandas/issues/57124
            with warning_filter():
                feature_column.update(self._transform_series(feature_data, is_categorical=is_categorical))
            # Assign the updated column back explicitly. Calling .update on `data[feature_name]` directly is an
            # in-place method on a chained temporary, which does not modify `data` under pandas Copy-on-Write.
            data[feature_name] = feature_column
        elif feature_name in self.covariate_metadata.static_features:
            assert data.static_features is not None
            feature_data = data.static_features[feature_name].copy()
            feature_data.reset_index(drop=True, inplace=True)
            data.static_features[feature_name] = self._transform_static_series(
                feature_data, is_categorical=is_categorical
            )
        else:  # known covariates
            data[feature_name] = self._transform_series(data[feature_name], is_categorical=is_categorical)

        return data
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
class PermutationFeatureImportanceTransform(AbstractFeatureImportanceTransform):
    """Naively shuffles a given feature.

    With ``shuffle_type="itemwise"`` covariate values are shuffled within each item; with ``"naive"`` they are
    shuffled across the whole series. Static features are always shuffled across all rows.
    """

    def __init__(
        self,
        covariate_metadata: CovariateMetadata,
        prediction_length: int,
        random_seed: int | None = None,
        shuffle_type: Literal["itemwise", "naive"] = "itemwise",
        **kwargs,
    ):
        super().__init__(covariate_metadata, prediction_length, **kwargs)
        self.random_seed = random_seed
        self.shuffle_type = shuffle_type

    def _transform_static_series(self, feature_data: pd.Series, is_categorical: bool) -> Any:
        """Return the values of ``feature_data`` in shuffled order, seeded with ``random_seed``."""
        shuffled = feature_data.sample(frac=1, random_state=self.random_seed)
        return shuffled.values

    def _transform_series(self, feature_data: pd.Series, is_categorical: bool) -> pd.Series:
        """Return ``feature_data`` with shuffled values, according to ``shuffle_type``.

        Raises
        ------
        ValueError
            If ``shuffle_type`` is neither ``"itemwise"`` nor ``"naive"``.
        """
        # set random state once to shuffle 'independently' for different items
        rng = np.random.RandomState(self.random_seed)

        def _shuffled_values(series: pd.Series):
            return series.sample(frac=1, random_state=rng).values

        if self.shuffle_type == "itemwise":
            per_item = feature_data.groupby(level=TimeSeriesDataFrame.ITEMID, sort=False)
            return per_item.transform(lambda x: _shuffled_values(x))
        if self.shuffle_type == "naive":
            return pd.Series(_shuffled_values(feature_data), index=feature_data.index)
        raise ValueError(f"Unknown shuffle_type: {self.shuffle_type}")
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
class ConstantReplacementFeatureImportanceTransform(AbstractFeatureImportanceTransform):
    """Feature importance transform that replaces a feature with a constant value.

    Categorical features are replaced with their mode. Real-valued features are replaced
    with the aggregate selected by ``real_value_aggregation`` (``"mean"`` by default,
    ``"median"`` also accepted). Time-varying features are aggregated separately for each
    item, static features across all rows.
    """

    def __init__(
        self,
        covariate_metadata: CovariateMetadata,
        prediction_length: int,
        real_value_aggregation: Literal["mean", "median"] = "mean",
        **kwargs,
    ):
        super().__init__(covariate_metadata, prediction_length, **kwargs)
        self.real_value_aggregation = real_value_aggregation

    def _transform_static_series(self, feature_data: pd.Series, is_categorical: bool) -> Any:
        """Return the single value that replaces every entry of a static feature."""
        if is_categorical:
            # ``mode()`` may return several values; the first one is used.
            return feature_data.mode()[0]
        return feature_data.agg(self.real_value_aggregation)

    def _transform_series(self, feature_data: pd.Series, is_categorical: bool) -> pd.Series:
        """Return a series in which each item's values are replaced by that item's constant."""
        per_item = feature_data.groupby(level=TimeSeriesDataFrame.ITEMID, sort=False)

        if is_categorical:

            def _first_mode(item_values):
                return item_values.mode()[0]

            return per_item.transform(_first_mode)

        return per_item.transform(self.real_value_aggregation)  # type: ignore
|
|
@@ -3,33 +3,47 @@ import warnings
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from autogluon.
|
|
6
|
+
from autogluon.common.utils.deprecated_utils import Deprecated
|
|
7
|
+
from autogluon.timeseries.dataset import TimeSeriesDataFrame
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
def get_forecast_horizon_index_single_time_series(
    past_timestamps: pd.DatetimeIndex, freq: str, prediction_length: int
) -> pd.DatetimeIndex:
    """Build the timestamps of the forecast horizon for one time series.

    Parameters
    ----------
    past_timestamps
        Timestamps observed so far; the horizon starts one ``freq`` step after their maximum.
    freq
        Frequency string that is converted to a pandas offset.
    prediction_length
        Number of future timestamps to generate.

    Returns
    -------
    pd.DatetimeIndex
        Index with ``prediction_length`` entries, named ``TimeSeriesDataFrame.TIMESTAMP``.

    Raises
    ------
    ValueError
        If ``freq`` cannot be converted to an offset.
    """
    step = pd.tseries.frequencies.to_offset(freq)
    if step is None:
        raise ValueError(f"Invalid frequency: {freq}")

    first_future_ts = past_timestamps.max() + 1 * step
    return pd.date_range(
        start=first_future_ts,
        periods=prediction_length,
        freq=freq,
        name=TimeSeriesDataFrame.TIMESTAMP,
    )
|
|
15
19
|
|
|
16
20
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
+
@Deprecated(
    min_version_to_warn="1.3", min_version_to_error="2.0", new="TimeSeriesPredictor.forecast_horizon_data_frame"
)
def get_forecast_horizon_index_ts_dataframe(*args, **kwargs) -> pd.MultiIndex:
    """Deprecated: return the forecast horizon as a ``pandas.MultiIndex``.

    All arguments are forwarded unchanged to ``make_future_data_frame``; the columns of the
    resulting data frame become the levels of the returned index.
    """
    future_frame = make_future_data_frame(*args, **kwargs)
    return pd.MultiIndex.from_frame(future_frame)
|
|
21
26
|
|
|
22
|
-
Returns a pandas.MultiIndex, where
|
|
23
|
-
- level 0 ("item_id") contains the same item_ids as the input ts_dataframe.
|
|
24
|
-
- level 1 ("timestamp") contains the next prediction_length time steps starting from the end of each time series.
|
|
25
|
-
"""
|
|
26
|
-
last = ts_dataframe.reset_index()[[ITEMID, TIMESTAMP]].groupby(by=ITEMID, sort=False, as_index=False).last()
|
|
27
|
-
item_ids = np.repeat(last[ITEMID], prediction_length)
|
|
28
27
|
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
def make_future_data_frame(
    ts_dataframe: TimeSeriesDataFrame,
    prediction_length: int,
    freq: str | None = None,
) -> pd.DataFrame:
    """For each item in the dataframe, get timestamps for the next `prediction_length` time steps into the future.

    Parameters
    ----------
    ts_dataframe
        Data whose last timestamp per item marks the start of the forecast horizon.
    prediction_length
        Number of future time steps to generate for every item.
    freq
        Frequency of the generated timestamps. If None, ``ts_dataframe.freq`` is used.

    Returns
    -------
    pd.DataFrame
        Data frame with columns "item_id" and "timestamp" corresponding to the forecast horizon.
        Rows are ordered item by item, each item contributing ``prediction_length`` consecutive rows.

    Raises
    ------
    ValueError
        If no valid frequency offset can be derived from ``freq`` / ``ts_dataframe.freq``.
    """
    indptr = ts_dataframe.get_indptr()
    # indptr[1:] - 1 selects the position of the last row of every item.
    last = ts_dataframe.index[indptr[1:] - 1].to_frame(index=False)
    item_ids = np.repeat(last[TimeSeriesDataFrame.ITEMID].to_numpy(), prediction_length)

    if freq is None:
        freq = ts_dataframe.freq
    offset = pd.tseries.frequencies.to_offset(freq)
    if offset is None:
        # Fail with an explicit message instead of an opaque TypeError from `step * None` below.
        raise ValueError(f"Invalid frequency: {freq}")
    last_ts = pd.DatetimeIndex(last[TimeSeriesDataFrame.TIMESTAMP])
    # Non-vectorized offsets like BusinessDay may produce a PerformanceWarning - we filter them
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
        # Stack one array per horizon step along the last axis, so that ravel() yields all
        # future timestamps of the first item, then all of the second item, and so on.
        timestamps = np.dstack([last_ts + step * offset for step in range(1, prediction_length + 1)]).ravel()
    return pd.DataFrame({TimeSeriesDataFrame.ITEMID: item_ids, TimeSeriesDataFrame.TIMESTAMP: timestamps})
|