autogluon.timeseries 1.4.1b20251016__py3-none-any.whl → 1.4.1b20251218__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of autogluon.timeseries might be problematic.
- autogluon/timeseries/configs/hyperparameter_presets.py +7 -21
- autogluon/timeseries/configs/predictor_presets.py +23 -39
- autogluon/timeseries/dataset/ts_dataframe.py +97 -86
- autogluon/timeseries/learner.py +70 -35
- autogluon/timeseries/metrics/__init__.py +4 -4
- autogluon/timeseries/metrics/abstract.py +8 -8
- autogluon/timeseries/metrics/point.py +9 -9
- autogluon/timeseries/metrics/quantile.py +5 -5
- autogluon/timeseries/metrics/utils.py +4 -4
- autogluon/timeseries/models/__init__.py +2 -1
- autogluon/timeseries/models/abstract/abstract_timeseries_model.py +52 -39
- autogluon/timeseries/models/abstract/model_trial.py +2 -1
- autogluon/timeseries/models/abstract/tunable.py +8 -8
- autogluon/timeseries/models/autogluon_tabular/mlforecast.py +58 -62
- autogluon/timeseries/models/autogluon_tabular/per_step.py +26 -15
- autogluon/timeseries/models/autogluon_tabular/transforms.py +11 -9
- autogluon/timeseries/models/chronos/__init__.py +2 -1
- autogluon/timeseries/models/chronos/chronos2.py +395 -0
- autogluon/timeseries/models/chronos/model.py +126 -88
- autogluon/timeseries/models/chronos/{pipeline/utils.py → utils.py} +69 -37
- autogluon/timeseries/models/ensemble/__init__.py +36 -2
- autogluon/timeseries/models/ensemble/abstract.py +14 -46
- autogluon/timeseries/models/ensemble/array_based/__init__.py +3 -0
- autogluon/timeseries/models/ensemble/array_based/abstract.py +240 -0
- autogluon/timeseries/models/ensemble/array_based/models.py +185 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/__init__.py +12 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/abstract.py +88 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/linear_stacker.py +186 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/per_quantile_tabular.py +94 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/tabular.py +107 -0
- autogluon/timeseries/models/ensemble/{greedy.py → ensemble_selection.py} +41 -61
- autogluon/timeseries/models/ensemble/per_item_greedy.py +172 -0
- autogluon/timeseries/models/ensemble/weighted/__init__.py +8 -0
- autogluon/timeseries/models/ensemble/weighted/abstract.py +45 -0
- autogluon/timeseries/models/ensemble/{basic.py → weighted/basic.py} +25 -22
- autogluon/timeseries/models/ensemble/weighted/greedy.py +62 -0
- autogluon/timeseries/models/gluonts/abstract.py +32 -31
- autogluon/timeseries/models/gluonts/dataset.py +11 -11
- autogluon/timeseries/models/gluonts/models.py +0 -7
- autogluon/timeseries/models/local/__init__.py +0 -7
- autogluon/timeseries/models/local/abstract_local_model.py +15 -18
- autogluon/timeseries/models/local/naive.py +2 -2
- autogluon/timeseries/models/local/npts.py +7 -1
- autogluon/timeseries/models/local/statsforecast.py +12 -12
- autogluon/timeseries/models/multi_window/multi_window_model.py +39 -24
- autogluon/timeseries/models/registry.py +3 -4
- autogluon/timeseries/models/toto/_internal/backbone/attention.py +3 -4
- autogluon/timeseries/models/toto/_internal/backbone/backbone.py +6 -6
- autogluon/timeseries/models/toto/_internal/backbone/rope.py +4 -9
- autogluon/timeseries/models/toto/_internal/backbone/rotary_embedding_torch.py +342 -0
- autogluon/timeseries/models/toto/_internal/backbone/scaler.py +2 -3
- autogluon/timeseries/models/toto/_internal/backbone/transformer.py +10 -10
- autogluon/timeseries/models/toto/_internal/dataset.py +2 -2
- autogluon/timeseries/models/toto/_internal/forecaster.py +8 -8
- autogluon/timeseries/models/toto/dataloader.py +4 -4
- autogluon/timeseries/models/toto/hf_pretrained_model.py +97 -16
- autogluon/timeseries/models/toto/model.py +35 -20
- autogluon/timeseries/predictor.py +527 -155
- autogluon/timeseries/regressor.py +27 -30
- autogluon/timeseries/splitter.py +3 -27
- autogluon/timeseries/trainer/ensemble_composer.py +444 -0
- autogluon/timeseries/trainer/model_set_builder.py +9 -9
- autogluon/timeseries/trainer/prediction_cache.py +16 -16
- autogluon/timeseries/trainer/trainer.py +300 -278
- autogluon/timeseries/trainer/utils.py +17 -0
- autogluon/timeseries/transforms/covariate_scaler.py +8 -8
- autogluon/timeseries/transforms/target_scaler.py +15 -15
- autogluon/timeseries/utils/constants.py +10 -0
- autogluon/timeseries/utils/datetime/lags.py +1 -3
- autogluon/timeseries/utils/datetime/seasonality.py +1 -3
- autogluon/timeseries/utils/features.py +31 -14
- autogluon/timeseries/utils/forecast.py +6 -7
- autogluon/timeseries/utils/timer.py +173 -0
- autogluon/timeseries/version.py +1 -1
- autogluon.timeseries-1.4.1b20251218-py3.11-nspkg.pth +1 -0
- {autogluon.timeseries-1.4.1b20251016.dist-info → autogluon_timeseries-1.4.1b20251218.dist-info}/METADATA +39 -27
- autogluon_timeseries-1.4.1b20251218.dist-info/RECORD +103 -0
- {autogluon.timeseries-1.4.1b20251016.dist-info → autogluon_timeseries-1.4.1b20251218.dist-info}/WHEEL +1 -1
- autogluon/timeseries/evaluator.py +0 -6
- autogluon/timeseries/models/chronos/pipeline/__init__.py +0 -10
- autogluon/timeseries/models/chronos/pipeline/base.py +0 -160
- autogluon/timeseries/models/chronos/pipeline/chronos.py +0 -544
- autogluon/timeseries/models/chronos/pipeline/chronos_bolt.py +0 -580
- autogluon.timeseries-1.4.1b20251016-py3.9-nspkg.pth +0 -1
- autogluon.timeseries-1.4.1b20251016.dist-info/RECORD +0 -90
- {autogluon.timeseries-1.4.1b20251016.dist-info → autogluon_timeseries-1.4.1b20251218.dist-info/licenses}/LICENSE +0 -0
- {autogluon.timeseries-1.4.1b20251016.dist-info → autogluon_timeseries-1.4.1b20251218.dist-info/licenses}/NOTICE +0 -0
- {autogluon.timeseries-1.4.1b20251016.dist-info → autogluon_timeseries-1.4.1b20251218.dist-info}/namespace_packages.txt +0 -0
- {autogluon.timeseries-1.4.1b20251016.dist-info → autogluon_timeseries-1.4.1b20251218.dist-info}/top_level.txt +0 -0
- {autogluon.timeseries-1.4.1b20251016.dist-info → autogluon_timeseries-1.4.1b20251218.dist-info}/zip-safe +0 -0
--- a/autogluon/timeseries/configs/hyperparameter_presets.py
+++ b/autogluon/timeseries/configs/hyperparameter_presets.py
@@ -1,7 +1,7 @@
-from typing import Any, Union
+from typing import Any
 
 
-def get_hyperparameter_presets() -> dict[str, dict[str, Union[dict[str, Any], list[dict[str, Any]]]]]:
+def get_hyperparameter_presets() -> dict[str, dict[str, dict[str, Any] | list[dict[str, Any]]]]:
     return {
         "very_light": {
             "Naive": {},
@@ -31,32 +31,18 @@ def get_hyperparameter_presets() -> dict[str, dict[str, Union[dict[str, Any], li
         "default": {
             "SeasonalNaive": {},
             "AutoETS": {},
-            "NPTS": {},
             "DynamicOptimizedTheta": {},
             "RecursiveTabular": {},
             "DirectTabular": {},
             "TemporalFusionTransformer": {},
-            "PatchTST": {},
-            "DeepAR": {},
-            "Chronos": [
-                {
-                    "ag_args": {"name_suffix": "ZeroShot"},
-                    "model_path": "bolt_base",
-                },
+            "Chronos2": [
+                {},
                 {
-                    "ag_args": {"name_suffix": "FineTuned"},
-                    "model_path": "bolt_small",
+                    "ag_args": {"name_suffix": "SmallFineTuned"},
+                    "model_path": "autogluon/chronos-2-small",
                     "fine_tune": True,
-                    "target_scaler": "standard",
-                    "covariate_regressor": {"model_name": "CAT", "model_hyperparameters": {"iterations": 1_000}},
+                    "eval_during_fine_tune": True,
                 },
             ],
-            "TiDE": {
-                "encoder_hidden_dim": 256,
-                "decoder_hidden_dim": 256,
-                "temporal_hidden_dim": 64,
-                "num_batches_per_epoch": 100,
-                "lr": 1e-4,
-            },
         },
     }
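Taken together, the hunk above drops the NPTS, PatchTST, DeepAR, Chronos-Bolt, and TiDE entries of the "default" preset in favor of two Chronos2 configurations (zero-shot, plus a fine-tuned chronos-2-small). A minimal sketch of how this preset is typically consumed; the fit(hyperparameters=...) call is the existing AutoGluon API rather than part of this diff, and the file path and prediction length are placeholders:

from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# Placeholder dataset in long format with item_id / timestamp / target columns.
train_data = TimeSeriesDataFrame.from_path("train.csv")

predictor = TimeSeriesPredictor(prediction_length=48)
# "default" now trains the statistical and tabular models listed above plus a
# zero-shot Chronos2 and a fine-tuned autogluon/chronos-2-small.
predictor.fit(train_data, hyperparameters="default")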
--- a/autogluon/timeseries/configs/predictor_presets.py
+++ b/autogluon/timeseries/configs/predictor_presets.py
@@ -2,10 +2,7 @@
 
 from typing import Any
 
-from . import get_hyperparameter_presets
-
 TIMESERIES_PRESETS_ALIASES = dict(
-    chronos="chronos_small",
     best="best_quality",
     high="high_quality",
     medium="medium_quality",
@@ -16,13 +13,33 @@ TIMESERIES_PRESETS_ALIASES = dict(
 
 
 def get_predictor_presets() -> dict[str, Any]:
-    hp_presets = get_hyperparameter_presets()
-
     predictor_presets = dict(
-        best_quality={"hyperparameters": "default", "num_val_windows": 2},
+        best_quality={"hyperparameters": "default", "num_val_windows": "auto", "refit_every_n_windows": "auto"},
         high_quality={"hyperparameters": "default"},
         medium_quality={"hyperparameters": "light"},
         fast_training={"hyperparameters": "very_light"},
+        # Chronos-2 models
+        chronos2={
+            "hyperparameters": {"Chronos2": {"model_path": "autogluon/chronos-2"}},
+            "skip_model_selection": True,
+        },
+        chronos2_small={
+            "hyperparameters": {"Chronos2": {"model_path": "autogluon/chronos-2-small"}},
+            "skip_model_selection": True,
+        },
+        chronos2_ensemble={
+            "hyperparameters": {
+                "Chronos2": [
+                    {"model_path": "autogluon/chronos-2", "ag_args": {"name_suffix": "ZeroShot"}},
+                    {
+                        "model_path": "autogluon/chronos-2-small",
+                        "fine_tune": True,
+                        "eval_during_fine_tune": True,
+                        "ag_args": {"name_suffix": "SmallFineTuned"},
+                    },
+                ]
+            },
+        },
         # Chronos-Bolt models
         bolt_tiny={
             "hyperparameters": {"Chronos": {"model_path": "bolt_tiny"}},
@@ -40,39 +57,6 @@ def get_predictor_presets() -> dict[str, Any]:
             "hyperparameters": {"Chronos": {"model_path": "bolt_base"}},
             "skip_model_selection": True,
         },
-        # Original Chronos models
-        chronos_tiny={
-            "hyperparameters": {"Chronos": {"model_path": "tiny"}},
-            "skip_model_selection": True,
-        },
-        chronos_mini={
-            "hyperparameters": {"Chronos": {"model_path": "mini"}},
-            "skip_model_selection": True,
-        },
-        chronos_small={
-            "hyperparameters": {"Chronos": {"model_path": "small"}},
-            "skip_model_selection": True,
-        },
-        chronos_base={
-            "hyperparameters": {"Chronos": {"model_path": "base"}},
-            "skip_model_selection": True,
-        },
-        chronos_large={
-            "hyperparameters": {"Chronos": {"model_path": "large", "batch_size": 8}},
-            "skip_model_selection": True,
-        },
-        chronos_ensemble={
-            "hyperparameters": {
-                "Chronos": {"model_path": "small"},
-                **hp_presets["light_inference"],
-            }
-        },
-        chronos_large_ensemble={
-            "hyperparameters": {
-                "Chronos": {"model_path": "large", "batch_size": 8},
-                **hp_presets["light_inference"],
-            }
-        },
     )
 
     # update with aliases
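The user-facing presets change accordingly: the `chronos` alias and all `chronos_*` presets for the original Chronos models are removed, while three Chronos-2 presets are added. A hedged sketch of selecting them, reusing the `train_data` placeholder from the previous example:

from autogluon.timeseries import TimeSeriesPredictor

predictor = TimeSeriesPredictor(prediction_length=24)
# Zero-shot autogluon/chronos-2 only; model selection is skipped entirely.
predictor.fit(train_data, presets="chronos2")

# Also added in this release:
#   presets="chronos2_small"    -> zero-shot autogluon/chronos-2-small
#   presets="chronos2_ensemble" -> zero-shot chronos-2 plus a fine-tuned chronos-2-small
# Removed: presets="chronos" (an alias of the deleted "chronos_small") no longer resolves.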
--- a/autogluon/timeseries/dataset/ts_dataframe.py
+++ b/autogluon/timeseries/dataset/ts_dataframe.py
@@ -7,7 +7,7 @@ import reprlib
 from collections.abc import Iterable
 from itertools import islice
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, Type, Union, overload
+from typing import TYPE_CHECKING, Any, Final, Type, overload
 
 import numpy as np
 import pandas as pd
@@ -19,11 +19,6 @@ from autogluon.common.loaders import load_pd
 
 logger = logging.getLogger(__name__)
 
-ITEMID = "item_id"
-TIMESTAMP = "timestamp"
-
-IRREGULAR_TIME_INDEX_FREQSTR = "IRREG"
-
 
 class TimeSeriesDataFrame(pd.DataFrame):
     """A collection of univariate time series, where each row is identified by an (``item_id``, ``timestamp``) pair.
@@ -121,12 +116,16 @@ class TimeSeriesDataFrame(pd.DataFrame):
     index: pd.MultiIndex  # type: ignore
     _metadata = ["_static_features"]
 
+    IRREGULAR_TIME_INDEX_FREQSTR: Final[str] = "IRREG"
+    ITEMID: Final[str] = "item_id"
+    TIMESTAMP: Final[str] = "timestamp"
+
     def __init__(
         self,
-        data: Union[pd.DataFrame, str, Path, Iterable],
-        static_features: Optional[Union[pd.DataFrame, str, Path]] = None,
-        id_column: Optional[str] = None,
-        timestamp_column: Optional[str] = None,
+        data: pd.DataFrame | str | Path | Iterable,
+        static_features: pd.DataFrame | str | Path | None = None,
+        id_column: str | None = None,
+        timestamp_column: str | None = None,
         num_cpus: int = -1,
         *args,
         **kwargs,
@@ -150,7 +149,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         else:
             raise ValueError(f"data must be a pd.DataFrame, Iterable, string or Path (received {type(data)}).")
         super().__init__(data=data, *args, **kwargs)  # type: ignore
-        self._static_features: Optional[pd.DataFrame] = None
+        self._static_features: pd.DataFrame | None = None
         if static_features is not None:
             self.static_features = self._construct_static_features(static_features, id_column=id_column)
 
@@ -169,29 +168,33 @@ class TimeSeriesDataFrame(pd.DataFrame):
     def _construct_tsdf_from_data_frame(
         cls,
         df: pd.DataFrame,
-        id_column: Optional[str] = None,
-        timestamp_column: Optional[str] = None,
+        id_column: str | None = None,
+        timestamp_column: str | None = None,
     ) -> pd.DataFrame:
         df = df.copy()
         if id_column is not None:
             assert id_column in df.columns, f"Column '{id_column}' not found!"
-            if id_column != ITEMID and ITEMID in df.columns:
-                logger.warning(f"Renaming existing column '{ITEMID}' -> '__{ITEMID}' to avoid name collisions.")
-                df.rename(columns={ITEMID: "__" + ITEMID}, inplace=True)
-            df.rename(columns={id_column: ITEMID}, inplace=True)
+            if id_column != cls.ITEMID and cls.ITEMID in df.columns:
+                logger.warning(
+                    f"Renaming existing column '{cls.ITEMID}' -> '__{cls.ITEMID}' to avoid name collisions."
+                )
+                df.rename(columns={cls.ITEMID: "__" + cls.ITEMID}, inplace=True)
+            df.rename(columns={id_column: cls.ITEMID}, inplace=True)
 
         if timestamp_column is not None:
             assert timestamp_column in df.columns, f"Column '{timestamp_column}' not found!"
-            if timestamp_column != TIMESTAMP and TIMESTAMP in df.columns:
-                logger.warning(f"Renaming existing column '{TIMESTAMP}' -> '__{TIMESTAMP}' to avoid name collisions.")
-                df.rename(columns={TIMESTAMP: "__" + TIMESTAMP}, inplace=True)
-            df.rename(columns={timestamp_column: TIMESTAMP}, inplace=True)
+            if timestamp_column != cls.TIMESTAMP and cls.TIMESTAMP in df.columns:
+                logger.warning(
+                    f"Renaming existing column '{cls.TIMESTAMP}' -> '__{cls.TIMESTAMP}' to avoid name collisions."
+                )
+                df.rename(columns={cls.TIMESTAMP: "__" + cls.TIMESTAMP}, inplace=True)
+            df.rename(columns={timestamp_column: cls.TIMESTAMP}, inplace=True)
 
-        if TIMESTAMP in df.columns:
-            df[TIMESTAMP] = pd.to_datetime(df[TIMESTAMP])
+        if cls.TIMESTAMP in df.columns:
+            df[cls.TIMESTAMP] = pd.to_datetime(df[cls.TIMESTAMP])
 
         cls._validate_data_frame(df)
-        return df.set_index([ITEMID, TIMESTAMP])
+        return df.set_index([cls.ITEMID, cls.TIMESTAMP])
 
     @classmethod
     def _construct_tsdf_from_iterable_dataset(cls, iterable_dataset: Iterable, num_cpus: int = -1) -> pd.DataFrame:
@@ -202,7 +205,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
                 start_timestamp = start_timestamp.to_timestamp(how="S")
             target = ts["target"]
             datetime_index = tuple(pd.date_range(start_timestamp, periods=len(target), freq=freq))
-            idx = pd.MultiIndex.from_product([(item_id,), datetime_index], names=[ITEMID, TIMESTAMP])
+            idx = pd.MultiIndex.from_product([(item_id,), datetime_index], names=[cls.ITEMID, cls.TIMESTAMP])
             return pd.Series(target, name="target", index=idx).to_frame()
 
         cls._validate_iterable(iterable_dataset)
@@ -219,32 +222,34 @@ class TimeSeriesDataFrame(pd.DataFrame):
             raise ValueError(f"data must be a pd.DataFrame, got {type(data)}")
         if not isinstance(data.index, pd.MultiIndex):
             raise ValueError(f"data must have pd.MultiIndex, got {type(data.index)}")
-        if not pd.api.types.is_datetime64_dtype(data.index.dtypes[TIMESTAMP]):
-            raise ValueError(f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
-        if not data.index.names == (f"{ITEMID}", f"{TIMESTAMP}"):
-            raise ValueError(f"data must have index names as ('{ITEMID}', '{TIMESTAMP}'), got {data.index.names}")
+        if not pd.api.types.is_datetime64_dtype(data.index.dtypes[cls.TIMESTAMP]):
+            raise ValueError(f"for {cls.TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
+        if not data.index.names == (f"{cls.ITEMID}", f"{cls.TIMESTAMP}"):
+            raise ValueError(
+                f"data must have index names as ('{cls.ITEMID}', '{cls.TIMESTAMP}'), got {data.index.names}"
+            )
         item_id_index = data.index.levels[0]
         if not (pd.api.types.is_integer_dtype(item_id_index) or pd.api.types.is_string_dtype(item_id_index)):
-            raise ValueError(f"all entries in index `{ITEMID}` must be of integer or string dtype")
+            raise ValueError(f"all entries in index `{cls.ITEMID}` must be of integer or string dtype")
 
     @classmethod
     def _validate_data_frame(cls, df: pd.DataFrame):
         """Validate that a pd.DataFrame with ITEMID and TIMESTAMP columns can be converted to TimeSeriesDataFrame"""
         if not isinstance(df, pd.DataFrame):
             raise ValueError(f"data must be a pd.DataFrame, got {type(df)}")
-        if ITEMID not in df.columns:
-            raise ValueError(f"data must have a `{ITEMID}` column")
-        if TIMESTAMP not in df.columns:
-            raise ValueError(f"data must have a `{TIMESTAMP}` column")
-        if df[ITEMID].isnull().any():
-            raise ValueError(f"`{ITEMID}` column can not have nan")
-        if df[TIMESTAMP].isnull().any():
-            raise ValueError(f"`{TIMESTAMP}` column can not have nan")
-        if not pd.api.types.is_datetime64_dtype(df[TIMESTAMP]):
-            raise ValueError(f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
-        item_id_column = df[ITEMID]
+        if cls.ITEMID not in df.columns:
+            raise ValueError(f"data must have a `{cls.ITEMID}` column")
+        if cls.TIMESTAMP not in df.columns:
+            raise ValueError(f"data must have a `{cls.TIMESTAMP}` column")
+        if df[cls.ITEMID].isnull().any():
+            raise ValueError(f"`{cls.ITEMID}` column can not have nan")
+        if df[cls.TIMESTAMP].isnull().any():
+            raise ValueError(f"`{cls.TIMESTAMP}` column can not have nan")
+        if not pd.api.types.is_datetime64_dtype(df[cls.TIMESTAMP]):
+            raise ValueError(f"for {cls.TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
+        item_id_column = df[cls.ITEMID]
         if not (pd.api.types.is_integer_dtype(item_id_column) or pd.api.types.is_string_dtype(item_id_column)):
-            raise ValueError(f"all entries in column `{ITEMID}` must be of integer or string dtype")
+            raise ValueError(f"all entries in column `{cls.ITEMID}` must be of integer or string dtype")
 
     @classmethod
     def _validate_iterable(cls, data: Iterable):
@@ -267,9 +272,9 @@ class TimeSeriesDataFrame(pd.DataFrame):
     def from_data_frame(
         cls,
         df: pd.DataFrame,
-        id_column: Optional[str] = None,
-        timestamp_column: Optional[str] = None,
-        static_features_df: Optional[pd.DataFrame] = None,
+        id_column: str | None = None,
+        timestamp_column: str | None = None,
+        static_features_df: pd.DataFrame | None = None,
     ) -> TimeSeriesDataFrame:
         """Construct a ``TimeSeriesDataFrame`` from a pandas DataFrame.
 
@@ -310,10 +315,10 @@ class TimeSeriesDataFrame(pd.DataFrame):
     @classmethod
     def from_path(
         cls,
-        path: Union[str, Path],
-        id_column: Optional[str] = None,
-        timestamp_column: Optional[str] = None,
-        static_features_path: Optional[Union[str, Path]] = None,
+        path: str | Path,
+        id_column: str | None = None,
+        timestamp_column: str | None = None,
+        static_features_path: str | Path | None = None,
     ) -> TimeSeriesDataFrame:
         """Construct a ``TimeSeriesDataFrame`` from a CSV or Parquet file.
 
@@ -386,13 +391,13 @@ class TimeSeriesDataFrame(pd.DataFrame):
     @property
     def item_ids(self) -> pd.Index:
         """List of unique time series IDs contained in the data set."""
-        return self.index.unique(level=ITEMID)
+        return self.index.unique(level=self.ITEMID)
 
     @classmethod
     def _construct_static_features(
         cls,
-        static_features: Union[pd.DataFrame, str, Path],
-        id_column: Optional[str] = None,
+        static_features: pd.DataFrame | str | Path,
+        id_column: str | None = None,
     ) -> pd.DataFrame:
         if isinstance(static_features, (str, Path)):
             static_features = load_pd.load(str(static_features))
@@ -403,10 +408,12 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
         if id_column is not None:
             assert id_column in static_features.columns, f"Column '{id_column}' not found in static_features!"
-            if id_column != ITEMID and ITEMID in static_features.columns:
-                logger.warning(f"Renaming existing column '{ITEMID}' -> '__{ITEMID}' to avoid name collisions.")
-                static_features.rename(columns={ITEMID: "__" + ITEMID}, inplace=True)
-            static_features.rename(columns={id_column: ITEMID}, inplace=True)
+            if id_column != cls.ITEMID and cls.ITEMID in static_features.columns:
+                logger.warning(
+                    f"Renaming existing column '{cls.ITEMID}' -> '__{cls.ITEMID}' to avoid name collisions."
+                )
+                static_features.rename(columns={cls.ITEMID: "__" + cls.ITEMID}, inplace=True)
+            static_features.rename(columns={id_column: cls.ITEMID}, inplace=True)
         return static_features
 
     @property
@@ -414,7 +421,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         return self._static_features
 
     @static_features.setter
-    def static_features(self, value: Optional[pd.DataFrame]):
+    def static_features(self, value: pd.DataFrame | None):
        # if the current item index is not a multiindex, then we are dealing with a single
        # item slice. this should only happen when the user explicitly requests only a
        # single item or during `slice_by_timestep`. In this case we do not set static features
@@ -431,10 +438,10 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
         # Avoid modifying static features inplace
         value = value.copy()
-        if ITEMID in value.columns and value.index.name != ITEMID:
-            value = value.set_index(ITEMID)
-        if value.index.name != ITEMID:
-            value.index.rename(ITEMID, inplace=True)
+        if self.ITEMID in value.columns and value.index.name != self.ITEMID:
+            value = value.set_index(self.ITEMID)
+        if value.index.name != self.ITEMID:
+            value.index.rename(self.ITEMID, inplace=True)
         missing_item_ids = self.item_ids.difference(value.index)
         if len(missing_item_ids) > 0:
             raise ValueError(
@@ -447,7 +454,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
         self._static_features = value
 
-    def infer_frequency(self, num_items: Optional[int] = None, raise_if_irregular: bool = False) -> str:
+    def infer_frequency(self, num_items: int | None = None, raise_if_irregular: bool = False) -> str:
         """Infer the time series frequency based on the timestamps of the observations.
 
         Parameters
@@ -514,7 +521,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
                 else:
                     raise ValueError(f"Cannot infer frequency. Multiple frequencies detected: {unique_freqs}")
             else:
-                return IRREGULAR_TIME_INDEX_FREQSTR
+                return self.IRREGULAR_TIME_INDEX_FREQSTR
         else:
             return pd.tseries.frequencies.to_offset(unique_freqs[0]).freqstr
 
@@ -526,7 +533,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         values. For reliable results, use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.infer_frequency`.
         """
         inferred_freq = self.infer_frequency(num_items=50)
-        return None if inferred_freq == IRREGULAR_TIME_INDEX_FREQSTR else inferred_freq
+        return None if inferred_freq == self.IRREGULAR_TIME_INDEX_FREQSTR else inferred_freq
 
     @property
     def num_items(self):
@@ -563,7 +570,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         return obj
 
     def __finalize__(  # noqa
-        self: TimeSeriesDataFrame, other, method: Optional[str] = None, **kwargs
+        self: TimeSeriesDataFrame, other, method: str | None = None, **kwargs
     ) -> TimeSeriesDataFrame:
         super().__finalize__(other=other, method=method, **kwargs)
         # when finalizing the copy/slice operation, we use the property setter to stay consistent
@@ -595,9 +602,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         after = TimeSeriesDataFrame(data_after, static_features=self.static_features)
         return before, after
 
-    def slice_by_timestep(
-        self, start_index: Optional[int] = None, end_index: Optional[int] = None
-    ) -> TimeSeriesDataFrame:
+    def slice_by_timestep(self, start_index: int | None = None, end_index: int | None = None) -> TimeSeriesDataFrame:
         """Select a subsequence from each time series between start (inclusive) and end (exclusive) indices.
 
         This operation is equivalent to selecting a slice ``[start_index : end_index]`` from each time series, and then
@@ -735,7 +740,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
             return self.loc[mask]
         else:
             # Fall back to a slow groupby operation
-            result = self.groupby(level=ITEMID, sort=False, as_index=False).nth(slice(start_index, end_index))
+            result = self.groupby(level=self.ITEMID, sort=False, as_index=False).nth(slice(start_index, end_index))
             result.static_features = self.static_features
             return result
 
@@ -852,12 +857,12 @@ class TimeSeriesDataFrame(pd.DataFrame):
                 "It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`"
             )
 
-        grouped_df = df.groupby(level=ITEMID, sort=False, group_keys=False)
+        grouped_df = df.groupby(level=self.ITEMID, sort=False, group_keys=False)
         if method == "auto":
             filled_df = grouped_df.ffill()
             # If necessary, fill missing values at the start of each time series with bfill
             if filled_df.isna().any(axis=None):
-                filled_df = filled_df.groupby(level=ITEMID, sort=False, group_keys=False).bfill()
+                filled_df = filled_df.groupby(level=self.ITEMID, sort=False, group_keys=False).bfill()
         elif method in ["ffill", "pad"]:
             filled_df = grouped_df.ffill()
         elif method in ["bfill", "backfill"]:
@@ -900,8 +905,8 @@ class TimeSeriesDataFrame(pd.DataFrame):
         return super().sort_index(*args, **kwargs)  # type: ignore
 
     def get_model_inputs_for_scoring(
-        self, prediction_length: int, known_covariates_names: Optional[list[str]] = None
-    ) -> tuple[TimeSeriesDataFrame, Optional[TimeSeriesDataFrame]]:
+        self, prediction_length: int, known_covariates_names: list[str] | None = None
+    ) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame | None]:
         """Prepare model inputs necessary to predict the last ``prediction_length`` time steps of each time series in the dataset.
 
         Parameters
@@ -931,8 +936,8 @@ class TimeSeriesDataFrame(pd.DataFrame):
     def train_test_split(
         self,
         prediction_length: int,
-        end_index: Optional[int] = None,
-        suffix: Optional[str] = None,
+        end_index: int | None = None,
+        suffix: str | None = None,
     ) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
         """Generate a train/test split from the given dataset.
 
@@ -977,7 +982,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
     def convert_frequency(
         self,
-        freq: Union[str, pd.DateOffset],
+        freq: str | pd.DateOffset,
         agg_numeric: str = "mean",
         agg_categorical: str = "first",
         num_cpus: int = -1,
@@ -996,7 +1001,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
         Parameters
         ----------
-        freq : Union[str, pd.DateOffset]
+        freq : str | pd.DateOffset
             Frequency to which the data should be converted. See `pandas frequency aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
             for supported values.
         agg_numeric : {"max", "min", "sum", "mean", "median", "first", "last"}, default = "mean"
@@ -1086,8 +1091,8 @@ class TimeSeriesDataFrame(pd.DataFrame):
         def resample_chunk(chunk: Iterable[tuple[str, pd.DataFrame]]) -> pd.DataFrame:
             resampled_dfs = []
             for item_id, df in chunk:
-                resampled_df = df.resample(offset, level=TIMESTAMP, **kwargs).agg(aggregation)
-                resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[ITEMID]))
+                resampled_df = df.resample(offset, level=self.TIMESTAMP, **kwargs).agg(aggregation)
+                resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[self.ITEMID]))
             return pd.concat(resampled_dfs)
 
         # Resampling time for 1 item < overhead time for a single parallel job. Therefore, we group items into chunks
@@ -1095,8 +1100,8 @@ class TimeSeriesDataFrame(pd.DataFrame):
         df = pd.DataFrame(self)
         # Make sure that timestamp index has dtype 'datetime64[ns]', otherwise index may contain NaT values.
         # See https://github.com/autogluon/autogluon/issues/4917
-        df.index = df.index.set_levels(df.index.levels[1].astype("datetime64[ns]"), level=TIMESTAMP)
-        chunks = split_into_chunks(df.groupby(level=ITEMID, sort=False), chunk_size)
+        df.index = df.index.set_levels(df.index.levels[1].astype("datetime64[ns]"), level=self.TIMESTAMP)
+        chunks = split_into_chunks(df.groupby(level=self.ITEMID, sort=False), chunk_size)
         resampled_chunks = Parallel(n_jobs=num_cpus)(delayed(resample_chunk)(chunk) for chunk in chunks)
         resampled_df = TimeSeriesDataFrame(pd.concat(resampled_chunks))
         resampled_df.static_features = self.static_features
@@ -1123,14 +1128,14 @@ class TimeSeriesDataFrame(pd.DataFrame):
         def reindex(*args, **kwargs) -> Self: ...  # type: ignore
 
         @overload
-        def __new__(cls, data: pd.DataFrame, static_features: Optional[pd.DataFrame] = None) -> Self: ...  # type: ignore
+        def __new__(cls, data: pd.DataFrame, static_features: pd.DataFrame | None = None) -> Self: ...  # type: ignore
         @overload
         def __new__(
             cls,
-            data: Union[pd.DataFrame, str, Path, Iterable],
-            static_features: Optional[Union[pd.DataFrame, str, Path]] = None,
-            id_column: Optional[str] = None,
-            timestamp_column: Optional[str] = None,
+            data: pd.DataFrame | str | Path | Iterable,
+            static_features: pd.DataFrame | str | Path | None = None,
+            id_column: str | None = None,
+            timestamp_column: str | None = None,
             num_cpus: int = -1,
             *args,
             **kwargs,
@@ -1142,3 +1147,9 @@ class TimeSeriesDataFrame(pd.DataFrame):
         def __getitem__(self, items: list[str]) -> Self: ...  # type: ignore
         @overload
         def __getitem__(self, item: str) -> pd.Series: ...  # type: ignore
+
+
+# TODO: remove with v2.0
+# module-level constants kept for backward compatibility.
+ITEMID = TimeSeriesDataFrame.ITEMID
+TIMESTAMP = TimeSeriesDataFrame.TIMESTAMP
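For downstream code, the practical consequence of the ts_dataframe.py changes is that the index-name constants now live on TimeSeriesDataFrame, with the module-level names kept only as deprecated aliases until v2.0. A small sketch of both spellings, grounded directly in the hunk above:

from autogluon.timeseries.dataset.ts_dataframe import ITEMID, TIMESTAMP, TimeSeriesDataFrame

# The class attributes are the canonical spelling going forward.
assert TimeSeriesDataFrame.ITEMID == "item_id"
assert TimeSeriesDataFrame.TIMESTAMP == "timestamp"

# The old module-level imports keep working, as aliases, until v2.0.
assert ITEMID is TimeSeriesDataFrame.ITEMID
assert TIMESTAMP is TimeSeriesDataFrame.TIMESTAMP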