autogluon.timeseries 1.2.1b20250224__py3-none-any.whl → 1.4.1b20251215__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of autogluon.timeseries has been flagged as potentially problematic.
- autogluon/timeseries/configs/__init__.py +3 -2
- autogluon/timeseries/configs/hyperparameter_presets.py +62 -0
- autogluon/timeseries/configs/predictor_presets.py +106 -0
- autogluon/timeseries/dataset/ts_dataframe.py +256 -141
- autogluon/timeseries/learner.py +86 -52
- autogluon/timeseries/metrics/__init__.py +42 -8
- autogluon/timeseries/metrics/abstract.py +89 -19
- autogluon/timeseries/metrics/point.py +142 -53
- autogluon/timeseries/metrics/quantile.py +46 -21
- autogluon/timeseries/metrics/utils.py +4 -4
- autogluon/timeseries/models/__init__.py +8 -2
- autogluon/timeseries/models/abstract/__init__.py +2 -2
- autogluon/timeseries/models/abstract/abstract_timeseries_model.py +361 -592
- autogluon/timeseries/models/abstract/model_trial.py +2 -1
- autogluon/timeseries/models/abstract/tunable.py +189 -0
- autogluon/timeseries/models/autogluon_tabular/__init__.py +2 -0
- autogluon/timeseries/models/autogluon_tabular/mlforecast.py +282 -194
- autogluon/timeseries/models/autogluon_tabular/per_step.py +513 -0
- autogluon/timeseries/models/autogluon_tabular/transforms.py +25 -18
- autogluon/timeseries/models/chronos/__init__.py +2 -1
- autogluon/timeseries/models/chronos/chronos2.py +361 -0
- autogluon/timeseries/models/chronos/model.py +219 -138
- autogluon/timeseries/models/chronos/{pipeline/utils.py → utils.py} +81 -50
- autogluon/timeseries/models/ensemble/__init__.py +37 -2
- autogluon/timeseries/models/ensemble/abstract.py +107 -0
- autogluon/timeseries/models/ensemble/array_based/__init__.py +3 -0
- autogluon/timeseries/models/ensemble/array_based/abstract.py +240 -0
- autogluon/timeseries/models/ensemble/array_based/models.py +185 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/__init__.py +12 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/abstract.py +88 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/linear_stacker.py +186 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/per_quantile_tabular.py +94 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/tabular.py +107 -0
- autogluon/timeseries/models/ensemble/ensemble_selection.py +167 -0
- autogluon/timeseries/models/ensemble/per_item_greedy.py +172 -0
- autogluon/timeseries/models/ensemble/weighted/__init__.py +8 -0
- autogluon/timeseries/models/ensemble/weighted/abstract.py +45 -0
- autogluon/timeseries/models/ensemble/weighted/basic.py +91 -0
- autogluon/timeseries/models/ensemble/weighted/greedy.py +62 -0
- autogluon/timeseries/models/gluonts/__init__.py +1 -1
- autogluon/timeseries/models/gluonts/{abstract_gluonts.py → abstract.py} +148 -208
- autogluon/timeseries/models/gluonts/dataset.py +109 -0
- autogluon/timeseries/models/gluonts/{torch/models.py → models.py} +38 -22
- autogluon/timeseries/models/local/__init__.py +0 -7
- autogluon/timeseries/models/local/abstract_local_model.py +71 -74
- autogluon/timeseries/models/local/naive.py +13 -9
- autogluon/timeseries/models/local/npts.py +9 -2
- autogluon/timeseries/models/local/statsforecast.py +52 -36
- autogluon/timeseries/models/multi_window/multi_window_model.py +65 -45
- autogluon/timeseries/models/registry.py +64 -0
- autogluon/timeseries/models/toto/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/__init__.py +9 -0
- autogluon/timeseries/models/toto/_internal/backbone/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/backbone/attention.py +196 -0
- autogluon/timeseries/models/toto/_internal/backbone/backbone.py +262 -0
- autogluon/timeseries/models/toto/_internal/backbone/distribution.py +70 -0
- autogluon/timeseries/models/toto/_internal/backbone/kvcache.py +136 -0
- autogluon/timeseries/models/toto/_internal/backbone/rope.py +89 -0
- autogluon/timeseries/models/toto/_internal/backbone/rotary_embedding_torch.py +342 -0
- autogluon/timeseries/models/toto/_internal/backbone/scaler.py +305 -0
- autogluon/timeseries/models/toto/_internal/backbone/transformer.py +333 -0
- autogluon/timeseries/models/toto/_internal/dataset.py +165 -0
- autogluon/timeseries/models/toto/_internal/forecaster.py +423 -0
- autogluon/timeseries/models/toto/dataloader.py +108 -0
- autogluon/timeseries/models/toto/hf_pretrained_model.py +200 -0
- autogluon/timeseries/models/toto/model.py +249 -0
- autogluon/timeseries/predictor.py +685 -297
- autogluon/timeseries/regressor.py +94 -44
- autogluon/timeseries/splitter.py +8 -32
- autogluon/timeseries/trainer/__init__.py +3 -0
- autogluon/timeseries/trainer/ensemble_composer.py +444 -0
- autogluon/timeseries/trainer/model_set_builder.py +256 -0
- autogluon/timeseries/trainer/prediction_cache.py +149 -0
- autogluon/timeseries/{trainer.py → trainer/trainer.py} +387 -390
- autogluon/timeseries/trainer/utils.py +17 -0
- autogluon/timeseries/transforms/__init__.py +2 -13
- autogluon/timeseries/transforms/covariate_scaler.py +34 -40
- autogluon/timeseries/transforms/target_scaler.py +37 -20
- autogluon/timeseries/utils/constants.py +10 -0
- autogluon/timeseries/utils/datetime/lags.py +3 -5
- autogluon/timeseries/utils/datetime/seasonality.py +1 -3
- autogluon/timeseries/utils/datetime/time_features.py +2 -2
- autogluon/timeseries/utils/features.py +70 -47
- autogluon/timeseries/utils/forecast.py +19 -14
- autogluon/timeseries/utils/timer.py +173 -0
- autogluon/timeseries/utils/warning_filters.py +4 -2
- autogluon/timeseries/version.py +1 -1
- autogluon.timeseries-1.4.1b20251215-py3.11-nspkg.pth +1 -0
- {autogluon.timeseries-1.2.1b20250224.dist-info → autogluon_timeseries-1.4.1b20251215.dist-info}/METADATA +49 -36
- autogluon_timeseries-1.4.1b20251215.dist-info/RECORD +103 -0
- {autogluon.timeseries-1.2.1b20250224.dist-info → autogluon_timeseries-1.4.1b20251215.dist-info}/WHEEL +1 -1
- autogluon/timeseries/configs/presets_configs.py +0 -79
- autogluon/timeseries/evaluator.py +0 -6
- autogluon/timeseries/models/chronos/pipeline/__init__.py +0 -11
- autogluon/timeseries/models/chronos/pipeline/base.py +0 -160
- autogluon/timeseries/models/chronos/pipeline/chronos.py +0 -585
- autogluon/timeseries/models/chronos/pipeline/chronos_bolt.py +0 -518
- autogluon/timeseries/models/ensemble/abstract_timeseries_ensemble.py +0 -78
- autogluon/timeseries/models/ensemble/greedy_ensemble.py +0 -170
- autogluon/timeseries/models/gluonts/torch/__init__.py +0 -0
- autogluon/timeseries/models/presets.py +0 -360
- autogluon.timeseries-1.2.1b20250224-py3.9-nspkg.pth +0 -1
- autogluon.timeseries-1.2.1b20250224.dist-info/RECORD +0 -68
- {autogluon.timeseries-1.2.1b20250224.dist-info → autogluon_timeseries-1.4.1b20251215.dist-info/licenses}/LICENSE +0 -0
- {autogluon.timeseries-1.2.1b20250224.dist-info → autogluon_timeseries-1.4.1b20251215.dist-info/licenses}/NOTICE +0 -0
- {autogluon.timeseries-1.2.1b20250224.dist-info → autogluon_timeseries-1.4.1b20251215.dist-info}/namespace_packages.txt +0 -0
- {autogluon.timeseries-1.2.1b20250224.dist-info → autogluon_timeseries-1.4.1b20251215.dist-info}/top_level.txt +0 -0
- {autogluon.timeseries-1.2.1b20250224.dist-info → autogluon_timeseries-1.4.1b20251215.dist-info}/zip-safe +0 -0
autogluon/timeseries/dataset/ts_dataframe.py
(removed-line text clipped by the diff viewer is shown as …)

@@ -7,27 +7,23 @@ import reprlib
 from collections.abc import Iterable
 from itertools import islice
 from pathlib import Path
-from …
-from typing import Any, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Final, Type, overload
 
+import numpy as np
 import pandas as pd
 from joblib.parallel import Parallel, delayed
 from pandas.core.internals import ArrayManager, BlockManager  # type: ignore
+from typing_extensions import Self
 
 from autogluon.common.loaders import load_pd
 
 logger = logging.getLogger(__name__)
 
-ITEMID = "item_id"
-TIMESTAMP = "timestamp"
-
-IRREGULAR_TIME_INDEX_FREQSTR = "IRREG"
-
 
 class TimeSeriesDataFrame(pd.DataFrame):
     """A collection of univariate time series, where each row is identified by an (``item_id``, ``timestamp``) pair.
 
-    For example, a time series …
+    For example, a time series dataframe could represent the daily sales of a collection of products, where each
     ``item_id`` corresponds to a product and ``timestamp`` corresponds to the day of the record.
 
     Parameters
@@ -77,7 +73,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         You can also use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.from_iterable_dataset` for loading data in such format.
 
     static_features : pd.DataFrame, str or pathlib.Path, optional
-        An optional …
+        An optional dataframe describing the metadata of each individual time series that does not change with time.
         Can take real-valued or categorical values. For example, if ``TimeSeriesDataFrame`` contains sales of various
         products, static features may refer to time-independent features like color or brand.
 
@@ -117,15 +113,19 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
     """
 
-    index: pd.MultiIndex
+    index: pd.MultiIndex  # type: ignore
     _metadata = ["_static_features"]
 
+    IRREGULAR_TIME_INDEX_FREQSTR: Final[str] = "IRREG"
+    ITEMID: Final[str] = "item_id"
+    TIMESTAMP: Final[str] = "timestamp"
+
     def __init__(
         self,
-        data: …
-        static_features: …
-        id_column: …
-        timestamp_column: …
+        data: pd.DataFrame | str | Path | Iterable,
+        static_features: pd.DataFrame | str | Path | None = None,
+        id_column: str | None = None,
+        timestamp_column: str | None = None,
         num_cpus: int = -1,
         *args,
         **kwargs,
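The ``ITEMID``, ``TIMESTAMP``, and ``IRREGULAR_TIME_INDEX_FREQSTR`` constants move from module level onto the class as ``Final`` attributes (module-level aliases are kept for backward compatibility at the end of this file). A minimal sketch of the new spelling; the sample data below is made up:

    import pandas as pd
    from autogluon.timeseries import TimeSeriesDataFrame

    # The reserved index level names are now class attributes
    assert TimeSeriesDataFrame.ITEMID == "item_id"
    assert TimeSeriesDataFrame.TIMESTAMP == "timestamp"

    # Hypothetical long-format data using the default column names
    df = pd.DataFrame(
        {
            "item_id": ["A", "A", "B"],
            "timestamp": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-01"]),
            "target": [1.0, 2.0, 3.0],
        }
    )
    ts_df = TimeSeriesDataFrame(df)
    print(ts_df.index.names)  # ['item_id', 'timestamp']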
@@ -149,7 +149,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         else:
             raise ValueError(f"data must be a pd.DataFrame, Iterable, string or Path (received {type(data)}).")
         super().__init__(data=data, *args, **kwargs)  # type: ignore
-        self._static_features: …
+        self._static_features: pd.DataFrame | None = None
         if static_features is not None:
             self.static_features = self._construct_static_features(static_features, id_column=id_column)
 
@@ -168,29 +168,33 @@ class TimeSeriesDataFrame(pd.DataFrame):
     def _construct_tsdf_from_data_frame(
         cls,
         df: pd.DataFrame,
-        id_column: …
-        timestamp_column: …
+        id_column: str | None = None,
+        timestamp_column: str | None = None,
     ) -> pd.DataFrame:
         df = df.copy()
         if id_column is not None:
             assert id_column in df.columns, f"Column '{id_column}' not found!"
-            if id_column != ITEMID and ITEMID in df.columns:
-                logger.warning(
-                    …
-                …
+            if id_column != cls.ITEMID and cls.ITEMID in df.columns:
+                logger.warning(
+                    f"Renaming existing column '{cls.ITEMID}' -> '__{cls.ITEMID}' to avoid name collisions."
+                )
+                df.rename(columns={cls.ITEMID: "__" + cls.ITEMID}, inplace=True)
+            df.rename(columns={id_column: cls.ITEMID}, inplace=True)
 
         if timestamp_column is not None:
             assert timestamp_column in df.columns, f"Column '{timestamp_column}' not found!"
-            if timestamp_column != TIMESTAMP and TIMESTAMP in df.columns:
-                logger.warning(
-                    …
-                …
+            if timestamp_column != cls.TIMESTAMP and cls.TIMESTAMP in df.columns:
+                logger.warning(
+                    f"Renaming existing column '{cls.TIMESTAMP}' -> '__{cls.TIMESTAMP}' to avoid name collisions."
+                )
+                df.rename(columns={cls.TIMESTAMP: "__" + cls.TIMESTAMP}, inplace=True)
+            df.rename(columns={timestamp_column: cls.TIMESTAMP}, inplace=True)
 
-        if TIMESTAMP in df.columns:
-            df[TIMESTAMP] = pd.to_datetime(df[TIMESTAMP])
+        if cls.TIMESTAMP in df.columns:
+            df[cls.TIMESTAMP] = pd.to_datetime(df[cls.TIMESTAMP])
 
         cls._validate_data_frame(df)
-        return df.set_index([ITEMID, TIMESTAMP])
+        return df.set_index([cls.ITEMID, cls.TIMESTAMP])
 
     @classmethod
     def _construct_tsdf_from_iterable_dataset(cls, iterable_dataset: Iterable, num_cpus: int = -1) -> pd.DataFrame:
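Both renaming branches above guard against a user-supplied ``id_column`` or ``timestamp_column`` colliding with the reserved names. A hedged sketch of how that behaves from the outside, with made-up column names and data:

    import pandas as pd
    from autogluon.timeseries import TimeSeriesDataFrame

    df = pd.DataFrame(
        {
            "product": ["A", "A", "B"],
            "date": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-01"]),
            "item_id": [10, 11, 12],  # unrelated column that collides with the reserved name
            "target": [1.0, 2.0, 3.0],
        }
    )
    # "product" becomes the item_id index level; the colliding column is
    # renamed to "__item_id" and a warning is logged.
    ts_df = TimeSeriesDataFrame(df, id_column="product", timestamp_column="date")
    print("__item_id" in ts_df.columns)  # True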
@@ -201,7 +205,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
             start_timestamp = start_timestamp.to_timestamp(how="S")
             target = ts["target"]
             datetime_index = tuple(pd.date_range(start_timestamp, periods=len(target), freq=freq))
-            idx = pd.MultiIndex.from_product([(item_id,), datetime_index], names=[ITEMID, TIMESTAMP])
+            idx = pd.MultiIndex.from_product([(item_id,), datetime_index], names=[cls.ITEMID, cls.TIMESTAMP])
             return pd.Series(target, name="target", index=idx).to_frame()
 
         cls._validate_iterable(iterable_dataset)
@@ -218,32 +222,34 @@ class TimeSeriesDataFrame(pd.DataFrame):
             raise ValueError(f"data must be a pd.DataFrame, got {type(data)}")
         if not isinstance(data.index, pd.MultiIndex):
             raise ValueError(f"data must have pd.MultiIndex, got {type(data.index)}")
-        if not pd.api.types.is_datetime64_dtype(data.index.dtypes[TIMESTAMP]):
-            raise ValueError(f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
-        if not data.index.names == (f"{ITEMID}", f"{TIMESTAMP}"):
-            raise ValueError(
-                …
+        if not pd.api.types.is_datetime64_dtype(data.index.dtypes[cls.TIMESTAMP]):
+            raise ValueError(f"for {cls.TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
+        if not data.index.names == (f"{cls.ITEMID}", f"{cls.TIMESTAMP}"):
+            raise ValueError(
+                f"data must have index names as ('{cls.ITEMID}', '{cls.TIMESTAMP}'), got {data.index.names}"
+            )
+        item_id_index = data.index.levels[0]
         if not (pd.api.types.is_integer_dtype(item_id_index) or pd.api.types.is_string_dtype(item_id_index)):
-            raise ValueError(f"all entries in index `{ITEMID}` must be of integer or string dtype")
+            raise ValueError(f"all entries in index `{cls.ITEMID}` must be of integer or string dtype")
 
     @classmethod
     def _validate_data_frame(cls, df: pd.DataFrame):
         """Validate that a pd.DataFrame with ITEMID and TIMESTAMP columns can be converted to TimeSeriesDataFrame"""
         if not isinstance(df, pd.DataFrame):
             raise ValueError(f"data must be a pd.DataFrame, got {type(df)}")
-        if ITEMID not in df.columns:
-            raise ValueError(f"data must have a `{ITEMID}` column")
-        if TIMESTAMP not in df.columns:
-            raise ValueError(f"data must have a `{TIMESTAMP}` column")
-        if df[ITEMID].isnull().any():
-            raise ValueError(f"`{ITEMID}` column can not have nan")
-        if df[TIMESTAMP].isnull().any():
-            raise ValueError(f"`{TIMESTAMP}` column can not have nan")
-        if not pd.api.types.is_datetime64_dtype(df[TIMESTAMP]):
-            raise ValueError(f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
-        item_id_column = df[ITEMID]
+        if cls.ITEMID not in df.columns:
+            raise ValueError(f"data must have a `{cls.ITEMID}` column")
+        if cls.TIMESTAMP not in df.columns:
+            raise ValueError(f"data must have a `{cls.TIMESTAMP}` column")
+        if df[cls.ITEMID].isnull().any():
+            raise ValueError(f"`{cls.ITEMID}` column can not have nan")
+        if df[cls.TIMESTAMP].isnull().any():
+            raise ValueError(f"`{cls.TIMESTAMP}` column can not have nan")
+        if not pd.api.types.is_datetime64_dtype(df[cls.TIMESTAMP]):
+            raise ValueError(f"for {cls.TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
+        item_id_column = df[cls.ITEMID]
         if not (pd.api.types.is_integer_dtype(item_id_column) or pd.api.types.is_string_dtype(item_id_column)):
-            raise ValueError(f"all entries in column `{ITEMID}` must be of integer or string dtype")
+            raise ValueError(f"all entries in column `{cls.ITEMID}` must be of integer or string dtype")
 
     @classmethod
     def _validate_iterable(cls, data: Iterable):
@@ -266,9 +272,9 @@ class TimeSeriesDataFrame(pd.DataFrame):
     def from_data_frame(
         cls,
         df: pd.DataFrame,
-        id_column: …
-        timestamp_column: …
-        static_features_df: …
+        id_column: str | None = None,
+        timestamp_column: str | None = None,
+        static_features_df: pd.DataFrame | None = None,
     ) -> TimeSeriesDataFrame:
         """Construct a ``TimeSeriesDataFrame`` from a pandas DataFrame.
 
@@ -302,17 +308,17 @@ class TimeSeriesDataFrame(pd.DataFrame):
         Returns
         -------
         ts_df: TimeSeriesDataFrame
-            A …
+            A dataframe in TimeSeriesDataFrame format.
         """
         return cls(df, static_features=static_features_df, id_column=id_column, timestamp_column=timestamp_column)
 
     @classmethod
     def from_path(
         cls,
-        path: …
-        id_column: …
-        timestamp_column: …
-        static_features_path: …
+        path: str | Path,
+        id_column: str | None = None,
+        timestamp_column: str | None = None,
+        static_features_path: str | Path | None = None,
     ) -> TimeSeriesDataFrame:
         """Construct a ``TimeSeriesDataFrame`` from a CSV or Parquet file.
 
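A short usage sketch for ``from_path``; the file paths and column names below are hypothetical:

    from autogluon.timeseries import TimeSeriesDataFrame

    ts_df = TimeSeriesDataFrame.from_path(
        "sales.csv",  # hypothetical file; CSV and Parquet are supported
        id_column="product",
        timestamp_column="date",
        static_features_path="product_info.csv",  # hypothetical static features file
    )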
@@ -349,7 +355,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         Returns
         -------
         ts_df: TimeSeriesDataFrame
-            A …
+            A dataframe in TimeSeriesDataFrame format.
         """
         return cls(path, static_features=static_features_path, id_column=id_column, timestamp_column=timestamp_column)
 
@@ -378,20 +384,20 @@ class TimeSeriesDataFrame(pd.DataFrame):
         Returns
         -------
         ts_df: TimeSeriesDataFrame
-            A …
+            A dataframe in TimeSeriesDataFrame format.
         """
         return cls(iterable_dataset, num_cpus=num_cpus)
 
     @property
     def item_ids(self) -> pd.Index:
         """List of unique time series IDs contained in the data set."""
-        return self.index.unique(level=ITEMID)
+        return self.index.unique(level=self.ITEMID)
 
     @classmethod
     def _construct_static_features(
         cls,
-        static_features: …
-        id_column: …
+        static_features: pd.DataFrame | str | Path,
+        id_column: str | None = None,
     ) -> pd.DataFrame:
         if isinstance(static_features, (str, Path)):
             static_features = load_pd.load(str(static_features))
@@ -402,10 +408,12 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
         if id_column is not None:
             assert id_column in static_features.columns, f"Column '{id_column}' not found in static_features!"
-            if id_column != ITEMID and ITEMID in static_features.columns:
-                logger.warning(
-                    …
-                …
+            if id_column != cls.ITEMID and cls.ITEMID in static_features.columns:
+                logger.warning(
+                    f"Renaming existing column '{cls.ITEMID}' -> '__{cls.ITEMID}' to avoid name collisions."
+                )
+                static_features.rename(columns={cls.ITEMID: "__" + cls.ITEMID}, inplace=True)
+            static_features.rename(columns={id_column: cls.ITEMID}, inplace=True)
         return static_features
 
     @property
@@ -413,7 +421,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         return self._static_features
 
     @static_features.setter
-    def static_features(self, value: …
+    def static_features(self, value: pd.DataFrame | None):
         # if the current item index is not a multiindex, then we are dealing with a single
         # item slice. this should only happen when the user explicitly requests only a
         # single item or during `slice_by_timestep`. In this case we do not set static features
@@ -430,10 +438,10 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
         # Avoid modifying static features inplace
         value = value.copy()
-        if ITEMID in value.columns and value.index.name != ITEMID:
-            value = value.set_index(ITEMID)
-        if value.index.name != ITEMID:
-            value.index.rename(ITEMID, inplace=True)
+        if self.ITEMID in value.columns and value.index.name != self.ITEMID:
+            value = value.set_index(self.ITEMID)
+        if value.index.name != self.ITEMID:
+            value.index.rename(self.ITEMID, inplace=True)
         missing_item_ids = self.item_ids.difference(value.index)
         if len(missing_item_ids) > 0:
             raise ValueError(
@@ -446,7 +454,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
         self._static_features = value
 
-    def infer_frequency(self, num_items: …
+    def infer_frequency(self, num_items: int | None = None, raise_if_irregular: bool = False) -> str:
         """Infer the time series frequency based on the timestamps of the observations.
 
         Parameters
@@ -455,7 +463,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
             Number of items (individual time series) randomly selected to infer the frequency. Lower values speed up
             the method, but increase the chance that some items with invalid frequency are missed by subsampling.
 
-            If set to …
+            If set to ``None``, all items will be used for inferring the frequency.
         raise_if_irregular : bool, default = False
             If True, an exception will be raised if some items have an irregular frequency, or if different items have
             different frequencies.
@@ -466,61 +474,66 @@ class TimeSeriesDataFrame(pd.DataFrame):
             If all time series have a regular frequency, returns a pandas-compatible `frequency alias <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
 
             If some items have an irregular frequency or if different items have different frequencies, returns string
-            …
+            ``IRREG``.
         """
+        ts_df = self
+        if num_items is not None and ts_df.num_items > num_items:
+            items_subset = ts_df.item_ids.to_series().sample(n=num_items, random_state=123)
+            ts_df = ts_df.loc[items_subset]
+
+        if not ts_df.index.is_monotonic_increasing:
+            ts_df = ts_df.sort_index()
+
+        indptr = ts_df.get_indptr()
+        item_ids = ts_df.item_ids
+        timestamps = ts_df.index.get_level_values(level=1)
+        candidate_freq = ts_df.index.levels[1].freq
+
+        frequencies = []
+        irregular_items = []
+        for i in range(len(indptr) - 1):
+            start, end = indptr[i], indptr[i + 1]
+            item_timestamps = timestamps[start:end]
+            inferred_freq = item_timestamps.inferred_freq
 
-        df = pd.DataFrame(self)
-        if num_items is not None:
-            all_item_ids = self.item_ids
-            if len(all_item_ids) > num_items:
-                items_subset = all_item_ids.to_series().sample(n=num_items, random_state=123)
-                df = df.loc[items_subset]
-
-        candidate_freq = df.index.levels[1].freq
-        index_df = df.index.to_frame(index=False)
-
-        def get_freq(series: pd.Series) -> Optional[str]:
-            dt_index = pd.DatetimeIndex(series)
-            inferred_freq = dt_index.inferred_freq
             # Fallback option: maybe original index has a `freq` attribute that pandas fails to infer (e.g., 'SME')
             if inferred_freq is None and candidate_freq is not None:
                 try:
                     # If this line does not raise an exception, then candidate_freq is a compatible frequency
-                    …
+                    item_timestamps.freq = candidate_freq
                 except ValueError:
                     inferred_freq = None
                 else:
-                    inferred_freq = candidate_freq
-
+                    inferred_freq = candidate_freq.freqstr
+
+            if inferred_freq is None:
+                irregular_items.append(item_ids[i])
+            else:
+                frequencies.append(inferred_freq)
 
-        …
-        …
-        if len(set(freq_for_each_item)) > 1 or freq is None:
+        unique_freqs = list(set(frequencies))
+        if len(unique_freqs) != 1 or len(irregular_items) > 0:
             if raise_if_irregular:
-                …
-                if len(items_with_irregular_freq) > 0:
+                if irregular_items:
                     raise ValueError(
-                        "Cannot infer frequency. Items with irregular frequency: "
-                        f"{pformat(items_with_irregular_freq.index.tolist())}"
+                        f"Cannot infer frequency. Items with irregular frequency: {reprlib.repr(irregular_items)}"
                     )
                 else:
-                    raise ValueError(
-                        …
-                        …
-                    )
-            return IRREGULAR_TIME_INDEX_FREQSTR
+                    raise ValueError(f"Cannot infer frequency. Multiple frequencies detected: {unique_freqs}")
+            else:
+                return self.IRREGULAR_TIME_INDEX_FREQSTR
         else:
-            return pd.tseries.frequencies.to_offset(…
+            return pd.tseries.frequencies.to_offset(unique_freqs[0]).freqstr
 
     @property
     def freq(self):
-        """Inferred pandas-compatible frequency of the timestamps in the …
+        """Inferred pandas-compatible frequency of the timestamps in the dataframe.
 
         Computed using a random subset of the time series for speed. This may sometimes result in incorrectly inferred
         values. For reliable results, use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.infer_frequency`.
         """
         inferred_freq = self.infer_frequency(num_items=50)
-        return None if inferred_freq == IRREGULAR_TIME_INDEX_FREQSTR else inferred_freq
+        return None if inferred_freq == self.IRREGULAR_TIME_INDEX_FREQSTR else inferred_freq
 
     @property
     def num_items(self):
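The rewritten ``infer_frequency`` walks each series through CSR-style ``get_indptr()`` offsets instead of a per-item groupby. A usage sketch on made-up data, where one series has a gap:

    import pandas as pd
    from autogluon.timeseries import TimeSeriesDataFrame

    df = pd.DataFrame(
        {
            "item_id": ["A"] * 3 + ["B"] * 3,
            "timestamp": pd.to_datetime(
                ["2024-01-01 00:00", "2024-01-01 01:00", "2024-01-01 02:00",
                 "2024-01-01 00:00", "2024-01-01 01:00", "2024-01-01 03:00"]  # "B" has a gap
            ),
            "target": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        }
    )
    ts_df = TimeSeriesDataFrame(df)
    print(ts_df.infer_frequency())  # "IRREG" because item "B" is irregular
    print(ts_df.freq)               # None for irregular data
    ts_df.infer_frequency(raise_if_irregular=True)  # raises ValueError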
@@ -528,8 +541,13 @@ class TimeSeriesDataFrame(pd.DataFrame):
         return len(self.item_ids)
 
     def num_timesteps_per_item(self) -> pd.Series:
-        """…
-        …
+        """Number of observations in each time series in the dataframe.
+
+        Returns a ``pandas.Series`` with ``item_id`` as index and number of observations per item as values.
+        """
+        counts = pd.Series(self.index.codes[0]).value_counts(sort=False)
+        counts.index = self.index.levels[0][counts.index]
+        return counts
 
     def copy(self: TimeSeriesDataFrame, deep: bool = True) -> TimeSeriesDataFrame:
         """Make a copy of the TimeSeriesDataFrame.
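``num_timesteps_per_item`` now counts rows directly via the MultiIndex codes rather than grouping the frame. A small sketch of the result on made-up data:

    import pandas as pd
    from autogluon.timeseries import TimeSeriesDataFrame

    df = pd.DataFrame(
        {
            "item_id": ["A", "A", "A", "B"],
            "timestamp": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-01"]),
            "target": [1.0, 2.0, 3.0, 4.0],
        }
    )
    ts_df = TimeSeriesDataFrame(df)
    lengths = ts_df.num_timesteps_per_item()  # pd.Series mapping item_id -> length: "A" -> 3, "B" -> 1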
@@ -552,7 +570,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         return obj
 
     def __finalize__(  # noqa
-        self: TimeSeriesDataFrame, other, method: …
+        self: TimeSeriesDataFrame, other, method: str | None = None, **kwargs
     ) -> TimeSeriesDataFrame:
         super().__finalize__(other=other, method=method, **kwargs)
         # when finalizing the copy/slice operation, we use the property setter to stay consistent
@@ -561,13 +579,13 @@ class TimeSeriesDataFrame(pd.DataFrame):
         self.static_features = other._static_features
         return self
 
-    def split_by_time(self, cutoff_time: pd.Timestamp) -> …
+    def split_by_time(self, cutoff_time: pd.Timestamp) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
         """Split dataframe to two different ``TimeSeriesDataFrame`` s before and after a certain ``cutoff_time``.
 
         Parameters
         ----------
         cutoff_time: pd.Timestamp
-            The time to split the current …
+            The time to split the current dataframe into two dataframes.
 
         Returns
         -------
@@ -584,15 +602,14 @@ class TimeSeriesDataFrame(pd.DataFrame):
         after = TimeSeriesDataFrame(data_after, static_features=self.static_features)
         return before, after
 
-    def slice_by_timestep(
-        self, start_index: Optional[int] = None, end_index: Optional[int] = None
-    ) -> TimeSeriesDataFrame:
+    def slice_by_timestep(self, start_index: int | None = None, end_index: int | None = None) -> TimeSeriesDataFrame:
         """Select a subsequence from each time series between start (inclusive) and end (exclusive) indices.
 
         This operation is equivalent to selecting a slice ``[start_index : end_index]`` from each time series, and then
         combining these slices into a new ``TimeSeriesDataFrame``. See examples below.
 
-        …
+        It is recommended to sort the index with ``ts_df.sort_index()`` before calling this method to take advantage of
+        a fast optimized algorithm.
 
         Parameters
         ----------
@@ -679,10 +696,53 @@ class TimeSeriesDataFrame(pd.DataFrame):
         if end_index is not None and not isinstance(end_index, int):
             raise ValueError(f"end_index must be of type int or None (got {type(end_index)})")
 
-        …
-        …
-        …
-        …
+        if start_index is None and end_index is None:
+            # Return a copy to avoid in-place modification.
+            # self.copy() is much faster than self.loc[ones(len(self), dtype=bool)]
+            return self.copy()
+
+        if self.index.is_monotonic_increasing:
+            # Use a fast optimized algorithm if the index is sorted
+            indptr = self.get_indptr()
+            lengths = np.diff(indptr)
+            starts = indptr[:-1]
+
+            slice_start = (
+                np.zeros_like(lengths)
+                if start_index is None
+                else np.clip(np.where(start_index >= 0, start_index, lengths + start_index), 0, lengths)
+            )
+            slice_end = (
+                lengths.copy()
+                if end_index is None
+                else np.clip(np.where(end_index >= 0, end_index, lengths + end_index), 0, lengths)
+            )
+
+            # Filter out invalid slices where start >= end
+            valid_slices = slice_start < slice_end
+            if not np.any(valid_slices):
+                # Return empty dataframe with same structure
+                return self.loc[np.zeros(len(self), dtype=bool)]
+
+            starts = starts[valid_slices]
+            slice_start = slice_start[valid_slices]
+            slice_end = slice_end[valid_slices]
+
+            # We put 1 at the slice_start index for each item and -1 at the slice_end index for each item.
+            # After we apply cumsum we get the indicator mask selecting values between slice_start and slice_end
+            # cumsum([0, 0, 1, 0, 0, -1, 0]) -> [0, 0, 1, 1, 1, 0, 0]
+            # We need array of size len(self) + 1 in case events[starts + slice_end] tries to access position len(self)
+            events = np.zeros(len(self) + 1, dtype=np.int8)
+            events[starts + slice_start] += 1
+            events[starts + slice_end] -= 1
+            mask = np.cumsum(events)[:-1].astype(bool)
+            # loc[mask] returns a view of the original data - modifying it will produce a SettingWithCopyWarning
+            return self.loc[mask]
+        else:
+            # Fall back to a slow groupby operation
+            result = self.groupby(level=self.ITEMID, sort=False, as_index=False).nth(slice(start_index, end_index))
+            result.static_features = self.static_features
+            return result
 
     def slice_by_time(self, start_time: pd.Timestamp, end_time: pd.Timestamp) -> TimeSeriesDataFrame:
         """Select a subsequence from each time series between start (inclusive) and end (exclusive) timestamps.
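The sorted-index fast path above avoids any groupby: it marks slice boundaries in an events array and recovers a boolean row mask with a single cumulative sum. A self-contained numpy sketch of that trick on a made-up layout (two series of lengths 4 and 3, selecting ``[1:3]`` from each):

    import numpy as np

    indptr = np.array([0, 4, 7])   # row offsets of two sorted series: [0, 4) and [4, 7)
    lengths = np.diff(indptr)      # array([4, 3])
    starts = indptr[:-1]           # array([0, 4])

    start_index, end_index = 1, 3  # select [1:3] from each series
    slice_start = np.clip(np.where(start_index >= 0, start_index, lengths + start_index), 0, lengths)
    slice_end = np.clip(np.where(end_index >= 0, end_index, lengths + end_index), 0, lengths)

    # +1 opens a slice, -1 closes it; the cumulative sum is 1 inside each [start, end) window
    events = np.zeros(indptr[-1] + 1, dtype=np.int8)
    events[starts + slice_start] += 1
    events[starts + slice_end] -= 1
    mask = np.cumsum(events)[:-1].astype(bool)
    print(mask)  # [False  True  True False False  True  True]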
@@ -711,7 +771,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
     @classmethod
     def from_pickle(cls, filepath_or_buffer: Any) -> TimeSeriesDataFrame:
-        """Convenience method to read pickled time series …
+        """Convenience method to read pickled time series dataframes. If the read pickle
         file refers to a plain pandas DataFrame, it will be cast to a TimeSeriesDataFrame.
 
         Parameters
@@ -722,7 +782,7 @@ class TimeSeriesDataFrame(pd.DataFrame):
         Returns
         -------
         ts_df : TimeSeriesDataFrame
-            The pickled time series …
+            The pickled time series dataframe.
         """
         try:
             data = pd.read_pickle(filepath_or_buffer)
@@ -733,16 +793,21 @@ class TimeSeriesDataFrame(pd.DataFrame):
     def fill_missing_values(self, method: str = "auto", value: float = 0.0) -> TimeSeriesDataFrame:
         """Fill missing values represented by NaN.
 
+        .. note::
+            This method assumes that the index of the TimeSeriesDataFrame is sorted by [item_id, timestamp].
+
+            If the index is not sorted, this method will log a warning and may produce an incorrect result.
+
         Parameters
         ----------
         method : str, default = "auto"
             Method used to impute missing values.
 
-            - "auto" - first forward fill (to fill the in-between and trailing NaNs), then backward fill (to fill the leading NaNs)
-            - "ffill" or "pad" - propagate last valid observation forward. Note: missing values at the start of the time series are not filled.
-            - "bfill" or "backfill" - use next valid observation to fill gap. Note: this may result in information leakage; missing values at the end of the time series are not filled.
-            - "constant" - replace NaNs with the given constant ``value``.
-            - "interpolate" - fill NaN values using linear interpolation. Note: this may result in information leakage.
+            - ``"auto"`` - first forward fill (to fill the in-between and trailing NaNs), then backward fill (to fill the leading NaNs)
+            - ``"ffill"`` or ``"pad"`` - propagate last valid observation forward. Note: missing values at the start of the time series are not filled.
+            - ``"bfill"`` or ``"backfill"`` - use next valid observation to fill gap. Note: this may result in information leakage; missing values at the end of the time series are not filled.
+            - ``"constant"`` - replace NaNs with the given constant ``value``.
+            - ``"interpolate"`` - fill NaN values using linear interpolation. Note: this may result in information leakage.
         value : float, default = 0.0
             Value used by the "constant" imputation method.
 
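A usage sketch of the documented fill methods, on a tiny made-up series with a leading and an in-between NaN:

    import numpy as np
    import pandas as pd
    from autogluon.timeseries import TimeSeriesDataFrame

    df = pd.DataFrame(
        {
            "item_id": ["A"] * 4,
            "timestamp": pd.date_range("2024-01-01", periods=4, freq="D"),
            "target": [np.nan, 1.0, np.nan, 3.0],
        }
    )
    ts_df = TimeSeriesDataFrame(df)
    # "auto": forward fill first, then backward fill for the leading NaN
    print(ts_df.fill_missing_values(method="auto")["target"].tolist())      # [1.0, 1.0, 1.0, 3.0]
    print(ts_df.fill_missing_values(method="constant")["target"].tolist())  # [0.0, 1.0, 0.0, 3.0]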
@@ -792,12 +857,12 @@ class TimeSeriesDataFrame(pd.DataFrame):
                 "It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`"
             )
 
-        grouped_df = df.groupby(level=ITEMID, sort=False, group_keys=False)
+        grouped_df = df.groupby(level=self.ITEMID, sort=False, group_keys=False)
         if method == "auto":
             filled_df = grouped_df.ffill()
             # If necessary, fill missing values at the start of each time series with bfill
             if filled_df.isna().any(axis=None):
-                filled_df = filled_df.groupby(level=ITEMID, sort=False, group_keys=False).bfill()
+                filled_df = filled_df.groupby(level=self.ITEMID, sort=False, group_keys=False).bfill()
         elif method in ["ffill", "pad"]:
             filled_df = grouped_df.ffill()
         elif method in ["bfill", "backfill"]:
@@ -840,17 +905,17 @@ class TimeSeriesDataFrame(pd.DataFrame):
         return super().sort_index(*args, **kwargs)  # type: ignore
 
     def get_model_inputs_for_scoring(
-        self, prediction_length: int, known_covariates_names: …
-    ) -> …
+        self, prediction_length: int, known_covariates_names: list[str] | None = None
+    ) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame | None]:
         """Prepare model inputs necessary to predict the last ``prediction_length`` time steps of each time series in the dataset.
 
         Parameters
         ----------
         prediction_length : int
             The forecast horizon, i.e., How many time steps into the future must be predicted.
-        known_covariates_names : …
+        known_covariates_names : list[str], optional
             Names of the dataframe columns that contain covariates known in the future.
-            See …
+            See ``known_covariates_names`` of :class:`~autogluon.timeseries.TimeSeriesPredictor` for more details.
 
         Returns
         -------
@@ -871,12 +936,16 @@ class TimeSeriesDataFrame(pd.DataFrame):
     def train_test_split(
         self,
         prediction_length: int,
-        end_index: …
-        suffix: …
-    ) -> …
+        end_index: int | None = None,
+        suffix: str | None = None,
+    ) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
         """Generate a train/test split from the given dataset.
+
         This method can be used to generate splits for multi-window backtesting.
 
+        .. note::
+            This method automatically sorts the TimeSeriesDataFrame by [item_id, timestamp].
+
         Parameters
         ----------
         prediction_length : int
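A usage sketch, assuming the usual AutoGluon split semantics where ``test_data`` keeps the full series and ``train_data`` drops the last ``prediction_length`` steps of each item:

    import pandas as pd
    from autogluon.timeseries import TimeSeriesDataFrame

    df = pd.DataFrame(
        {
            "item_id": ["A"] * 10,
            "timestamp": pd.date_range("2024-01-01", periods=10, freq="D"),
            "target": [float(i) for i in range(10)],
        }
    )
    ts_df = TimeSeriesDataFrame(df)
    train_data, test_data = ts_df.train_test_split(prediction_length=3)
    print(len(train_data), len(test_data))  # 7 10, under the semantics assumed above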
@@ -913,14 +982,14 @@ class TimeSeriesDataFrame(pd.DataFrame):
 
     def convert_frequency(
         self,
-        freq: …
+        freq: str | pd.DateOffset,
         agg_numeric: str = "mean",
         agg_categorical: str = "first",
         num_cpus: int = -1,
         chunk_size: int = 100,
         **kwargs,
     ) -> TimeSeriesDataFrame:
-        """Convert each time series in the …
+        """Convert each time series in the dataframe to the given frequency.
 
         This method is useful for two purposes:
 
@@ -930,10 +999,9 @@ class TimeSeriesDataFrame(pd.DataFrame):
         Standard ``df.groupby(...).resample(...)`` can be extremely slow for large datasets, so we parallelize this
         operation across multiple CPU cores.
 
-        …
         Parameters
         ----------
-        freq : …
+        freq : str | pd.DateOffset
            Frequency to which the data should be converted. See `pandas frequency aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
            for supported values.
         agg_numeric : {"max", "min", "sum", "mean", "median", "first", "last"}, default = "mean"
@@ -1020,21 +1088,68 @@ class TimeSeriesDataFrame(pd.DataFrame):
             iterable = iter(iterable)
             return iter(lambda: tuple(islice(iterable, size)), ())
 
-        def resample_chunk(chunk: Iterable[…
+        def resample_chunk(chunk: Iterable[tuple[str, pd.DataFrame]]) -> pd.DataFrame:
             resampled_dfs = []
             for item_id, df in chunk:
-                resampled_df = df.resample(offset, level=TIMESTAMP, **kwargs).agg(aggregation)
-                resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[ITEMID]))
+                resampled_df = df.resample(offset, level=self.TIMESTAMP, **kwargs).agg(aggregation)
+                resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[self.ITEMID]))
             return pd.concat(resampled_dfs)
 
         # Resampling time for 1 item < overhead time for a single parallel job. Therefore, we group items into chunks
         # so that the speedup from parallelization isn't dominated by the communication costs.
-        …
+        df = pd.DataFrame(self)
+        # Make sure that timestamp index has dtype 'datetime64[ns]', otherwise index may contain NaT values.
+        # See https://github.com/autogluon/autogluon/issues/4917
+        df.index = df.index.set_levels(df.index.levels[1].astype("datetime64[ns]"), level=self.TIMESTAMP)
+        chunks = split_into_chunks(df.groupby(level=self.ITEMID, sort=False), chunk_size)
         resampled_chunks = Parallel(n_jobs=num_cpus)(delayed(resample_chunk)(chunk) for chunk in chunks)
         resampled_df = TimeSeriesDataFrame(pd.concat(resampled_chunks))
         resampled_df.static_features = self.static_features
         return resampled_df
 
     def to_data_frame(self) -> pd.DataFrame:
-        """Convert …
+        """Convert ``TimeSeriesDataFrame`` to a ``pandas.DataFrame``"""
         return pd.DataFrame(self)
+
+    def get_indptr(self) -> np.ndarray:
+        """[Advanced] Get a numpy array of shape [num_items + 1] that points to the start and end of each time series.
+
+        This method assumes that the TimeSeriesDataFrame is sorted by [item_id, timestamp].
+        """
+        return np.concatenate([[0], np.cumsum(self.num_timesteps_per_item().to_numpy())]).astype(np.int32)
+
+    # inline typing stubs for various overridden methods
+    if TYPE_CHECKING:
+
+        def query(  # type: ignore
+            self, expr: str, *, inplace: bool = False, **kwargs
+        ) -> Self: ...
+
+        def reindex(*args, **kwargs) -> Self: ...  # type: ignore
+
+        @overload
+        def __new__(cls, data: pd.DataFrame, static_features: pd.DataFrame | None = None) -> Self: ...  # type: ignore
+        @overload
+        def __new__(
+            cls,
+            data: pd.DataFrame | str | Path | Iterable,
+            static_features: pd.DataFrame | str | Path | None = None,
+            id_column: str | None = None,
+            timestamp_column: str | None = None,
+            num_cpus: int = -1,
+            *args,
+            **kwargs,
+        ) -> Self:
+            """This overload is needed since in pandas, during type checking, the default constructor resolves to __new__"""
+            ...
+
+        @overload
+        def __getitem__(self, items: list[str]) -> Self: ...  # type: ignore
+        @overload
+        def __getitem__(self, item: str) -> pd.Series: ...  # type: ignore
+
+
+# TODO: remove with v2.0
+# module-level constants kept for backward compatibility.
+ITEMID = TimeSeriesDataFrame.ITEMID
+TIMESTAMP = TimeSeriesDataFrame.TIMESTAMP