autogluon.timeseries 1.0.1b20240304__py3-none-any.whl → 1.4.1b20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of autogluon.timeseries might be problematic. Click here for more details.
- autogluon/timeseries/configs/__init__.py +3 -2
- autogluon/timeseries/configs/hyperparameter_presets.py +62 -0
- autogluon/timeseries/configs/predictor_presets.py +84 -0
- autogluon/timeseries/dataset/ts_dataframe.py +339 -186
- autogluon/timeseries/learner.py +192 -60
- autogluon/timeseries/metrics/__init__.py +55 -11
- autogluon/timeseries/metrics/abstract.py +96 -25
- autogluon/timeseries/metrics/point.py +186 -39
- autogluon/timeseries/metrics/quantile.py +47 -20
- autogluon/timeseries/metrics/utils.py +6 -6
- autogluon/timeseries/models/__init__.py +13 -7
- autogluon/timeseries/models/abstract/__init__.py +2 -2
- autogluon/timeseries/models/abstract/abstract_timeseries_model.py +533 -273
- autogluon/timeseries/models/abstract/model_trial.py +10 -10
- autogluon/timeseries/models/abstract/tunable.py +189 -0
- autogluon/timeseries/models/autogluon_tabular/__init__.py +2 -0
- autogluon/timeseries/models/autogluon_tabular/mlforecast.py +369 -215
- autogluon/timeseries/models/autogluon_tabular/per_step.py +513 -0
- autogluon/timeseries/models/autogluon_tabular/transforms.py +67 -0
- autogluon/timeseries/models/autogluon_tabular/utils.py +3 -51
- autogluon/timeseries/models/chronos/__init__.py +4 -0
- autogluon/timeseries/models/chronos/chronos2.py +361 -0
- autogluon/timeseries/models/chronos/model.py +738 -0
- autogluon/timeseries/models/chronos/utils.py +369 -0
- autogluon/timeseries/models/ensemble/__init__.py +35 -2
- autogluon/timeseries/models/ensemble/{abstract_timeseries_ensemble.py → abstract.py} +50 -26
- autogluon/timeseries/models/ensemble/array_based/__init__.py +3 -0
- autogluon/timeseries/models/ensemble/array_based/abstract.py +236 -0
- autogluon/timeseries/models/ensemble/array_based/models.py +73 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/__init__.py +12 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/abstract.py +88 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/linear_stacker.py +167 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/per_quantile_tabular.py +94 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/tabular.py +107 -0
- autogluon/timeseries/models/ensemble/ensemble_selection.py +167 -0
- autogluon/timeseries/models/ensemble/per_item_greedy.py +162 -0
- autogluon/timeseries/models/ensemble/weighted/__init__.py +8 -0
- autogluon/timeseries/models/ensemble/weighted/abstract.py +40 -0
- autogluon/timeseries/models/ensemble/weighted/basic.py +78 -0
- autogluon/timeseries/models/ensemble/weighted/greedy.py +57 -0
- autogluon/timeseries/models/gluonts/__init__.py +3 -1
- autogluon/timeseries/models/gluonts/abstract.py +583 -0
- autogluon/timeseries/models/gluonts/dataset.py +109 -0
- autogluon/timeseries/models/gluonts/{torch/models.py → models.py} +185 -44
- autogluon/timeseries/models/local/__init__.py +1 -10
- autogluon/timeseries/models/local/abstract_local_model.py +150 -97
- autogluon/timeseries/models/local/naive.py +31 -23
- autogluon/timeseries/models/local/npts.py +6 -2
- autogluon/timeseries/models/local/statsforecast.py +99 -112
- autogluon/timeseries/models/multi_window/multi_window_model.py +99 -40
- autogluon/timeseries/models/registry.py +64 -0
- autogluon/timeseries/models/toto/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/__init__.py +9 -0
- autogluon/timeseries/models/toto/_internal/backbone/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/backbone/attention.py +196 -0
- autogluon/timeseries/models/toto/_internal/backbone/backbone.py +262 -0
- autogluon/timeseries/models/toto/_internal/backbone/distribution.py +70 -0
- autogluon/timeseries/models/toto/_internal/backbone/kvcache.py +136 -0
- autogluon/timeseries/models/toto/_internal/backbone/rope.py +89 -0
- autogluon/timeseries/models/toto/_internal/backbone/rotary_embedding_torch.py +342 -0
- autogluon/timeseries/models/toto/_internal/backbone/scaler.py +305 -0
- autogluon/timeseries/models/toto/_internal/backbone/transformer.py +333 -0
- autogluon/timeseries/models/toto/_internal/dataset.py +165 -0
- autogluon/timeseries/models/toto/_internal/forecaster.py +423 -0
- autogluon/timeseries/models/toto/dataloader.py +108 -0
- autogluon/timeseries/models/toto/hf_pretrained_model.py +118 -0
- autogluon/timeseries/models/toto/model.py +236 -0
- autogluon/timeseries/predictor.py +826 -305
- autogluon/timeseries/regressor.py +253 -0
- autogluon/timeseries/splitter.py +10 -31
- autogluon/timeseries/trainer/__init__.py +2 -3
- autogluon/timeseries/trainer/ensemble_composer.py +439 -0
- autogluon/timeseries/trainer/model_set_builder.py +256 -0
- autogluon/timeseries/trainer/prediction_cache.py +149 -0
- autogluon/timeseries/trainer/trainer.py +1298 -0
- autogluon/timeseries/trainer/utils.py +17 -0
- autogluon/timeseries/transforms/__init__.py +2 -0
- autogluon/timeseries/transforms/covariate_scaler.py +164 -0
- autogluon/timeseries/transforms/target_scaler.py +149 -0
- autogluon/timeseries/utils/constants.py +10 -0
- autogluon/timeseries/utils/datetime/base.py +38 -20
- autogluon/timeseries/utils/datetime/lags.py +18 -16
- autogluon/timeseries/utils/datetime/seasonality.py +14 -14
- autogluon/timeseries/utils/datetime/time_features.py +17 -14
- autogluon/timeseries/utils/features.py +317 -53
- autogluon/timeseries/utils/forecast.py +31 -17
- autogluon/timeseries/utils/timer.py +173 -0
- autogluon/timeseries/utils/warning_filters.py +44 -6
- autogluon/timeseries/version.py +2 -1
- autogluon.timeseries-1.4.1b20251210-py3.11-nspkg.pth +1 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/METADATA +71 -47
- autogluon_timeseries-1.4.1b20251210.dist-info/RECORD +103 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/WHEEL +1 -1
- autogluon/timeseries/configs/presets_configs.py +0 -11
- autogluon/timeseries/evaluator.py +0 -6
- autogluon/timeseries/models/ensemble/greedy_ensemble.py +0 -170
- autogluon/timeseries/models/gluonts/abstract_gluonts.py +0 -550
- autogluon/timeseries/models/gluonts/torch/__init__.py +0 -0
- autogluon/timeseries/models/presets.py +0 -325
- autogluon/timeseries/trainer/abstract_trainer.py +0 -1144
- autogluon/timeseries/trainer/auto_trainer.py +0 -74
- autogluon.timeseries-1.0.1b20240304-py3.8-nspkg.pth +0 -1
- autogluon.timeseries-1.0.1b20240304.dist-info/RECORD +0 -58
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/LICENSE +0 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/NOTICE +0 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/namespace_packages.txt +0 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/top_level.txt +0 -0
- {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/zip-safe +0 -0
|
@@ -7,42 +7,23 @@ import reprlib
|
|
|
7
7
|
from collections.abc import Iterable
|
|
8
8
|
from itertools import islice
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Final, Type, overload
|
|
11
11
|
|
|
12
|
+
import numpy as np
|
|
12
13
|
import pandas as pd
|
|
13
14
|
from joblib.parallel import Parallel, delayed
|
|
14
|
-
from pandas.core.internals import ArrayManager, BlockManager
|
|
15
|
+
from pandas.core.internals import ArrayManager, BlockManager # type: ignore
|
|
16
|
+
from typing_extensions import Self
|
|
15
17
|
|
|
16
18
|
from autogluon.common.loaders import load_pd
|
|
17
19
|
|
|
18
20
|
logger = logging.getLogger(__name__)
|
|
19
21
|
|
|
20
|
-
ITEMID = "item_id"
|
|
21
|
-
TIMESTAMP = "timestamp"
|
|
22
22
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class TimeSeriesDataFrameDeprecatedMixin:
|
|
27
|
-
"""Contains deprecated methods from TimeSeriesDataFrame that shouldn't show up in API documentation."""
|
|
28
|
-
|
|
29
|
-
def get_reindexed_view(self, *args, **kwargs) -> TimeSeriesDataFrame:
|
|
30
|
-
raise ValueError(
|
|
31
|
-
"`TimeSeriesDataFrame.get_reindexed_view` has been deprecated. If your data has irregular timestamps, "
|
|
32
|
-
"please convert it to a regular frequency with `convert_frequency`."
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
def to_regular_index(self, *args, **kwargs) -> TimeSeriesDataFrame:
|
|
36
|
-
raise ValueError(
|
|
37
|
-
"`TimeSeriesDataFrame.to_regular_index` has been deprecated. "
|
|
38
|
-
"Please use `TimeSeriesDataFrame.convert_frequency` instead."
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
23
|
+
class TimeSeriesDataFrame(pd.DataFrame):
|
|
43
24
|
"""A collection of univariate time series, where each row is identified by an (``item_id``, ``timestamp``) pair.
|
|
44
25
|
|
|
45
|
-
For example, a time series
|
|
26
|
+
For example, a time series dataframe could represent the daily sales of a collection of products, where each
|
|
46
27
|
``item_id`` corresponds to a product and ``timestamp`` corresponds to the day of the record.
|
|
47
28
|
|
|
48
29
|
Parameters
|
|
@@ -92,7 +73,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
92
73
|
You can also use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.from_iterable_dataset` for loading data in such format.
|
|
93
74
|
|
|
94
75
|
static_features : pd.DataFrame, str or pathlib.Path, optional
|
|
95
|
-
An optional
|
|
76
|
+
An optional dataframe describing the metadata of each individual time series that does not change with time.
|
|
96
77
|
Can take real-valued or categorical values. For example, if ``TimeSeriesDataFrame`` contains sales of various
|
|
97
78
|
products, static features may refer to time-independent features like color or brand.
|
|
98
79
|
|
|
@@ -130,27 +111,21 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
130
111
|
Number of CPU cores used to process the iterable dataset in parallel. Set to -1 to use all cores. This argument
|
|
131
112
|
is only used when constructing a TimeSeriesDataFrame using format 4 (iterable dataset).
|
|
132
113
|
|
|
133
|
-
Attributes
|
|
134
|
-
----------
|
|
135
|
-
freq : str
|
|
136
|
-
A pandas-compatible string describing the frequency of the time series. For example ``"D"`` for daily data,
|
|
137
|
-
``"H"`` for hourly data, etc. This attribute is determined automatically based on the timestamps. For the full
|
|
138
|
-
list of possible values, see `pandas documentation <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
|
|
139
|
-
num_items : int
|
|
140
|
-
Number of items (time series) in the data set.
|
|
141
|
-
item_ids : pd.Index
|
|
142
|
-
List of unique time series IDs contained in the data set.
|
|
143
114
|
"""
|
|
144
115
|
|
|
145
|
-
index: pd.MultiIndex
|
|
146
|
-
_metadata = ["_static_features"
|
|
116
|
+
index: pd.MultiIndex # type: ignore
|
|
117
|
+
_metadata = ["_static_features"]
|
|
118
|
+
|
|
119
|
+
IRREGULAR_TIME_INDEX_FREQSTR: Final[str] = "IRREG"
|
|
120
|
+
ITEMID: Final[str] = "item_id"
|
|
121
|
+
TIMESTAMP: Final[str] = "timestamp"
|
|
147
122
|
|
|
148
123
|
def __init__(
|
|
149
124
|
self,
|
|
150
|
-
data:
|
|
151
|
-
static_features:
|
|
152
|
-
id_column:
|
|
153
|
-
timestamp_column:
|
|
125
|
+
data: pd.DataFrame | str | Path | Iterable,
|
|
126
|
+
static_features: pd.DataFrame | str | Path | None = None,
|
|
127
|
+
id_column: str | None = None,
|
|
128
|
+
timestamp_column: str | None = None,
|
|
154
129
|
num_cpus: int = -1,
|
|
155
130
|
*args,
|
|
156
131
|
**kwargs,
|
|
@@ -173,17 +148,11 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
173
148
|
data = self._construct_tsdf_from_iterable_dataset(data, num_cpus=num_cpus)
|
|
174
149
|
else:
|
|
175
150
|
raise ValueError(f"data must be a pd.DataFrame, Iterable, string or Path (received {type(data)}).")
|
|
176
|
-
super().__init__(data=data, *args, **kwargs)
|
|
177
|
-
self._static_features:
|
|
151
|
+
super().__init__(data=data, *args, **kwargs) # type: ignore
|
|
152
|
+
self._static_features: pd.DataFrame | None = None
|
|
178
153
|
if static_features is not None:
|
|
179
154
|
self.static_features = self._construct_static_features(static_features, id_column=id_column)
|
|
180
155
|
|
|
181
|
-
# internal value for cached frequency values that are inferred. corresponds to either a
|
|
182
|
-
# pandas-compatible frequency string, the value IRREGULAR_TIME_INDEX_FREQSTR that signals
|
|
183
|
-
# the time series have irregular timestamps (in which case tsdf.freq returns None), or None
|
|
184
|
-
# if inference was not yet performed.
|
|
185
|
-
self._cached_freq: Optional[str] = None
|
|
186
|
-
|
|
187
156
|
@property
|
|
188
157
|
def _constructor(self) -> Type[TimeSeriesDataFrame]:
|
|
189
158
|
return TimeSeriesDataFrame
|
|
@@ -193,36 +162,39 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
193
162
|
# repeatedly calling TimeSeriesDataFrame constructor
|
|
194
163
|
df = self._from_mgr(mgr, axes=axes)
|
|
195
164
|
df._static_features = self._static_features
|
|
196
|
-
df._cached_freq = self._cached_freq
|
|
197
165
|
return df
|
|
198
166
|
|
|
199
167
|
@classmethod
|
|
200
168
|
def _construct_tsdf_from_data_frame(
|
|
201
169
|
cls,
|
|
202
170
|
df: pd.DataFrame,
|
|
203
|
-
id_column:
|
|
204
|
-
timestamp_column:
|
|
171
|
+
id_column: str | None = None,
|
|
172
|
+
timestamp_column: str | None = None,
|
|
205
173
|
) -> pd.DataFrame:
|
|
206
174
|
df = df.copy()
|
|
207
175
|
if id_column is not None:
|
|
208
176
|
assert id_column in df.columns, f"Column '{id_column}' not found!"
|
|
209
|
-
if id_column != ITEMID and ITEMID in df.columns:
|
|
210
|
-
logger.warning(
|
|
211
|
-
|
|
212
|
-
|
|
177
|
+
if id_column != cls.ITEMID and cls.ITEMID in df.columns:
|
|
178
|
+
logger.warning(
|
|
179
|
+
f"Renaming existing column '{cls.ITEMID}' -> '__{cls.ITEMID}' to avoid name collisions."
|
|
180
|
+
)
|
|
181
|
+
df.rename(columns={cls.ITEMID: "__" + cls.ITEMID}, inplace=True)
|
|
182
|
+
df.rename(columns={id_column: cls.ITEMID}, inplace=True)
|
|
213
183
|
|
|
214
184
|
if timestamp_column is not None:
|
|
215
185
|
assert timestamp_column in df.columns, f"Column '{timestamp_column}' not found!"
|
|
216
|
-
if timestamp_column != TIMESTAMP and TIMESTAMP in df.columns:
|
|
217
|
-
logger.warning(
|
|
218
|
-
|
|
219
|
-
|
|
186
|
+
if timestamp_column != cls.TIMESTAMP and cls.TIMESTAMP in df.columns:
|
|
187
|
+
logger.warning(
|
|
188
|
+
f"Renaming existing column '{cls.TIMESTAMP}' -> '__{cls.TIMESTAMP}' to avoid name collisions."
|
|
189
|
+
)
|
|
190
|
+
df.rename(columns={cls.TIMESTAMP: "__" + cls.TIMESTAMP}, inplace=True)
|
|
191
|
+
df.rename(columns={timestamp_column: cls.TIMESTAMP}, inplace=True)
|
|
220
192
|
|
|
221
|
-
if TIMESTAMP in df.columns:
|
|
222
|
-
df[TIMESTAMP] = pd.to_datetime(df[TIMESTAMP])
|
|
193
|
+
if cls.TIMESTAMP in df.columns:
|
|
194
|
+
df[cls.TIMESTAMP] = pd.to_datetime(df[cls.TIMESTAMP])
|
|
223
195
|
|
|
224
196
|
cls._validate_data_frame(df)
|
|
225
|
-
return df.set_index([ITEMID, TIMESTAMP])
|
|
197
|
+
return df.set_index([cls.ITEMID, cls.TIMESTAMP])
|
|
226
198
|
|
|
227
199
|
@classmethod
|
|
228
200
|
def _construct_tsdf_from_iterable_dataset(cls, iterable_dataset: Iterable, num_cpus: int = -1) -> pd.DataFrame:
|
|
@@ -233,7 +205,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
233
205
|
start_timestamp = start_timestamp.to_timestamp(how="S")
|
|
234
206
|
target = ts["target"]
|
|
235
207
|
datetime_index = tuple(pd.date_range(start_timestamp, periods=len(target), freq=freq))
|
|
236
|
-
idx = pd.MultiIndex.from_product([(item_id,), datetime_index], names=[ITEMID, TIMESTAMP])
|
|
208
|
+
idx = pd.MultiIndex.from_product([(item_id,), datetime_index], names=[cls.ITEMID, cls.TIMESTAMP])
|
|
237
209
|
return pd.Series(target, name="target", index=idx).to_frame()
|
|
238
210
|
|
|
239
211
|
cls._validate_iterable(iterable_dataset)
|
|
@@ -250,32 +222,34 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
250
222
|
raise ValueError(f"data must be a pd.DataFrame, got {type(data)}")
|
|
251
223
|
if not isinstance(data.index, pd.MultiIndex):
|
|
252
224
|
raise ValueError(f"data must have pd.MultiIndex, got {type(data.index)}")
|
|
253
|
-
if not pd.api.types.is_datetime64_dtype(data.index.dtypes[TIMESTAMP]):
|
|
254
|
-
raise ValueError(f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
|
|
255
|
-
if not data.index.names == (f"{ITEMID}", f"{TIMESTAMP}"):
|
|
256
|
-
raise ValueError(
|
|
257
|
-
|
|
225
|
+
if not pd.api.types.is_datetime64_dtype(data.index.dtypes[cls.TIMESTAMP]):
|
|
226
|
+
raise ValueError(f"for {cls.TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
|
|
227
|
+
if not data.index.names == (f"{cls.ITEMID}", f"{cls.TIMESTAMP}"):
|
|
228
|
+
raise ValueError(
|
|
229
|
+
f"data must have index names as ('{cls.ITEMID}', '{cls.TIMESTAMP}'), got {data.index.names}"
|
|
230
|
+
)
|
|
231
|
+
item_id_index = data.index.levels[0]
|
|
258
232
|
if not (pd.api.types.is_integer_dtype(item_id_index) or pd.api.types.is_string_dtype(item_id_index)):
|
|
259
|
-
raise ValueError(f"all entries in index `{ITEMID}` must be of integer or string dtype")
|
|
233
|
+
raise ValueError(f"all entries in index `{cls.ITEMID}` must be of integer or string dtype")
|
|
260
234
|
|
|
261
235
|
@classmethod
|
|
262
236
|
def _validate_data_frame(cls, df: pd.DataFrame):
|
|
263
237
|
"""Validate that a pd.DataFrame with ITEMID and TIMESTAMP columns can be converted to TimeSeriesDataFrame"""
|
|
264
238
|
if not isinstance(df, pd.DataFrame):
|
|
265
239
|
raise ValueError(f"data must be a pd.DataFrame, got {type(df)}")
|
|
266
|
-
if ITEMID not in df.columns:
|
|
267
|
-
raise ValueError(f"data must have a `{ITEMID}` column")
|
|
268
|
-
if TIMESTAMP not in df.columns:
|
|
269
|
-
raise ValueError(f"data must have a `{TIMESTAMP}` column")
|
|
270
|
-
if df[ITEMID].isnull().any():
|
|
271
|
-
raise ValueError(f"`{ITEMID}` column can not have nan")
|
|
272
|
-
if df[TIMESTAMP].isnull().any():
|
|
273
|
-
raise ValueError(f"`{TIMESTAMP}` column can not have nan")
|
|
274
|
-
if not pd.api.types.is_datetime64_dtype(df[TIMESTAMP]):
|
|
275
|
-
raise ValueError(f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
|
|
276
|
-
item_id_column = df[ITEMID]
|
|
240
|
+
if cls.ITEMID not in df.columns:
|
|
241
|
+
raise ValueError(f"data must have a `{cls.ITEMID}` column")
|
|
242
|
+
if cls.TIMESTAMP not in df.columns:
|
|
243
|
+
raise ValueError(f"data must have a `{cls.TIMESTAMP}` column")
|
|
244
|
+
if df[cls.ITEMID].isnull().any():
|
|
245
|
+
raise ValueError(f"`{cls.ITEMID}` column can not have nan")
|
|
246
|
+
if df[cls.TIMESTAMP].isnull().any():
|
|
247
|
+
raise ValueError(f"`{cls.TIMESTAMP}` column can not have nan")
|
|
248
|
+
if not pd.api.types.is_datetime64_dtype(df[cls.TIMESTAMP]):
|
|
249
|
+
raise ValueError(f"for {cls.TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
|
|
250
|
+
item_id_column = df[cls.ITEMID]
|
|
277
251
|
if not (pd.api.types.is_integer_dtype(item_id_column) or pd.api.types.is_string_dtype(item_id_column)):
|
|
278
|
-
raise ValueError(f"all entries in column `{ITEMID}` must be of integer or string dtype")
|
|
252
|
+
raise ValueError(f"all entries in column `{cls.ITEMID}` must be of integer or string dtype")
|
|
279
253
|
|
|
280
254
|
@classmethod
|
|
281
255
|
def _validate_iterable(cls, data: Iterable):
|
|
@@ -298,9 +272,9 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
298
272
|
def from_data_frame(
|
|
299
273
|
cls,
|
|
300
274
|
df: pd.DataFrame,
|
|
301
|
-
id_column:
|
|
302
|
-
timestamp_column:
|
|
303
|
-
static_features_df:
|
|
275
|
+
id_column: str | None = None,
|
|
276
|
+
timestamp_column: str | None = None,
|
|
277
|
+
static_features_df: pd.DataFrame | None = None,
|
|
304
278
|
) -> TimeSeriesDataFrame:
|
|
305
279
|
"""Construct a ``TimeSeriesDataFrame`` from a pandas DataFrame.
|
|
306
280
|
|
|
@@ -334,17 +308,17 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
334
308
|
Returns
|
|
335
309
|
-------
|
|
336
310
|
ts_df: TimeSeriesDataFrame
|
|
337
|
-
A
|
|
311
|
+
A dataframe in TimeSeriesDataFrame format.
|
|
338
312
|
"""
|
|
339
313
|
return cls(df, static_features=static_features_df, id_column=id_column, timestamp_column=timestamp_column)
|
|
340
314
|
|
|
341
315
|
@classmethod
|
|
342
316
|
def from_path(
|
|
343
317
|
cls,
|
|
344
|
-
path:
|
|
345
|
-
id_column:
|
|
346
|
-
timestamp_column:
|
|
347
|
-
static_features_path:
|
|
318
|
+
path: str | Path,
|
|
319
|
+
id_column: str | None = None,
|
|
320
|
+
timestamp_column: str | None = None,
|
|
321
|
+
static_features_path: str | Path | None = None,
|
|
348
322
|
) -> TimeSeriesDataFrame:
|
|
349
323
|
"""Construct a ``TimeSeriesDataFrame`` from a CSV or Parquet file.
|
|
350
324
|
|
|
@@ -381,7 +355,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
381
355
|
Returns
|
|
382
356
|
-------
|
|
383
357
|
ts_df: TimeSeriesDataFrame
|
|
384
|
-
A
|
|
358
|
+
A dataframe in TimeSeriesDataFrame format.
|
|
385
359
|
"""
|
|
386
360
|
return cls(path, static_features=static_features_path, id_column=id_column, timestamp_column=timestamp_column)
|
|
387
361
|
|
|
@@ -410,22 +384,20 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
410
384
|
Returns
|
|
411
385
|
-------
|
|
412
386
|
ts_df: TimeSeriesDataFrame
|
|
413
|
-
A
|
|
387
|
+
A dataframe in TimeSeriesDataFrame format.
|
|
414
388
|
"""
|
|
415
389
|
return cls(iterable_dataset, num_cpus=num_cpus)
|
|
416
390
|
|
|
417
391
|
@property
|
|
418
392
|
def item_ids(self) -> pd.Index:
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
@property
|
|
422
|
-
def static_features(self):
|
|
423
|
-
return self._static_features
|
|
393
|
+
"""List of unique time series IDs contained in the data set."""
|
|
394
|
+
return self.index.unique(level=self.ITEMID)
|
|
424
395
|
|
|
396
|
+
@classmethod
|
|
425
397
|
def _construct_static_features(
|
|
426
398
|
cls,
|
|
427
|
-
static_features:
|
|
428
|
-
id_column:
|
|
399
|
+
static_features: pd.DataFrame | str | Path,
|
|
400
|
+
id_column: str | None = None,
|
|
429
401
|
) -> pd.DataFrame:
|
|
430
402
|
if isinstance(static_features, (str, Path)):
|
|
431
403
|
static_features = load_pd.load(str(static_features))
|
|
@@ -436,14 +408,20 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
436
408
|
|
|
437
409
|
if id_column is not None:
|
|
438
410
|
assert id_column in static_features.columns, f"Column '{id_column}' not found in static_features!"
|
|
439
|
-
if id_column != ITEMID and ITEMID in static_features.columns:
|
|
440
|
-
logger.warning(
|
|
441
|
-
|
|
442
|
-
|
|
411
|
+
if id_column != cls.ITEMID and cls.ITEMID in static_features.columns:
|
|
412
|
+
logger.warning(
|
|
413
|
+
f"Renaming existing column '{cls.ITEMID}' -> '__{cls.ITEMID}' to avoid name collisions."
|
|
414
|
+
)
|
|
415
|
+
static_features.rename(columns={cls.ITEMID: "__" + cls.ITEMID}, inplace=True)
|
|
416
|
+
static_features.rename(columns={id_column: cls.ITEMID}, inplace=True)
|
|
443
417
|
return static_features
|
|
444
418
|
|
|
419
|
+
@property
|
|
420
|
+
def static_features(self):
|
|
421
|
+
return self._static_features
|
|
422
|
+
|
|
445
423
|
@static_features.setter
|
|
446
|
-
def static_features(self, value:
|
|
424
|
+
def static_features(self, value: pd.DataFrame | None):
|
|
447
425
|
# if the current item index is not a multiindex, then we are dealing with a single
|
|
448
426
|
# item slice. this should only happen when the user explicitly requests only a
|
|
449
427
|
# single item or during `slice_by_timestep`. In this case we do not set static features
|
|
@@ -460,10 +438,10 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
460
438
|
|
|
461
439
|
# Avoid modifying static features inplace
|
|
462
440
|
value = value.copy()
|
|
463
|
-
if ITEMID in value.columns and value.index.name != ITEMID:
|
|
464
|
-
value = value.set_index(ITEMID)
|
|
465
|
-
if value.index.name != ITEMID:
|
|
466
|
-
value.index.rename(ITEMID, inplace=True)
|
|
441
|
+
if self.ITEMID in value.columns and value.index.name != self.ITEMID:
|
|
442
|
+
value = value.set_index(self.ITEMID)
|
|
443
|
+
if value.index.name != self.ITEMID:
|
|
444
|
+
value.index.rename(self.ITEMID, inplace=True)
|
|
467
445
|
missing_item_ids = self.item_ids.difference(value.index)
|
|
468
446
|
if len(missing_item_ids) > 0:
|
|
469
447
|
raise ValueError(
|
|
@@ -476,37 +454,102 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
476
454
|
|
|
477
455
|
self._static_features = value
|
|
478
456
|
|
|
457
|
+
def infer_frequency(self, num_items: int | None = None, raise_if_irregular: bool = False) -> str:
|
|
458
|
+
"""Infer the time series frequency based on the timestamps of the observations.
|
|
459
|
+
|
|
460
|
+
Parameters
|
|
461
|
+
----------
|
|
462
|
+
num_items : int or None, default = None
|
|
463
|
+
Number of items (individual time series) randomly selected to infer the frequency. Lower values speed up
|
|
464
|
+
the method, but increase the chance that some items with invalid frequency are missed by subsampling.
|
|
465
|
+
|
|
466
|
+
If set to ``None``, all items will be used for inferring the frequency.
|
|
467
|
+
raise_if_irregular : bool, default = False
|
|
468
|
+
If True, an exception will be raised if some items have an irregular frequency, or if different items have
|
|
469
|
+
different frequencies.
|
|
470
|
+
|
|
471
|
+
Returns
|
|
472
|
+
-------
|
|
473
|
+
freq : str
|
|
474
|
+
If all time series have a regular frequency, returns a pandas-compatible `frequency alias <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
|
|
475
|
+
|
|
476
|
+
If some items have an irregular frequency or if different items have different frequencies, returns string
|
|
477
|
+
``IRREG``.
|
|
478
|
+
"""
|
|
479
|
+
ts_df = self
|
|
480
|
+
if num_items is not None and ts_df.num_items > num_items:
|
|
481
|
+
items_subset = ts_df.item_ids.to_series().sample(n=num_items, random_state=123)
|
|
482
|
+
ts_df = ts_df.loc[items_subset]
|
|
483
|
+
|
|
484
|
+
if not ts_df.index.is_monotonic_increasing:
|
|
485
|
+
ts_df = ts_df.sort_index()
|
|
486
|
+
|
|
487
|
+
indptr = ts_df.get_indptr()
|
|
488
|
+
item_ids = ts_df.item_ids
|
|
489
|
+
timestamps = ts_df.index.get_level_values(level=1)
|
|
490
|
+
candidate_freq = ts_df.index.levels[1].freq
|
|
491
|
+
|
|
492
|
+
frequencies = []
|
|
493
|
+
irregular_items = []
|
|
494
|
+
for i in range(len(indptr) - 1):
|
|
495
|
+
start, end = indptr[i], indptr[i + 1]
|
|
496
|
+
item_timestamps = timestamps[start:end]
|
|
497
|
+
inferred_freq = item_timestamps.inferred_freq
|
|
498
|
+
|
|
499
|
+
# Fallback option: maybe original index has a `freq` attribute that pandas fails to infer (e.g., 'SME')
|
|
500
|
+
if inferred_freq is None and candidate_freq is not None:
|
|
501
|
+
try:
|
|
502
|
+
# If this line does not raise an exception, then candidate_freq is a compatible frequency
|
|
503
|
+
item_timestamps.freq = candidate_freq
|
|
504
|
+
except ValueError:
|
|
505
|
+
inferred_freq = None
|
|
506
|
+
else:
|
|
507
|
+
inferred_freq = candidate_freq.freqstr
|
|
508
|
+
|
|
509
|
+
if inferred_freq is None:
|
|
510
|
+
irregular_items.append(item_ids[i])
|
|
511
|
+
else:
|
|
512
|
+
frequencies.append(inferred_freq)
|
|
513
|
+
|
|
514
|
+
unique_freqs = list(set(frequencies))
|
|
515
|
+
if len(unique_freqs) != 1 or len(irregular_items) > 0:
|
|
516
|
+
if raise_if_irregular:
|
|
517
|
+
if irregular_items:
|
|
518
|
+
raise ValueError(
|
|
519
|
+
f"Cannot infer frequency. Items with irregular frequency: {reprlib.repr(irregular_items)}"
|
|
520
|
+
)
|
|
521
|
+
else:
|
|
522
|
+
raise ValueError(f"Cannot infer frequency. Multiple frequencies detected: {unique_freqs}")
|
|
523
|
+
else:
|
|
524
|
+
return self.IRREGULAR_TIME_INDEX_FREQSTR
|
|
525
|
+
else:
|
|
526
|
+
return pd.tseries.frequencies.to_offset(unique_freqs[0]).freqstr
|
|
527
|
+
|
|
479
528
|
@property
|
|
480
529
|
def freq(self):
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
# check the frequencies of the first 100 items to see if frequencies are consistent and
|
|
490
|
-
# can be inferred
|
|
491
|
-
freq_for_each_series = [get_freq(self.loc[idx]) for idx in self.item_ids[:100]]
|
|
492
|
-
freq = freq_for_each_series[0]
|
|
493
|
-
if len(set(freq_for_each_series)) > 1 or freq is None:
|
|
494
|
-
self._cached_freq = IRREGULAR_TIME_INDEX_FREQSTR
|
|
495
|
-
return None
|
|
496
|
-
|
|
497
|
-
freq = freq.freqstr if isinstance(freq, pd._libs.tslibs.BaseOffset) else freq
|
|
498
|
-
self._cached_freq = freq
|
|
499
|
-
return freq
|
|
530
|
+
"""Inferred pandas-compatible frequency of the timestamps in the dataframe.
|
|
531
|
+
|
|
532
|
+
Computed using a random subset of the time series for speed. This may sometimes result in incorrectly inferred
|
|
533
|
+
values. For reliable results, use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.infer_frequency`.
|
|
534
|
+
"""
|
|
535
|
+
inferred_freq = self.infer_frequency(num_items=50)
|
|
536
|
+
return None if inferred_freq == self.IRREGULAR_TIME_INDEX_FREQSTR else inferred_freq
|
|
500
537
|
|
|
501
538
|
@property
|
|
502
539
|
def num_items(self):
|
|
540
|
+
"""Number of items (time series) in the data set."""
|
|
503
541
|
return len(self.item_ids)
|
|
504
542
|
|
|
505
543
|
def num_timesteps_per_item(self) -> pd.Series:
|
|
506
|
-
"""
|
|
507
|
-
|
|
544
|
+
"""Number of observations in each time series in the dataframe.
|
|
545
|
+
|
|
546
|
+
Returns a ``pandas.Series`` with ``item_id`` as index and number of observations per item as values.
|
|
547
|
+
"""
|
|
548
|
+
counts = pd.Series(self.index.codes[0]).value_counts(sort=False)
|
|
549
|
+
counts.index = self.index.levels[0][counts.index]
|
|
550
|
+
return counts
|
|
508
551
|
|
|
509
|
-
def copy(self: TimeSeriesDataFrame, deep: bool = True) ->
|
|
552
|
+
def copy(self: TimeSeriesDataFrame, deep: bool = True) -> TimeSeriesDataFrame:
|
|
510
553
|
"""Make a copy of the TimeSeriesDataFrame.
|
|
511
554
|
|
|
512
555
|
When ``deep=True`` (default), a new object will be created with a copy of the calling object's data and
|
|
@@ -527,24 +570,22 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
527
570
|
return obj
|
|
528
571
|
|
|
529
572
|
def __finalize__( # noqa
|
|
530
|
-
self: TimeSeriesDataFrame, other, method:
|
|
573
|
+
self: TimeSeriesDataFrame, other, method: str | None = None, **kwargs
|
|
531
574
|
) -> TimeSeriesDataFrame:
|
|
532
575
|
super().__finalize__(other=other, method=method, **kwargs)
|
|
533
576
|
# when finalizing the copy/slice operation, we use the property setter to stay consistent
|
|
534
577
|
# with the item index
|
|
535
578
|
if hasattr(other, "_static_features"):
|
|
536
579
|
self.static_features = other._static_features
|
|
537
|
-
if hasattr(other, "_cached_freq"):
|
|
538
|
-
self._cached_freq = other._cached_freq
|
|
539
580
|
return self
|
|
540
581
|
|
|
541
|
-
def split_by_time(self, cutoff_time: pd.Timestamp) ->
|
|
582
|
+
def split_by_time(self, cutoff_time: pd.Timestamp) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
|
|
542
583
|
"""Split dataframe to two different ``TimeSeriesDataFrame`` s before and after a certain ``cutoff_time``.
|
|
543
584
|
|
|
544
585
|
Parameters
|
|
545
586
|
----------
|
|
546
587
|
cutoff_time: pd.Timestamp
|
|
547
|
-
The time to split the current
|
|
588
|
+
The time to split the current dataframe into two dataframes.
|
|
548
589
|
|
|
549
590
|
Returns
|
|
550
591
|
-------
|
|
@@ -559,19 +600,16 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
559
600
|
data_after = self.loc[(slice(None), slice(cutoff_time, None)), :]
|
|
560
601
|
before = TimeSeriesDataFrame(data_before, static_features=self.static_features)
|
|
561
602
|
after = TimeSeriesDataFrame(data_after, static_features=self.static_features)
|
|
562
|
-
before._cached_freq = self._cached_freq
|
|
563
|
-
after._cached_freq = self._cached_freq
|
|
564
603
|
return before, after
|
|
565
604
|
|
|
566
|
-
def slice_by_timestep(
|
|
567
|
-
self, start_index: Optional[int] = None, end_index: Optional[int] = None
|
|
568
|
-
) -> TimeSeriesDataFrame:
|
|
605
|
+
def slice_by_timestep(self, start_index: int | None = None, end_index: int | None = None) -> TimeSeriesDataFrame:
|
|
569
606
|
"""Select a subsequence from each time series between start (inclusive) and end (exclusive) indices.
|
|
570
607
|
|
|
571
608
|
This operation is equivalent to selecting a slice ``[start_index : end_index]`` from each time series, and then
|
|
572
609
|
combining these slices into a new ``TimeSeriesDataFrame``. See examples below.
|
|
573
610
|
|
|
574
|
-
|
|
611
|
+
It is recommended to sort the index with ``ts_df.sort_index()`` before calling this method to take advantage of
|
|
612
|
+
a fast optimized algorithm.
|
|
575
613
|
|
|
576
614
|
Parameters
|
|
577
615
|
----------
|
|
@@ -658,11 +696,53 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
658
696
|
if end_index is not None and not isinstance(end_index, int):
|
|
659
697
|
raise ValueError(f"end_index must be of type int or None (got {type(end_index)})")
|
|
660
698
|
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
699
|
+
if start_index is None and end_index is None:
|
|
700
|
+
# Return a copy to avoid in-place modification.
|
|
701
|
+
# self.copy() is much faster than self.loc[ones(len(self), dtype=bool)]
|
|
702
|
+
return self.copy()
|
|
703
|
+
|
|
704
|
+
if self.index.is_monotonic_increasing:
|
|
705
|
+
# Use a fast optimized algorithm if the index is sorted
|
|
706
|
+
indptr = self.get_indptr()
|
|
707
|
+
lengths = np.diff(indptr)
|
|
708
|
+
starts = indptr[:-1]
|
|
709
|
+
|
|
710
|
+
slice_start = (
|
|
711
|
+
np.zeros_like(lengths)
|
|
712
|
+
if start_index is None
|
|
713
|
+
else np.clip(np.where(start_index >= 0, start_index, lengths + start_index), 0, lengths)
|
|
714
|
+
)
|
|
715
|
+
slice_end = (
|
|
716
|
+
lengths.copy()
|
|
717
|
+
if end_index is None
|
|
718
|
+
else np.clip(np.where(end_index >= 0, end_index, lengths + end_index), 0, lengths)
|
|
719
|
+
)
|
|
720
|
+
|
|
721
|
+
# Filter out invalid slices where start >= end
|
|
722
|
+
valid_slices = slice_start < slice_end
|
|
723
|
+
if not np.any(valid_slices):
|
|
724
|
+
# Return empty dataframe with same structure
|
|
725
|
+
return self.loc[np.zeros(len(self), dtype=bool)]
|
|
726
|
+
|
|
727
|
+
starts = starts[valid_slices]
|
|
728
|
+
slice_start = slice_start[valid_slices]
|
|
729
|
+
slice_end = slice_end[valid_slices]
|
|
730
|
+
|
|
731
|
+
# We put 1 at the slice_start index for each item and -1 at the slice_end index for each item.
|
|
732
|
+
# After we apply cumsum we get the indicator mask selecting values between slice_start and slice_end
|
|
733
|
+
# cumsum([0, 0, 1, 0, 0, -1, 0]) -> [0, 0, 1, 1, 1, 0, 0]
|
|
734
|
+
# We need array of size len(self) + 1 in case events[starts + slice_end] tries to access position len(self)
|
|
735
|
+
events = np.zeros(len(self) + 1, dtype=np.int8)
|
|
736
|
+
events[starts + slice_start] += 1
|
|
737
|
+
events[starts + slice_end] -= 1
|
|
738
|
+
mask = np.cumsum(events)[:-1].astype(bool)
|
|
739
|
+
# loc[mask] returns a view of the original data - modifying it will produce a SettingWithCopyWarning
|
|
740
|
+
return self.loc[mask]
|
|
741
|
+
else:
|
|
742
|
+
# Fall back to a slow groupby operation
|
|
743
|
+
result = self.groupby(level=self.ITEMID, sort=False, as_index=False).nth(slice(start_index, end_index))
|
|
744
|
+
result.static_features = self.static_features
|
|
745
|
+
return result
|
|
666
746
|
|
|
667
747
|
def slice_by_time(self, start_time: pd.Timestamp, end_time: pd.Timestamp) -> TimeSeriesDataFrame:
|
|
668
748
|
"""Select a subsequence from each time series between start (inclusive) and end (exclusive) timestamps.
|
|
@@ -691,7 +771,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
691
771
|
|
|
692
772
|
@classmethod
|
|
693
773
|
def from_pickle(cls, filepath_or_buffer: Any) -> TimeSeriesDataFrame:
|
|
694
|
-
"""Convenience method to read pickled time series
|
|
774
|
+
"""Convenience method to read pickled time series dataframes. If the read pickle
|
|
695
775
|
file refers to a plain pandas DataFrame, it will be cast to a TimeSeriesDataFrame.
|
|
696
776
|
|
|
697
777
|
Parameters
|
|
@@ -702,7 +782,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
702
782
|
Returns
|
|
703
783
|
-------
|
|
704
784
|
ts_df : TimeSeriesDataFrame
|
|
705
|
-
The pickled time series
|
|
785
|
+
The pickled time series dataframe.
|
|
706
786
|
"""
|
|
707
787
|
try:
|
|
708
788
|
data = pd.read_pickle(filepath_or_buffer)
|
|
@@ -713,16 +793,21 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
713
793
|
def fill_missing_values(self, method: str = "auto", value: float = 0.0) -> TimeSeriesDataFrame:
|
|
714
794
|
"""Fill missing values represented by NaN.
|
|
715
795
|
|
|
796
|
+
.. note::
|
|
797
|
+
This method assumes that the index of the TimeSeriesDataFrame is sorted by [item_id, timestamp].
|
|
798
|
+
|
|
799
|
+
If the index is not sorted, this method will log a warning and may produce an incorrect result.
|
|
800
|
+
|
|
716
801
|
Parameters
|
|
717
802
|
----------
|
|
718
803
|
method : str, default = "auto"
|
|
719
804
|
Method used to impute missing values.
|
|
720
805
|
|
|
721
|
-
- "auto" - first forward fill (to fill the in-between and trailing NaNs), then backward fill (to fill the leading NaNs)
|
|
722
|
-
- "ffill" or "pad" - propagate last valid observation forward. Note: missing values at the start of the time series are not filled.
|
|
723
|
-
- "bfill" or "backfill" - use next valid observation to fill gap. Note: this may result in information leakage; missing values at the end of the time series are not filled.
|
|
724
|
-
- "constant" - replace NaNs with the given constant ``value``.
|
|
725
|
-
- "interpolate" - fill NaN values using linear interpolation. Note: this may result in information leakage.
|
|
806
|
+
- ``"auto"`` - first forward fill (to fill the in-between and trailing NaNs), then backward fill (to fill the leading NaNs)
|
|
807
|
+
- ``"ffill"`` or ``"pad"`` - propagate last valid observation forward. Note: missing values at the start of the time series are not filled.
|
|
808
|
+
- ``"bfill"`` or ``"backfill"`` - use next valid observation to fill gap. Note: this may result in information leakage; missing values at the end of the time series are not filled.
|
|
809
|
+
- ``"constant"`` - replace NaNs with the given constant ``value``.
|
|
810
|
+
- ``"interpolate"`` - fill NaN values using linear interpolation. Note: this may result in information leakage.
|
|
726
811
|
value : float, default = 0.0
|
|
727
812
|
Value used by the "constant" imputation method.
|
|
728
813
|
|
|
@@ -759,17 +844,25 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
759
844
|
2019-02-07 4.0
|
|
760
845
|
|
|
761
846
|
"""
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
847
|
+
# Convert to pd.DataFrame for faster processing
|
|
848
|
+
df = pd.DataFrame(self)
|
|
849
|
+
|
|
850
|
+
# Skip filling if there are no NaNs
|
|
851
|
+
if not df.isna().any(axis=None):
|
|
852
|
+
return self
|
|
853
|
+
|
|
854
|
+
if not self.index.is_monotonic_increasing:
|
|
855
|
+
logger.warning(
|
|
856
|
+
"Trying to fill missing values in an unsorted dataframe. "
|
|
857
|
+
"It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`"
|
|
766
858
|
)
|
|
767
859
|
|
|
768
|
-
grouped_df =
|
|
860
|
+
grouped_df = df.groupby(level=self.ITEMID, sort=False, group_keys=False)
|
|
769
861
|
if method == "auto":
|
|
770
862
|
filled_df = grouped_df.ffill()
|
|
771
|
-
#
|
|
772
|
-
|
|
863
|
+
# If necessary, fill missing values at the start of each time series with bfill
|
|
864
|
+
if filled_df.isna().any(axis=None):
|
|
865
|
+
filled_df = filled_df.groupby(level=self.ITEMID, sort=False, group_keys=False).bfill()
|
|
773
866
|
elif method in ["ffill", "pad"]:
|
|
774
867
|
filled_df = grouped_df.ffill()
|
|
775
868
|
elif method in ["bfill", "backfill"]:
|
|
@@ -786,7 +879,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
786
879
|
)
|
|
787
880
|
return TimeSeriesDataFrame(filled_df, static_features=self.static_features)
|
|
788
881
|
|
|
789
|
-
def dropna(self, how: str = "any") -> TimeSeriesDataFrame:
|
|
882
|
+
def dropna(self, how: str = "any") -> TimeSeriesDataFrame: # type: ignore[override]
|
|
790
883
|
"""Drop rows containing NaNs.
|
|
791
884
|
|
|
792
885
|
Parameters
|
|
@@ -802,18 +895,27 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
802
895
|
dropped_df = pd.DataFrame(self).dropna(how=how)
|
|
803
896
|
return TimeSeriesDataFrame(dropped_df, static_features=self.static_features)
|
|
804
897
|
|
|
898
|
+
# added for static type checker compatibility
|
|
899
|
+
def assign(self, **kwargs) -> TimeSeriesDataFrame:
|
|
900
|
+
"""Assign new columns to the time series dataframe. See :meth:`pandas.DataFrame.assign` for details."""
|
|
901
|
+
return super().assign(**kwargs) # type: ignore
|
|
902
|
+
|
|
903
|
+
# added for static type checker compatibility
|
|
904
|
+
def sort_index(self, *args, **kwargs) -> TimeSeriesDataFrame:
|
|
905
|
+
return super().sort_index(*args, **kwargs) # type: ignore
|
|
906
|
+
|
|
805
907
|
def get_model_inputs_for_scoring(
|
|
806
|
-
self, prediction_length: int, known_covariates_names:
|
|
807
|
-
) ->
|
|
908
|
+
self, prediction_length: int, known_covariates_names: list[str] | None = None
|
|
909
|
+
) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame | None]:
|
|
808
910
|
"""Prepare model inputs necessary to predict the last ``prediction_length`` time steps of each time series in the dataset.
|
|
809
911
|
|
|
810
912
|
Parameters
|
|
811
913
|
----------
|
|
812
914
|
prediction_length : int
|
|
813
915
|
The forecast horizon, i.e., How many time steps into the future must be predicted.
|
|
814
|
-
known_covariates_names :
|
|
916
|
+
known_covariates_names : list[str], optional
|
|
815
917
|
Names of the dataframe columns that contain covariates known in the future.
|
|
816
|
-
See
|
|
918
|
+
See ``known_covariates_names`` of :class:`~autogluon.timeseries.TimeSeriesPredictor` for more details.
|
|
817
919
|
|
|
818
920
|
Returns
|
|
819
921
|
-------
|
|
@@ -834,12 +936,16 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
834
936
|
def train_test_split(
|
|
835
937
|
self,
|
|
836
938
|
prediction_length: int,
|
|
837
|
-
end_index:
|
|
838
|
-
suffix:
|
|
839
|
-
) ->
|
|
939
|
+
end_index: int | None = None,
|
|
940
|
+
suffix: str | None = None,
|
|
941
|
+
) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
|
|
840
942
|
"""Generate a train/test split from the given dataset.
|
|
943
|
+
|
|
841
944
|
This method can be used to generate splits for multi-window backtesting.
|
|
842
945
|
|
|
946
|
+
.. note::
|
|
947
|
+
This method automatically sorts the TimeSeriesDataFrame by [item_id, timestamp].
|
|
948
|
+
|
|
843
949
|
Parameters
|
|
844
950
|
----------
|
|
845
951
|
prediction_length : int
|
|
@@ -858,7 +964,11 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
858
964
|
test_data : TimeSeriesDataFrame
|
|
859
965
|
Test portion of the data. Contains the slice ``[:end_idx]`` of each time series in the original dataset.
|
|
860
966
|
"""
|
|
861
|
-
|
|
967
|
+
df = self
|
|
968
|
+
if not df.index.is_monotonic_increasing:
|
|
969
|
+
logger.warning("Sorting the dataframe index before generating the train/test split.")
|
|
970
|
+
df = df.sort_index()
|
|
971
|
+
test_data = df.slice_by_timestep(None, end_index)
|
|
862
972
|
train_data = test_data.slice_by_timestep(None, -prediction_length)
|
|
863
973
|
|
|
864
974
|
if suffix is not None:
|
|
@@ -872,14 +982,14 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
872
982
|
|
|
873
983
|
def convert_frequency(
|
|
874
984
|
self,
|
|
875
|
-
freq:
|
|
985
|
+
freq: str | pd.DateOffset,
|
|
876
986
|
agg_numeric: str = "mean",
|
|
877
987
|
agg_categorical: str = "first",
|
|
878
988
|
num_cpus: int = -1,
|
|
879
989
|
chunk_size: int = 100,
|
|
880
990
|
**kwargs,
|
|
881
991
|
) -> TimeSeriesDataFrame:
|
|
882
|
-
"""Convert each time series in the
|
|
992
|
+
"""Convert each time series in the dataframe to the given frequency.
|
|
883
993
|
|
|
884
994
|
This method is useful for two purposes:
|
|
885
995
|
|
|
@@ -889,10 +999,9 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
889
999
|
Standard ``df.groupby(...).resample(...)`` can be extremely slow for large datasets, so we parallelize this
|
|
890
1000
|
operation across multiple CPU cores.
|
|
891
1001
|
|
|
892
|
-
|
|
893
1002
|
Parameters
|
|
894
1003
|
----------
|
|
895
|
-
freq :
|
|
1004
|
+
freq : str | pd.DateOffset
|
|
896
1005
|
Frequency to which the data should be converted. See `pandas frequency aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
|
|
897
1006
|
for supported values.
|
|
898
1007
|
agg_numeric : {"max", "min", "sum", "mean", "median", "first", "last"}, default = "mean"
|
|
@@ -953,20 +1062,18 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
953
1062
|
2021-06-30 6.0
|
|
954
1063
|
2021-09-30 7.0
|
|
955
1064
|
2021-12-31 8.0
|
|
956
|
-
>>> ts_df.convert_frequency("
|
|
1065
|
+
>>> ts_df.convert_frequency("YE")
|
|
957
1066
|
target
|
|
958
1067
|
item_id timestamp
|
|
959
1068
|
0 2020-12-31 2.5
|
|
960
1069
|
2021-12-31 6.5
|
|
961
|
-
>>> ts_df.convert_frequency("
|
|
1070
|
+
>>> ts_df.convert_frequency("YE", agg_numeric="sum")
|
|
962
1071
|
target
|
|
963
1072
|
item_id timestamp
|
|
964
1073
|
0 2020-12-31 10.0
|
|
965
1074
|
2021-12-31 26.0
|
|
966
1075
|
"""
|
|
967
1076
|
offset = pd.tseries.frequencies.to_offset(freq)
|
|
968
|
-
if self.freq == offset.freqstr:
|
|
969
|
-
return self
|
|
970
1077
|
|
|
971
1078
|
# We need to aggregate categorical columns separately because .agg("mean") deletes all non-numeric columns
|
|
972
1079
|
aggregation = {}
|
|
@@ -981,22 +1088,68 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
|
|
|
981
1088
|
iterable = iter(iterable)
|
|
982
1089
|
return iter(lambda: tuple(islice(iterable, size)), ())
|
|
983
1090
|
|
|
984
|
-
def resample_chunk(chunk: Iterable[
|
|
1091
|
+
def resample_chunk(chunk: Iterable[tuple[str, pd.DataFrame]]) -> pd.DataFrame:
|
|
985
1092
|
resampled_dfs = []
|
|
986
1093
|
for item_id, df in chunk:
|
|
987
|
-
resampled_df = df.resample(offset, level=TIMESTAMP, **kwargs).agg(aggregation)
|
|
988
|
-
resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[ITEMID]))
|
|
1094
|
+
resampled_df = df.resample(offset, level=self.TIMESTAMP, **kwargs).agg(aggregation)
|
|
1095
|
+
resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[self.ITEMID]))
|
|
989
1096
|
return pd.concat(resampled_dfs)
|
|
990
1097
|
|
|
991
1098
|
# Resampling time for 1 item < overhead time for a single parallel job. Therefore, we group items into chunks
|
|
992
1099
|
# so that the speedup from parallelization isn't dominated by the communication costs.
|
|
993
|
-
|
|
1100
|
+
df = pd.DataFrame(self)
|
|
1101
|
+
# Make sure that timestamp index has dtype 'datetime64[ns]', otherwise index may contain NaT values.
|
|
1102
|
+
# See https://github.com/autogluon/autogluon/issues/4917
|
|
1103
|
+
df.index = df.index.set_levels(df.index.levels[1].astype("datetime64[ns]"), level=self.TIMESTAMP)
|
|
1104
|
+
chunks = split_into_chunks(df.groupby(level=self.ITEMID, sort=False), chunk_size)
|
|
994
1105
|
resampled_chunks = Parallel(n_jobs=num_cpus)(delayed(resample_chunk)(chunk) for chunk in chunks)
|
|
995
1106
|
resampled_df = TimeSeriesDataFrame(pd.concat(resampled_chunks))
|
|
996
1107
|
resampled_df.static_features = self.static_features
|
|
997
1108
|
return resampled_df
|
|
998
1109
|
|
|
999
|
-
def
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1110
|
+
def to_data_frame(self) -> pd.DataFrame:
|
|
1111
|
+
"""Convert ``TimeSeriesDataFrame`` to a ``pandas.DataFrame``"""
|
|
1112
|
+
return pd.DataFrame(self)
|
|
1113
|
+
|
|
1114
|
+
def get_indptr(self) -> np.ndarray:
|
|
1115
|
+
"""[Advanced] Get a numpy array of shape [num_items + 1] that points to the start and end of each time series.
|
|
1116
|
+
|
|
1117
|
+
This method assumes that the TimeSeriesDataFrame is sorted by [item_id, timestamp].
|
|
1118
|
+
"""
|
|
1119
|
+
return np.concatenate([[0], np.cumsum(self.num_timesteps_per_item().to_numpy())]).astype(np.int32)
|
|
1120
|
+
|
|
1121
|
+
# inline typing stubs for various overridden methods
|
|
1122
|
+
if TYPE_CHECKING:
|
|
1123
|
+
|
|
1124
|
+
def query( # type: ignore
|
|
1125
|
+
self, expr: str, *, inplace: bool = False, **kwargs
|
|
1126
|
+
) -> Self: ...
|
|
1127
|
+
|
|
1128
|
+
def reindex(*args, **kwargs) -> Self: ... # type: ignore
|
|
1129
|
+
|
|
1130
|
+
@overload
|
|
1131
|
+
def __new__(cls, data: pd.DataFrame, static_features: pd.DataFrame | None = None) -> Self: ... # type: ignore
|
|
1132
|
+
@overload
|
|
1133
|
+
def __new__(
|
|
1134
|
+
cls,
|
|
1135
|
+
data: pd.DataFrame | str | Path | Iterable,
|
|
1136
|
+
static_features: pd.DataFrame | str | Path | None = None,
|
|
1137
|
+
id_column: str | None = None,
|
|
1138
|
+
timestamp_column: str | None = None,
|
|
1139
|
+
num_cpus: int = -1,
|
|
1140
|
+
*args,
|
|
1141
|
+
**kwargs,
|
|
1142
|
+
) -> Self:
|
|
1143
|
+
"""This overload is needed since in pandas, during type checking, the default constructor resolves to __new__"""
|
|
1144
|
+
...
|
|
1145
|
+
|
|
1146
|
+
@overload
|
|
1147
|
+
def __getitem__(self, items: list[str]) -> Self: ... # type: ignore
|
|
1148
|
+
@overload
|
|
1149
|
+
def __getitem__(self, item: str) -> pd.Series: ... # type: ignore
|
|
1150
|
+
|
|
1151
|
+
|
|
1152
|
+
# TODO: remove with v2.0
|
|
1153
|
+
# module-level constants kept for backward compatibility.
|
|
1154
|
+
ITEMID = TimeSeriesDataFrame.ITEMID
|
|
1155
|
+
TIMESTAMP = TimeSeriesDataFrame.TIMESTAMP
|