autogluon.timeseries 1.0.1b20240304__py3-none-any.whl → 1.4.1b20251210__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in that registry.

Potentially problematic release: this version of autogluon.timeseries might be problematic.
Files changed (108)
  1. autogluon/timeseries/configs/__init__.py +3 -2
  2. autogluon/timeseries/configs/hyperparameter_presets.py +62 -0
  3. autogluon/timeseries/configs/predictor_presets.py +84 -0
  4. autogluon/timeseries/dataset/ts_dataframe.py +339 -186
  5. autogluon/timeseries/learner.py +192 -60
  6. autogluon/timeseries/metrics/__init__.py +55 -11
  7. autogluon/timeseries/metrics/abstract.py +96 -25
  8. autogluon/timeseries/metrics/point.py +186 -39
  9. autogluon/timeseries/metrics/quantile.py +47 -20
  10. autogluon/timeseries/metrics/utils.py +6 -6
  11. autogluon/timeseries/models/__init__.py +13 -7
  12. autogluon/timeseries/models/abstract/__init__.py +2 -2
  13. autogluon/timeseries/models/abstract/abstract_timeseries_model.py +533 -273
  14. autogluon/timeseries/models/abstract/model_trial.py +10 -10
  15. autogluon/timeseries/models/abstract/tunable.py +189 -0
  16. autogluon/timeseries/models/autogluon_tabular/__init__.py +2 -0
  17. autogluon/timeseries/models/autogluon_tabular/mlforecast.py +369 -215
  18. autogluon/timeseries/models/autogluon_tabular/per_step.py +513 -0
  19. autogluon/timeseries/models/autogluon_tabular/transforms.py +67 -0
  20. autogluon/timeseries/models/autogluon_tabular/utils.py +3 -51
  21. autogluon/timeseries/models/chronos/__init__.py +4 -0
  22. autogluon/timeseries/models/chronos/chronos2.py +361 -0
  23. autogluon/timeseries/models/chronos/model.py +738 -0
  24. autogluon/timeseries/models/chronos/utils.py +369 -0
  25. autogluon/timeseries/models/ensemble/__init__.py +35 -2
  26. autogluon/timeseries/models/ensemble/{abstract_timeseries_ensemble.py → abstract.py} +50 -26
  27. autogluon/timeseries/models/ensemble/array_based/__init__.py +3 -0
  28. autogluon/timeseries/models/ensemble/array_based/abstract.py +236 -0
  29. autogluon/timeseries/models/ensemble/array_based/models.py +73 -0
  30. autogluon/timeseries/models/ensemble/array_based/regressor/__init__.py +12 -0
  31. autogluon/timeseries/models/ensemble/array_based/regressor/abstract.py +88 -0
  32. autogluon/timeseries/models/ensemble/array_based/regressor/linear_stacker.py +167 -0
  33. autogluon/timeseries/models/ensemble/array_based/regressor/per_quantile_tabular.py +94 -0
  34. autogluon/timeseries/models/ensemble/array_based/regressor/tabular.py +107 -0
  35. autogluon/timeseries/models/ensemble/ensemble_selection.py +167 -0
  36. autogluon/timeseries/models/ensemble/per_item_greedy.py +162 -0
  37. autogluon/timeseries/models/ensemble/weighted/__init__.py +8 -0
  38. autogluon/timeseries/models/ensemble/weighted/abstract.py +40 -0
  39. autogluon/timeseries/models/ensemble/weighted/basic.py +78 -0
  40. autogluon/timeseries/models/ensemble/weighted/greedy.py +57 -0
  41. autogluon/timeseries/models/gluonts/__init__.py +3 -1
  42. autogluon/timeseries/models/gluonts/abstract.py +583 -0
  43. autogluon/timeseries/models/gluonts/dataset.py +109 -0
  44. autogluon/timeseries/models/gluonts/{torch/models.py → models.py} +185 -44
  45. autogluon/timeseries/models/local/__init__.py +1 -10
  46. autogluon/timeseries/models/local/abstract_local_model.py +150 -97
  47. autogluon/timeseries/models/local/naive.py +31 -23
  48. autogluon/timeseries/models/local/npts.py +6 -2
  49. autogluon/timeseries/models/local/statsforecast.py +99 -112
  50. autogluon/timeseries/models/multi_window/multi_window_model.py +99 -40
  51. autogluon/timeseries/models/registry.py +64 -0
  52. autogluon/timeseries/models/toto/__init__.py +3 -0
  53. autogluon/timeseries/models/toto/_internal/__init__.py +9 -0
  54. autogluon/timeseries/models/toto/_internal/backbone/__init__.py +3 -0
  55. autogluon/timeseries/models/toto/_internal/backbone/attention.py +196 -0
  56. autogluon/timeseries/models/toto/_internal/backbone/backbone.py +262 -0
  57. autogluon/timeseries/models/toto/_internal/backbone/distribution.py +70 -0
  58. autogluon/timeseries/models/toto/_internal/backbone/kvcache.py +136 -0
  59. autogluon/timeseries/models/toto/_internal/backbone/rope.py +89 -0
  60. autogluon/timeseries/models/toto/_internal/backbone/rotary_embedding_torch.py +342 -0
  61. autogluon/timeseries/models/toto/_internal/backbone/scaler.py +305 -0
  62. autogluon/timeseries/models/toto/_internal/backbone/transformer.py +333 -0
  63. autogluon/timeseries/models/toto/_internal/dataset.py +165 -0
  64. autogluon/timeseries/models/toto/_internal/forecaster.py +423 -0
  65. autogluon/timeseries/models/toto/dataloader.py +108 -0
  66. autogluon/timeseries/models/toto/hf_pretrained_model.py +118 -0
  67. autogluon/timeseries/models/toto/model.py +236 -0
  68. autogluon/timeseries/predictor.py +826 -305
  69. autogluon/timeseries/regressor.py +253 -0
  70. autogluon/timeseries/splitter.py +10 -31
  71. autogluon/timeseries/trainer/__init__.py +2 -3
  72. autogluon/timeseries/trainer/ensemble_composer.py +439 -0
  73. autogluon/timeseries/trainer/model_set_builder.py +256 -0
  74. autogluon/timeseries/trainer/prediction_cache.py +149 -0
  75. autogluon/timeseries/trainer/trainer.py +1298 -0
  76. autogluon/timeseries/trainer/utils.py +17 -0
  77. autogluon/timeseries/transforms/__init__.py +2 -0
  78. autogluon/timeseries/transforms/covariate_scaler.py +164 -0
  79. autogluon/timeseries/transforms/target_scaler.py +149 -0
  80. autogluon/timeseries/utils/constants.py +10 -0
  81. autogluon/timeseries/utils/datetime/base.py +38 -20
  82. autogluon/timeseries/utils/datetime/lags.py +18 -16
  83. autogluon/timeseries/utils/datetime/seasonality.py +14 -14
  84. autogluon/timeseries/utils/datetime/time_features.py +17 -14
  85. autogluon/timeseries/utils/features.py +317 -53
  86. autogluon/timeseries/utils/forecast.py +31 -17
  87. autogluon/timeseries/utils/timer.py +173 -0
  88. autogluon/timeseries/utils/warning_filters.py +44 -6
  89. autogluon/timeseries/version.py +2 -1
  90. autogluon.timeseries-1.4.1b20251210-py3.11-nspkg.pth +1 -0
  91. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/METADATA +71 -47
  92. autogluon_timeseries-1.4.1b20251210.dist-info/RECORD +103 -0
  93. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/WHEEL +1 -1
  94. autogluon/timeseries/configs/presets_configs.py +0 -11
  95. autogluon/timeseries/evaluator.py +0 -6
  96. autogluon/timeseries/models/ensemble/greedy_ensemble.py +0 -170
  97. autogluon/timeseries/models/gluonts/abstract_gluonts.py +0 -550
  98. autogluon/timeseries/models/gluonts/torch/__init__.py +0 -0
  99. autogluon/timeseries/models/presets.py +0 -325
  100. autogluon/timeseries/trainer/abstract_trainer.py +0 -1144
  101. autogluon/timeseries/trainer/auto_trainer.py +0 -74
  102. autogluon.timeseries-1.0.1b20240304-py3.8-nspkg.pth +0 -1
  103. autogluon.timeseries-1.0.1b20240304.dist-info/RECORD +0 -58
  104. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/LICENSE +0 -0
  105. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/NOTICE +0 -0
  106. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/namespace_packages.txt +0 -0
  107. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/top_level.txt +0 -0
  108. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/zip-safe +0 -0
@@ -7,42 +7,23 @@ import reprlib
  from collections.abc import Iterable
  from itertools import islice
  from pathlib import Path
- from typing import Any, List, Optional, Tuple, Type, Union
+ from typing import TYPE_CHECKING, Any, Final, Type, overload
 
+ import numpy as np
  import pandas as pd
  from joblib.parallel import Parallel, delayed
- from pandas.core.internals import ArrayManager, BlockManager
+ from pandas.core.internals import ArrayManager, BlockManager  # type: ignore
+ from typing_extensions import Self
 
  from autogluon.common.loaders import load_pd
 
  logger = logging.getLogger(__name__)
 
- ITEMID = "item_id"
- TIMESTAMP = "timestamp"
 
- IRREGULAR_TIME_INDEX_FREQSTR = "IRREG"
-
-
- class TimeSeriesDataFrameDeprecatedMixin:
-     """Contains deprecated methods from TimeSeriesDataFrame that shouldn't show up in API documentation."""
-
-     def get_reindexed_view(self, *args, **kwargs) -> TimeSeriesDataFrame:
-         raise ValueError(
-             "`TimeSeriesDataFrame.get_reindexed_view` has been deprecated. If your data has irregular timestamps, "
-             "please convert it to a regular frequency with `convert_frequency`."
-         )
-
-     def to_regular_index(self, *args, **kwargs) -> TimeSeriesDataFrame:
-         raise ValueError(
-             "`TimeSeriesDataFrame.to_regular_index` has been deprecated. "
-             "Please use `TimeSeriesDataFrame.convert_frequency` instead."
-         )
-
-
- class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
+ class TimeSeriesDataFrame(pd.DataFrame):
      """A collection of univariate time series, where each row is identified by an (``item_id``, ``timestamp``) pair.
 
-     For example, a time series data frame could represent the daily sales of a collection of products, where each
+     For example, a time series dataframe could represent the daily sales of a collection of products, where each
      ``item_id`` corresponds to a product and ``timestamp`` corresponds to the day of the record.
 
      Parameters
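
A minimal construction sketch for the class shown above (illustrative, made-up values; the long-format layout with ``item_id`` and ``timestamp`` columns is the one described in the class docstring, and ``TimeSeriesDataFrame`` is importable from ``autogluon.timeseries``):

    >>> import pandas as pd
    >>> from autogluon.timeseries import TimeSeriesDataFrame
    >>> df = pd.DataFrame({
    ...     "item_id": ["A", "A", "B", "B"],
    ...     "timestamp": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-01", "2024-01-02"]),
    ...     "target": [1.0, 2.0, 3.0, 4.0],
    ... })
    >>> ts_df = TimeSeriesDataFrame(df)  # rows become indexed by (item_id, timestamp)
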
@@ -92,7 +73,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
      You can also use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.from_iterable_dataset` for loading data in such format.
 
      static_features : pd.DataFrame, str or pathlib.Path, optional
-         An optional data frame describing the metadata of each individual time series that does not change with time.
+         An optional dataframe describing the metadata of each individual time series that does not change with time.
          Can take real-valued or categorical values. For example, if ``TimeSeriesDataFrame`` contains sales of various
          products, static features may refer to time-independent features like color or brand.
 
@@ -130,27 +111,21 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          Number of CPU cores used to process the iterable dataset in parallel. Set to -1 to use all cores. This argument
          is only used when constructing a TimeSeriesDataFrame using format 4 (iterable dataset).
 
-     Attributes
-     ----------
-     freq : str
-         A pandas-compatible string describing the frequency of the time series. For example ``"D"`` for daily data,
-         ``"H"`` for hourly data, etc. This attribute is determined automatically based on the timestamps. For the full
-         list of possible values, see `pandas documentation <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
-     num_items : int
-         Number of items (time series) in the data set.
-     item_ids : pd.Index
-         List of unique time series IDs contained in the data set.
      """
 
-     index: pd.MultiIndex
-     _metadata = ["_static_features", "_cached_freq"]
+     index: pd.MultiIndex  # type: ignore
+     _metadata = ["_static_features"]
+
+     IRREGULAR_TIME_INDEX_FREQSTR: Final[str] = "IRREG"
+     ITEMID: Final[str] = "item_id"
+     TIMESTAMP: Final[str] = "timestamp"
 
      def __init__(
          self,
-         data: Union[pd.DataFrame, str, Path, Iterable],
-         static_features: Optional[Union[pd.DataFrame, str, Path]] = None,
-         id_column: Optional[str] = None,
-         timestamp_column: Optional[str] = None,
+         data: pd.DataFrame | str | Path | Iterable,
+         static_features: pd.DataFrame | str | Path | None = None,
+         id_column: str | None = None,
+         timestamp_column: str | None = None,
          num_cpus: int = -1,
          *args,
          **kwargs,
@@ -173,17 +148,11 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
              data = self._construct_tsdf_from_iterable_dataset(data, num_cpus=num_cpus)
          else:
              raise ValueError(f"data must be a pd.DataFrame, Iterable, string or Path (received {type(data)}).")
-         super().__init__(data=data, *args, **kwargs)
-         self._static_features: Optional[pd.DataFrame] = None
+         super().__init__(data=data, *args, **kwargs)  # type: ignore
+         self._static_features: pd.DataFrame | None = None
          if static_features is not None:
              self.static_features = self._construct_static_features(static_features, id_column=id_column)
 
-         # internal value for cached frequency values that are inferred. corresponds to either a
-         # pandas-compatible frequency string, the value IRREGULAR_TIME_INDEX_FREQSTR that signals
-         # the time series have irregular timestamps (in which case tsdf.freq returns None), or None
-         # if inference was not yet performed.
-         self._cached_freq: Optional[str] = None
-
      @property
      def _constructor(self) -> Type[TimeSeriesDataFrame]:
          return TimeSeriesDataFrame
@@ -193,36 +162,39 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          # repeatedly calling TimeSeriesDataFrame constructor
          df = self._from_mgr(mgr, axes=axes)
          df._static_features = self._static_features
-         df._cached_freq = self._cached_freq
          return df
 
      @classmethod
      def _construct_tsdf_from_data_frame(
          cls,
          df: pd.DataFrame,
-         id_column: Optional[str] = None,
-         timestamp_column: Optional[str] = None,
+         id_column: str | None = None,
+         timestamp_column: str | None = None,
      ) -> pd.DataFrame:
          df = df.copy()
          if id_column is not None:
              assert id_column in df.columns, f"Column '{id_column}' not found!"
-             if id_column != ITEMID and ITEMID in df.columns:
-                 logger.warning(f"Renaming existing column '{ITEMID}' -> '__{ITEMID}' to avoid name collisions.")
-                 df.rename(columns={ITEMID: "__" + ITEMID}, inplace=True)
-             df.rename(columns={id_column: ITEMID}, inplace=True)
+             if id_column != cls.ITEMID and cls.ITEMID in df.columns:
+                 logger.warning(
+                     f"Renaming existing column '{cls.ITEMID}' -> '__{cls.ITEMID}' to avoid name collisions."
+                 )
+                 df.rename(columns={cls.ITEMID: "__" + cls.ITEMID}, inplace=True)
+             df.rename(columns={id_column: cls.ITEMID}, inplace=True)
 
          if timestamp_column is not None:
              assert timestamp_column in df.columns, f"Column '{timestamp_column}' not found!"
-             if timestamp_column != TIMESTAMP and TIMESTAMP in df.columns:
-                 logger.warning(f"Renaming existing column '{TIMESTAMP}' -> '__{TIMESTAMP}' to avoid name collisions.")
-                 df.rename(columns={TIMESTAMP: "__" + TIMESTAMP}, inplace=True)
-             df.rename(columns={timestamp_column: TIMESTAMP}, inplace=True)
+             if timestamp_column != cls.TIMESTAMP and cls.TIMESTAMP in df.columns:
+                 logger.warning(
+                     f"Renaming existing column '{cls.TIMESTAMP}' -> '__{cls.TIMESTAMP}' to avoid name collisions."
+                 )
+                 df.rename(columns={cls.TIMESTAMP: "__" + cls.TIMESTAMP}, inplace=True)
+             df.rename(columns={timestamp_column: cls.TIMESTAMP}, inplace=True)
 
-         if TIMESTAMP in df.columns:
-             df[TIMESTAMP] = pd.to_datetime(df[TIMESTAMP])
+         if cls.TIMESTAMP in df.columns:
+             df[cls.TIMESTAMP] = pd.to_datetime(df[cls.TIMESTAMP])
 
          cls._validate_data_frame(df)
-         return df.set_index([ITEMID, TIMESTAMP])
+         return df.set_index([cls.ITEMID, cls.TIMESTAMP])
 
      @classmethod
      def _construct_tsdf_from_iterable_dataset(cls, iterable_dataset: Iterable, num_cpus: int = -1) -> pd.DataFrame:
@@ -233,7 +205,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
              start_timestamp = start_timestamp.to_timestamp(how="S")
              target = ts["target"]
              datetime_index = tuple(pd.date_range(start_timestamp, periods=len(target), freq=freq))
-             idx = pd.MultiIndex.from_product([(item_id,), datetime_index], names=[ITEMID, TIMESTAMP])
+             idx = pd.MultiIndex.from_product([(item_id,), datetime_index], names=[cls.ITEMID, cls.TIMESTAMP])
              return pd.Series(target, name="target", index=idx).to_frame()
 
          cls._validate_iterable(iterable_dataset)
@@ -250,32 +222,34 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
              raise ValueError(f"data must be a pd.DataFrame, got {type(data)}")
          if not isinstance(data.index, pd.MultiIndex):
              raise ValueError(f"data must have pd.MultiIndex, got {type(data.index)}")
-         if not pd.api.types.is_datetime64_dtype(data.index.dtypes[TIMESTAMP]):
-             raise ValueError(f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
-         if not data.index.names == (f"{ITEMID}", f"{TIMESTAMP}"):
-             raise ValueError(f"data must have index names as ('{ITEMID}', '{TIMESTAMP}'), got {data.index.names}")
-         item_id_index = data.index.get_level_values(level=ITEMID)
+         if not pd.api.types.is_datetime64_dtype(data.index.dtypes[cls.TIMESTAMP]):
+             raise ValueError(f"for {cls.TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
+         if not data.index.names == (f"{cls.ITEMID}", f"{cls.TIMESTAMP}"):
+             raise ValueError(
+                 f"data must have index names as ('{cls.ITEMID}', '{cls.TIMESTAMP}'), got {data.index.names}"
+             )
+         item_id_index = data.index.levels[0]
          if not (pd.api.types.is_integer_dtype(item_id_index) or pd.api.types.is_string_dtype(item_id_index)):
-             raise ValueError(f"all entries in index `{ITEMID}` must be of integer or string dtype")
+             raise ValueError(f"all entries in index `{cls.ITEMID}` must be of integer or string dtype")
 
      @classmethod
      def _validate_data_frame(cls, df: pd.DataFrame):
          """Validate that a pd.DataFrame with ITEMID and TIMESTAMP columns can be converted to TimeSeriesDataFrame"""
          if not isinstance(df, pd.DataFrame):
              raise ValueError(f"data must be a pd.DataFrame, got {type(df)}")
-         if ITEMID not in df.columns:
-             raise ValueError(f"data must have a `{ITEMID}` column")
-         if TIMESTAMP not in df.columns:
-             raise ValueError(f"data must have a `{TIMESTAMP}` column")
-         if df[ITEMID].isnull().any():
-             raise ValueError(f"`{ITEMID}` column can not have nan")
-         if df[TIMESTAMP].isnull().any():
-             raise ValueError(f"`{TIMESTAMP}` column can not have nan")
-         if not pd.api.types.is_datetime64_dtype(df[TIMESTAMP]):
-             raise ValueError(f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
-         item_id_column = df[ITEMID]
+         if cls.ITEMID not in df.columns:
+             raise ValueError(f"data must have a `{cls.ITEMID}` column")
+         if cls.TIMESTAMP not in df.columns:
+             raise ValueError(f"data must have a `{cls.TIMESTAMP}` column")
+         if df[cls.ITEMID].isnull().any():
+             raise ValueError(f"`{cls.ITEMID}` column can not have nan")
+         if df[cls.TIMESTAMP].isnull().any():
+             raise ValueError(f"`{cls.TIMESTAMP}` column can not have nan")
+         if not pd.api.types.is_datetime64_dtype(df[cls.TIMESTAMP]):
+             raise ValueError(f"for {cls.TIMESTAMP}, the only pandas dtype allowed is `datetime64`.")
+         item_id_column = df[cls.ITEMID]
          if not (pd.api.types.is_integer_dtype(item_id_column) or pd.api.types.is_string_dtype(item_id_column)):
-             raise ValueError(f"all entries in column `{ITEMID}` must be of integer or string dtype")
+             raise ValueError(f"all entries in column `{cls.ITEMID}` must be of integer or string dtype")
 
      @classmethod
      def _validate_iterable(cls, data: Iterable):
@@ -298,9 +272,9 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
      def from_data_frame(
          cls,
          df: pd.DataFrame,
-         id_column: Optional[str] = None,
-         timestamp_column: Optional[str] = None,
-         static_features_df: Optional[pd.DataFrame] = None,
+         id_column: str | None = None,
+         timestamp_column: str | None = None,
+         static_features_df: pd.DataFrame | None = None,
      ) -> TimeSeriesDataFrame:
          """Construct a ``TimeSeriesDataFrame`` from a pandas DataFrame.
 
@@ -334,17 +308,17 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          Returns
          -------
          ts_df: TimeSeriesDataFrame
-             A data frame in TimeSeriesDataFrame format.
+             A dataframe in TimeSeriesDataFrame format.
          """
          return cls(df, static_features=static_features_df, id_column=id_column, timestamp_column=timestamp_column)
 
      @classmethod
      def from_path(
          cls,
-         path: Union[str, Path],
-         id_column: Optional[str] = None,
-         timestamp_column: Optional[str] = None,
-         static_features_path: Optional[Union[str, Path]] = None,
+         path: str | Path,
+         id_column: str | None = None,
+         timestamp_column: str | None = None,
+         static_features_path: str | Path | None = None,
      ) -> TimeSeriesDataFrame:
          """Construct a ``TimeSeriesDataFrame`` from a CSV or Parquet file.
 
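
A hedged usage sketch for ``from_path`` (hypothetical file names; the argument names mirror the signature above):

    >>> ts_df = TimeSeriesDataFrame.from_path(
    ...     "sales.csv",                          # hypothetical CSV or Parquet file
    ...     id_column="product_id",               # renamed to "item_id" internally
    ...     timestamp_column="date",              # renamed to "timestamp" internally
    ...     static_features_path="products.csv",  # optional per-item metadata
    ... )
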
@@ -381,7 +355,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          Returns
          -------
          ts_df: TimeSeriesDataFrame
-             A data frame in TimeSeriesDataFrame format.
+             A dataframe in TimeSeriesDataFrame format.
          """
          return cls(path, static_features=static_features_path, id_column=id_column, timestamp_column=timestamp_column)
 
@@ -410,22 +384,20 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          Returns
          -------
          ts_df: TimeSeriesDataFrame
-             A data frame in TimeSeriesDataFrame format.
+             A dataframe in TimeSeriesDataFrame format.
          """
          return cls(iterable_dataset, num_cpus=num_cpus)
 
      @property
      def item_ids(self) -> pd.Index:
-         return self.index.unique(level=ITEMID)
-
-     @property
-     def static_features(self):
-         return self._static_features
+         """List of unique time series IDs contained in the data set."""
+         return self.index.unique(level=self.ITEMID)
 
+     @classmethod
      def _construct_static_features(
          cls,
-         static_features: Union[pd.DataFrame, str, Path],
-         id_column: Optional[str] = None,
+         static_features: pd.DataFrame | str | Path,
+         id_column: str | None = None,
      ) -> pd.DataFrame:
          if isinstance(static_features, (str, Path)):
              static_features = load_pd.load(str(static_features))
@@ -436,14 +408,20 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
 
          if id_column is not None:
              assert id_column in static_features.columns, f"Column '{id_column}' not found in static_features!"
-             if id_column != ITEMID and ITEMID in static_features.columns:
-                 logger.warning(f"Renaming existing column '{ITEMID}' -> '__{ITEMID}' to avoid name collisions.")
-                 static_features.rename(columns={ITEMID: "__" + ITEMID}, inplace=True)
-             static_features.rename(columns={id_column: ITEMID}, inplace=True)
+             if id_column != cls.ITEMID and cls.ITEMID in static_features.columns:
+                 logger.warning(
+                     f"Renaming existing column '{cls.ITEMID}' -> '__{cls.ITEMID}' to avoid name collisions."
+                 )
+                 static_features.rename(columns={cls.ITEMID: "__" + cls.ITEMID}, inplace=True)
+             static_features.rename(columns={id_column: cls.ITEMID}, inplace=True)
          return static_features
 
+     @property
+     def static_features(self):
+         return self._static_features
+
      @static_features.setter
-     def static_features(self, value: Optional[pd.DataFrame]):
+     def static_features(self, value: pd.DataFrame | None):
          # if the current item index is not a multiindex, then we are dealing with a single
          # item slice. this should only happen when the user explicitly requests only a
          # single item or during `slice_by_timestep`. In this case we do not set static features
@@ -460,10 +438,10 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
 
          # Avoid modifying static features inplace
          value = value.copy()
-         if ITEMID in value.columns and value.index.name != ITEMID:
-             value = value.set_index(ITEMID)
-         if value.index.name != ITEMID:
-             value.index.rename(ITEMID, inplace=True)
+         if self.ITEMID in value.columns and value.index.name != self.ITEMID:
+             value = value.set_index(self.ITEMID)
+         if value.index.name != self.ITEMID:
+             value.index.rename(self.ITEMID, inplace=True)
          missing_item_ids = self.item_ids.difference(value.index)
          if len(missing_item_ids) > 0:
              raise ValueError(
@@ -476,37 +454,102 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
 
          self._static_features = value
 
+     def infer_frequency(self, num_items: int | None = None, raise_if_irregular: bool = False) -> str:
+         """Infer the time series frequency based on the timestamps of the observations.
+
+         Parameters
+         ----------
+         num_items : int or None, default = None
+             Number of items (individual time series) randomly selected to infer the frequency. Lower values speed up
+             the method, but increase the chance that some items with invalid frequency are missed by subsampling.
+
+             If set to ``None``, all items will be used for inferring the frequency.
+         raise_if_irregular : bool, default = False
+             If True, an exception will be raised if some items have an irregular frequency, or if different items have
+             different frequencies.
+
+         Returns
+         -------
+         freq : str
+             If all time series have a regular frequency, returns a pandas-compatible `frequency alias <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
+
+             If some items have an irregular frequency or if different items have different frequencies, returns string
+             ``IRREG``.
+         """
+         ts_df = self
+         if num_items is not None and ts_df.num_items > num_items:
+             items_subset = ts_df.item_ids.to_series().sample(n=num_items, random_state=123)
+             ts_df = ts_df.loc[items_subset]
+
+         if not ts_df.index.is_monotonic_increasing:
+             ts_df = ts_df.sort_index()
+
+         indptr = ts_df.get_indptr()
+         item_ids = ts_df.item_ids
+         timestamps = ts_df.index.get_level_values(level=1)
+         candidate_freq = ts_df.index.levels[1].freq
+
+         frequencies = []
+         irregular_items = []
+         for i in range(len(indptr) - 1):
+             start, end = indptr[i], indptr[i + 1]
+             item_timestamps = timestamps[start:end]
+             inferred_freq = item_timestamps.inferred_freq
+
+             # Fallback option: maybe original index has a `freq` attribute that pandas fails to infer (e.g., 'SME')
+             if inferred_freq is None and candidate_freq is not None:
+                 try:
+                     # If this line does not raise an exception, then candidate_freq is a compatible frequency
+                     item_timestamps.freq = candidate_freq
+                 except ValueError:
+                     inferred_freq = None
+                 else:
+                     inferred_freq = candidate_freq.freqstr
+
+             if inferred_freq is None:
+                 irregular_items.append(item_ids[i])
+             else:
+                 frequencies.append(inferred_freq)
+
+         unique_freqs = list(set(frequencies))
+         if len(unique_freqs) != 1 or len(irregular_items) > 0:
+             if raise_if_irregular:
+                 if irregular_items:
+                     raise ValueError(
+                         f"Cannot infer frequency. Items with irregular frequency: {reprlib.repr(irregular_items)}"
+                     )
+                 else:
+                     raise ValueError(f"Cannot infer frequency. Multiple frequencies detected: {unique_freqs}")
+             else:
+                 return self.IRREGULAR_TIME_INDEX_FREQSTR
+         else:
+             return pd.tseries.frequencies.to_offset(unique_freqs[0]).freqstr
+
      @property
      def freq(self):
-         if self._cached_freq is not None and self._cached_freq == IRREGULAR_TIME_INDEX_FREQSTR:
-             return None  # irregularly sampled time series
-         elif self._cached_freq:
-             return self._cached_freq
-
-         def get_freq(series):
-             return series.index.freq or series.index.inferred_freq
-
-         # check the frequencies of the first 100 items to see if frequencies are consistent and
-         # can be inferred
-         freq_for_each_series = [get_freq(self.loc[idx]) for idx in self.item_ids[:100]]
-         freq = freq_for_each_series[0]
-         if len(set(freq_for_each_series)) > 1 or freq is None:
-             self._cached_freq = IRREGULAR_TIME_INDEX_FREQSTR
-             return None
-
-         freq = freq.freqstr if isinstance(freq, pd._libs.tslibs.BaseOffset) else freq
-         self._cached_freq = freq
-         return freq
+         """Inferred pandas-compatible frequency of the timestamps in the dataframe.
+
+         Computed using a random subset of the time series for speed. This may sometimes result in incorrectly inferred
+         values. For reliable results, use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.infer_frequency`.
+         """
+         inferred_freq = self.infer_frequency(num_items=50)
+         return None if inferred_freq == self.IRREGULAR_TIME_INDEX_FREQSTR else inferred_freq
 
      @property
      def num_items(self):
+         """Number of items (time series) in the data set."""
          return len(self.item_ids)
 
      def num_timesteps_per_item(self) -> pd.Series:
-         """Length of each time series in the dataframe."""
-         return self.groupby(level=ITEMID, sort=False).size()
+         """Number of observations in each time series in the dataframe.
+
+         Returns a ``pandas.Series`` with ``item_id`` as index and number of observations per item as values.
+         """
+         counts = pd.Series(self.index.codes[0]).value_counts(sort=False)
+         counts.index = self.index.levels[0][counts.index]
+         return counts
 
-     def copy(self: TimeSeriesDataFrame, deep: bool = True) -> pd.DataFrame:  # noqa
+     def copy(self: TimeSeriesDataFrame, deep: bool = True) -> TimeSeriesDataFrame:
          """Make a copy of the TimeSeriesDataFrame.
 
          When ``deep=True`` (default), a new object will be created with a copy of the calling object's data and
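
A short usage sketch for the frequency API added above (outputs depend on the data; ``"D"`` is the pandas alias for daily frequency):

    >>> ts_df.infer_frequency()                         # e.g., "D", or "IRREG" if irregular
    >>> ts_df.infer_frequency(num_items=100)            # subsample 100 items for speed
    >>> ts_df.infer_frequency(raise_if_irregular=True)  # raises ValueError instead of returning "IRREG"
    >>> ts_df.freq                                      # wraps infer_frequency(num_items=50); None if irregular
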
@@ -527,24 +570,22 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          return obj
 
      def __finalize__(  # noqa
-         self: TimeSeriesDataFrame, other, method: Optional[str] = None, **kwargs
+         self: TimeSeriesDataFrame, other, method: str | None = None, **kwargs
      ) -> TimeSeriesDataFrame:
          super().__finalize__(other=other, method=method, **kwargs)
          # when finalizing the copy/slice operation, we use the property setter to stay consistent
          # with the item index
          if hasattr(other, "_static_features"):
              self.static_features = other._static_features
-         if hasattr(other, "_cached_freq"):
-             self._cached_freq = other._cached_freq
          return self
 
-     def split_by_time(self, cutoff_time: pd.Timestamp) -> Tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
+     def split_by_time(self, cutoff_time: pd.Timestamp) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
          """Split dataframe to two different ``TimeSeriesDataFrame`` s before and after a certain ``cutoff_time``.
 
          Parameters
          ----------
          cutoff_time: pd.Timestamp
-             The time to split the current data frame into two data frames.
+             The time to split the current dataframe into two dataframes.
 
          Returns
          -------
@@ -559,19 +600,16 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          data_after = self.loc[(slice(None), slice(cutoff_time, None)), :]
          before = TimeSeriesDataFrame(data_before, static_features=self.static_features)
          after = TimeSeriesDataFrame(data_after, static_features=self.static_features)
-         before._cached_freq = self._cached_freq
-         after._cached_freq = self._cached_freq
          return before, after
 
-     def slice_by_timestep(
-         self, start_index: Optional[int] = None, end_index: Optional[int] = None
-     ) -> TimeSeriesDataFrame:
+     def slice_by_timestep(self, start_index: int | None = None, end_index: int | None = None) -> TimeSeriesDataFrame:
          """Select a subsequence from each time series between start (inclusive) and end (exclusive) indices.
 
          This operation is equivalent to selecting a slice ``[start_index : end_index]`` from each time series, and then
          combining these slices into a new ``TimeSeriesDataFrame``. See examples below.
 
-         Returns a copy of the original data. This is useful for constructing holdout sets for validation.
+         It is recommended to sort the index with ``ts_df.sort_index()`` before calling this method to take advantage of
+         a fast optimized algorithm.
 
          Parameters
          ----------
@@ -658,11 +696,53 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          if end_index is not None and not isinstance(end_index, int):
              raise ValueError(f"end_index must be of type int or None (got {type(end_index)})")
 
-         time_step_slice = slice(start_index, end_index)
-         result = self.groupby(level=ITEMID, sort=False, as_index=False).nth(time_step_slice)
-         result.static_features = self.static_features
-         result._cached_freq = self._cached_freq
-         return result
+         if start_index is None and end_index is None:
+             # Return a copy to avoid in-place modification.
+             # self.copy() is much faster than self.loc[ones(len(self), dtype=bool)]
+             return self.copy()
+
+         if self.index.is_monotonic_increasing:
+             # Use a fast optimized algorithm if the index is sorted
+             indptr = self.get_indptr()
+             lengths = np.diff(indptr)
+             starts = indptr[:-1]
+
+             slice_start = (
+                 np.zeros_like(lengths)
+                 if start_index is None
+                 else np.clip(np.where(start_index >= 0, start_index, lengths + start_index), 0, lengths)
+             )
+             slice_end = (
+                 lengths.copy()
+                 if end_index is None
+                 else np.clip(np.where(end_index >= 0, end_index, lengths + end_index), 0, lengths)
+             )
+
+             # Filter out invalid slices where start >= end
+             valid_slices = slice_start < slice_end
+             if not np.any(valid_slices):
+                 # Return empty dataframe with same structure
+                 return self.loc[np.zeros(len(self), dtype=bool)]
+
+             starts = starts[valid_slices]
+             slice_start = slice_start[valid_slices]
+             slice_end = slice_end[valid_slices]
+
+             # We put 1 at the slice_start index for each item and -1 at the slice_end index for each item.
+             # After we apply cumsum we get the indicator mask selecting values between slice_start and slice_end
+             # cumsum([0, 0, 1, 0, 0, -1, 0]) -> [0, 0, 1, 1, 1, 0, 0]
+             # We need array of size len(self) + 1 in case events[starts + slice_end] tries to access position len(self)
+             events = np.zeros(len(self) + 1, dtype=np.int8)
+             events[starts + slice_start] += 1
+             events[starts + slice_end] -= 1
+             mask = np.cumsum(events)[:-1].astype(bool)
+             # loc[mask] returns a view of the original data - modifying it will produce a SettingWithCopyWarning
+             return self.loc[mask]
+         else:
+             # Fall back to a slow groupby operation
+             result = self.groupby(level=self.ITEMID, sort=False, as_index=False).nth(slice(start_index, end_index))
+             result.static_features = self.static_features
+             return result
 
      def slice_by_time(self, start_time: pd.Timestamp, end_time: pd.Timestamp) -> TimeSeriesDataFrame:
          """Select a subsequence from each time series between start (inclusive) and end (exclusive) timestamps.
@@ -691,7 +771,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
 
      @classmethod
      def from_pickle(cls, filepath_or_buffer: Any) -> TimeSeriesDataFrame:
-         """Convenience method to read pickled time series data frames. If the read pickle
+         """Convenience method to read pickled time series dataframes. If the read pickle
          file refers to a plain pandas DataFrame, it will be cast to a TimeSeriesDataFrame.
 
          Parameters
@@ -702,7 +782,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          Returns
          -------
          ts_df : TimeSeriesDataFrame
-             The pickled time series data frame.
+             The pickled time series dataframe.
          """
          try:
              data = pd.read_pickle(filepath_or_buffer)
@@ -713,16 +793,21 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
      def fill_missing_values(self, method: str = "auto", value: float = 0.0) -> TimeSeriesDataFrame:
          """Fill missing values represented by NaN.
 
+         .. note::
+             This method assumes that the index of the TimeSeriesDataFrame is sorted by [item_id, timestamp].
+
+             If the index is not sorted, this method will log a warning and may produce an incorrect result.
+
          Parameters
          ----------
          method : str, default = "auto"
              Method used to impute missing values.
 
-             - "auto" - first forward fill (to fill the in-between and trailing NaNs), then backward fill (to fill the leading NaNs)
-             - "ffill" or "pad" - propagate last valid observation forward. Note: missing values at the start of the time series are not filled.
-             - "bfill" or "backfill" - use next valid observation to fill gap. Note: this may result in information leakage; missing values at the end of the time series are not filled.
-             - "constant" - replace NaNs with the given constant ``value``.
-             - "interpolate" - fill NaN values using linear interpolation. Note: this may result in information leakage.
+             - ``"auto"`` - first forward fill (to fill the in-between and trailing NaNs), then backward fill (to fill the leading NaNs)
+             - ``"ffill"`` or ``"pad"`` - propagate last valid observation forward. Note: missing values at the start of the time series are not filled.
+             - ``"bfill"`` or ``"backfill"`` - use next valid observation to fill gap. Note: this may result in information leakage; missing values at the end of the time series are not filled.
+             - ``"constant"`` - replace NaNs with the given constant ``value``.
+             - ``"interpolate"`` - fill NaN values using linear interpolation. Note: this may result in information leakage.
          value : float, default = 0.0
              Value used by the "constant" imputation method.
@@ -759,17 +844,25 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          2019-02-07    4.0
 
          """
-         if self.freq is None:
-             raise ValueError(
-                 "Please make sure that all time series have a regular index before calling `fill_missing_values`"
-                 "(for example, using the `convert_frequency` method)."
+         # Convert to pd.DataFrame for faster processing
+         df = pd.DataFrame(self)
+
+         # Skip filling if there are no NaNs
+         if not df.isna().any(axis=None):
+             return self
+
+         if not self.index.is_monotonic_increasing:
+             logger.warning(
+                 "Trying to fill missing values in an unsorted dataframe. "
+                 "It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`"
              )
 
-         grouped_df = pd.DataFrame(self).groupby(level=ITEMID, sort=False, group_keys=False)
+         grouped_df = df.groupby(level=self.ITEMID, sort=False, group_keys=False)
          if method == "auto":
              filled_df = grouped_df.ffill()
-             # Fill missing values at the start of each time series with bfill
-             filled_df = filled_df.groupby(level=ITEMID, sort=False, group_keys=False).bfill()
+             # If necessary, fill missing values at the start of each time series with bfill
+             if filled_df.isna().any(axis=None):
+                 filled_df = filled_df.groupby(level=self.ITEMID, sort=False, group_keys=False).bfill()
          elif method in ["ffill", "pad"]:
              filled_df = grouped_df.ffill()
          elif method in ["bfill", "backfill"]:
@@ -786,7 +879,7 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
              )
          return TimeSeriesDataFrame(filled_df, static_features=self.static_features)
 
-     def dropna(self, how: str = "any") -> TimeSeriesDataFrame:
+     def dropna(self, how: str = "any") -> TimeSeriesDataFrame:  # type: ignore[override]
          """Drop rows containing NaNs.
 
          Parameters
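
A hedged usage sketch combining the imputation methods above (toy workflow; per the new note, sort the index first):

    >>> ts_df = ts_df.sort_index()                    # avoids the unsorted-index warning
    >>> filled = ts_df.fill_missing_values()          # "auto": ffill, then bfill for leading NaNs
    >>> zeros = ts_df.fill_missing_values(method="constant", value=0.0)
    >>> trimmed = ts_df.dropna(how="any")             # or drop incomplete rows instead
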
@@ -802,18 +895,27 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          dropped_df = pd.DataFrame(self).dropna(how=how)
          return TimeSeriesDataFrame(dropped_df, static_features=self.static_features)
 
+     # added for static type checker compatibility
+     def assign(self, **kwargs) -> TimeSeriesDataFrame:
+         """Assign new columns to the time series dataframe. See :meth:`pandas.DataFrame.assign` for details."""
+         return super().assign(**kwargs)  # type: ignore
+
+     # added for static type checker compatibility
+     def sort_index(self, *args, **kwargs) -> TimeSeriesDataFrame:
+         return super().sort_index(*args, **kwargs)  # type: ignore
+
      def get_model_inputs_for_scoring(
-         self, prediction_length: int, known_covariates_names: Optional[List[str]] = None
-     ) -> Tuple[TimeSeriesDataFrame, Optional[TimeSeriesDataFrame]]:
+         self, prediction_length: int, known_covariates_names: list[str] | None = None
+     ) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame | None]:
          """Prepare model inputs necessary to predict the last ``prediction_length`` time steps of each time series in the dataset.
 
          Parameters
          ----------
          prediction_length : int
              The forecast horizon, i.e., How many time steps into the future must be predicted.
-         known_covariates_names : List[str], optional
+         known_covariates_names : list[str], optional
              Names of the dataframe columns that contain covariates known in the future.
-             See :attr:`known_covariates_names` of :class:`~autogluon.timeseries.TimeSeriesPredictor` for more details.
+             See ``known_covariates_names`` of :class:`~autogluon.timeseries.TimeSeriesPredictor` for more details.
 
          Returns
          -------
@@ -834,12 +936,16 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
      def train_test_split(
          self,
          prediction_length: int,
-         end_index: Optional[int] = None,
-         suffix: Optional[str] = None,
-     ) -> Tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
+         end_index: int | None = None,
+         suffix: str | None = None,
+     ) -> tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
          """Generate a train/test split from the given dataset.
+
          This method can be used to generate splits for multi-window backtesting.
 
+         .. note::
+             This method automatically sorts the TimeSeriesDataFrame by [item_id, timestamp].
+
          Parameters
          ----------
          prediction_length : int
@@ -858,7 +964,11 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          test_data : TimeSeriesDataFrame
              Test portion of the data. Contains the slice ``[:end_idx]`` of each time series in the original dataset.
          """
-         test_data = self.slice_by_timestep(None, end_index)
+         df = self
+         if not df.index.is_monotonic_increasing:
+             logger.warning("Sorting the dataframe index before generating the train/test split.")
+             df = df.sort_index()
+         test_data = df.slice_by_timestep(None, end_index)
          train_data = test_data.slice_by_timestep(None, -prediction_length)
 
          if suffix is not None:
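
A sketch of the resulting split (assumed ``prediction_length=7``; each test series keeps its full length, while each train series drops its last 7 steps):

    >>> train_data, test_data = ts_df.train_test_split(prediction_length=7)
    >>> # equivalent, on a sorted frame:
    >>> # test_data = ts_df.slice_by_timestep(None, None)
    >>> # train_data = test_data.slice_by_timestep(None, -7)
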
@@ -872,14 +982,14 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
 
      def convert_frequency(
          self,
-         freq: Union[str, pd.DateOffset],
+         freq: str | pd.DateOffset,
          agg_numeric: str = "mean",
          agg_categorical: str = "first",
          num_cpus: int = -1,
          chunk_size: int = 100,
          **kwargs,
      ) -> TimeSeriesDataFrame:
-         """Convert each time series in the data frame to the given frequency.
+         """Convert each time series in the dataframe to the given frequency.
 
          This method is useful for two purposes:
 
@@ -889,10 +999,9 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          Standard ``df.groupby(...).resample(...)`` can be extremely slow for large datasets, so we parallelize this
          operation across multiple CPU cores.
 
-
          Parameters
          ----------
-         freq : Union[str, pd.DateOffset]
+         freq : str | pd.DateOffset
              Frequency to which the data should be converted. See `pandas frequency aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
              for supported values.
          agg_numeric : {"max", "min", "sum", "mean", "median", "first", "last"}, default = "mean"
@@ -953,20 +1062,18 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
          2021-06-30    6.0
          2021-09-30    7.0
          2021-12-31    8.0
-         >>> ts_df.convert_frequency("Y")
+         >>> ts_df.convert_frequency("YE")
                              target
          item_id timestamp
          0       2020-12-31     2.5
                  2021-12-31     6.5
-         >>> ts_df.convert_frequency("Y", agg_numeric="sum")
+         >>> ts_df.convert_frequency("YE", agg_numeric="sum")
                              target
          item_id timestamp
          0       2020-12-31    10.0
                  2021-12-31    26.0
          """
          offset = pd.tseries.frequencies.to_offset(freq)
-         if self.freq == offset.freqstr:
-             return self
 
          # We need to aggregate categorical columns separately because .agg("mean") deletes all non-numeric columns
          aggregation = {}
@@ -981,22 +1088,68 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
              iterable = iter(iterable)
              return iter(lambda: tuple(islice(iterable, size)), ())
 
-         def resample_chunk(chunk: Iterable[Tuple[str, pd.DataFrame]]) -> pd.DataFrame:
+         def resample_chunk(chunk: Iterable[tuple[str, pd.DataFrame]]) -> pd.DataFrame:
              resampled_dfs = []
              for item_id, df in chunk:
-                 resampled_df = df.resample(offset, level=TIMESTAMP, **kwargs).agg(aggregation)
-                 resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[ITEMID]))
+                 resampled_df = df.resample(offset, level=self.TIMESTAMP, **kwargs).agg(aggregation)
+                 resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[self.ITEMID]))
              return pd.concat(resampled_dfs)
 
          # Resampling time for 1 item < overhead time for a single parallel job. Therefore, we group items into chunks
          # so that the speedup from parallelization isn't dominated by the communication costs.
-         chunks = split_into_chunks(pd.DataFrame(self).groupby(level=ITEMID, sort=False), chunk_size)
+         df = pd.DataFrame(self)
+         # Make sure that timestamp index has dtype 'datetime64[ns]', otherwise index may contain NaT values.
+         # See https://github.com/autogluon/autogluon/issues/4917
+         df.index = df.index.set_levels(df.index.levels[1].astype("datetime64[ns]"), level=self.TIMESTAMP)
+         chunks = split_into_chunks(df.groupby(level=self.ITEMID, sort=False), chunk_size)
          resampled_chunks = Parallel(n_jobs=num_cpus)(delayed(resample_chunk)(chunk) for chunk in chunks)
          resampled_df = TimeSeriesDataFrame(pd.concat(resampled_chunks))
          resampled_df.static_features = self.static_features
          return resampled_df
 
-     def __dir__(self) -> List[str]:
-         # This hides method from IPython autocomplete, but not VSCode autocomplete
-         deprecated = ["get_reindexed_view", "to_regular_index"]
-         return [d for d in super().__dir__() if d not in deprecated]
+     def to_data_frame(self) -> pd.DataFrame:
+         """Convert ``TimeSeriesDataFrame`` to a ``pandas.DataFrame``"""
+         return pd.DataFrame(self)
+
+     def get_indptr(self) -> np.ndarray:
+         """[Advanced] Get a numpy array of shape [num_items + 1] that points to the start and end of each time series.
+
+         This method assumes that the TimeSeriesDataFrame is sorted by [item_id, timestamp].
+         """
+         return np.concatenate([[0], np.cumsum(self.num_timesteps_per_item().to_numpy())]).astype(np.int32)
+
+     # inline typing stubs for various overridden methods
+     if TYPE_CHECKING:
+
+         def query(  # type: ignore
+             self, expr: str, *, inplace: bool = False, **kwargs
+         ) -> Self: ...
+
+         def reindex(*args, **kwargs) -> Self: ...  # type: ignore
+
+         @overload
+         def __new__(cls, data: pd.DataFrame, static_features: pd.DataFrame | None = None) -> Self: ...  # type: ignore
+         @overload
+         def __new__(
+             cls,
+             data: pd.DataFrame | str | Path | Iterable,
+             static_features: pd.DataFrame | str | Path | None = None,
+             id_column: str | None = None,
+             timestamp_column: str | None = None,
+             num_cpus: int = -1,
+             *args,
+             **kwargs,
+         ) -> Self:
+             """This overload is needed since in pandas, during type checking, the default constructor resolves to __new__"""
+             ...
+
+         @overload
+         def __getitem__(self, items: list[str]) -> Self: ...  # type: ignore
+         @overload
+         def __getitem__(self, item: str) -> pd.Series: ...  # type: ignore
+
+
+ # TODO: remove with v2.0
+ # module-level constants kept for backward compatibility.
+ ITEMID = TimeSeriesDataFrame.ITEMID
+ TIMESTAMP = TimeSeriesDataFrame.TIMESTAMP
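
Finally, a toy sketch of the new ``get_indptr`` helper (two items of lengths 3 and 4; these CSR-style boundaries are what the optimized ``slice_by_timestep`` iterates over):

    >>> ts_df.num_timesteps_per_item().to_numpy()   # e.g., array([3, 4])
    >>> ts_df.get_indptr()                          # array([0, 3, 7], dtype=int32)
    >>> # rows of the second item are ts_df.iloc[3:7]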