tabpfn-time-series 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1232 @@
+# This file is copied from https://github.com/autogluon/autogluon/blob/main/autogluon/timeseries/ts_dataframe.py
+# and modified to remove the autogluon dependency.
+# - remove the features that allow loading directly from a local file.
+
+from __future__ import annotations
+
+import copy
+import itertools
+import logging
+import reprlib
+from collections.abc import Iterable
+from itertools import islice
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, Union, overload
+
+import numpy as np
+import pandas as pd
+from joblib.parallel import Parallel, delayed
+from pandas.core.internals import ArrayManager, BlockManager  # type: ignore
+from typing_extensions import Self
+
+# from autogluon.common.loaders import load_pd
+
+logger = logging.getLogger(__name__)
+
+ITEMID = "item_id"
+TIMESTAMP = "timestamp"
+
+IRREGULAR_TIME_INDEX_FREQSTR = "IRREG"
+
+
+class TimeSeriesDataFrame(pd.DataFrame):
+    """A collection of univariate time series, where each row is identified by an (``item_id``, ``timestamp``) pair.
+
+    For example, a time series dataframe could represent the daily sales of a collection of products, where each
+    ``item_id`` corresponds to a product and ``timestamp`` corresponds to the day of the record.
+
+    Parameters
+    ----------
+    data : pd.DataFrame, str, pathlib.Path or Iterable
+        Time series data to construct a ``TimeSeriesDataFrame``. The class currently supports four input formats.
+
+        1. Time series data in a pandas DataFrame format without multi-index. For example::
+
+                   item_id  timestamp  target
+                0        0 2019-01-01       0
+                1        0 2019-01-02       1
+                2        0 2019-01-03       2
+                3        1 2019-01-01       3
+                4        1 2019-01-02       4
+                5        1 2019-01-03       5
+                6        2 2019-01-01       6
+                7        2 2019-01-02       7
+                8        2 2019-01-03       8
+
+           You can also use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.from_data_frame` for loading data in such format.
+
+        2. Path to a data file in CSV or Parquet format. The file must contain columns ``item_id`` and ``timestamp``, as well as columns with time series values. This is similar to Option 1 above (pandas DataFrame format without multi-index). Both remote (e.g., S3) and local paths are accepted. You can also use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.from_path` for loading data in such format.
+
+        3. Time series data in pandas DataFrame format with multi-index on ``item_id`` and ``timestamp``. For example::
+
+                                      target
+                item_id timestamp
+                0       2019-01-01       0
+                        2019-01-02       1
+                        2019-01-03       2
+                1       2019-01-01       3
+                        2019-01-02       4
+                        2019-01-03       5
+                2       2019-01-01       6
+                        2019-01-02       7
+                        2019-01-03       8
+
+        4. Time series data in Iterable format. For example::
+
+                iterable_dataset = [
+                    {"target": [0, 1, 2], "start": pd.Period("01-01-2019", freq='D')},
+                    {"target": [3, 4, 5], "start": pd.Period("01-01-2019", freq='D')},
+                    {"target": [6, 7, 8], "start": pd.Period("01-01-2019", freq='D')}
+                ]
+
+           You can also use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.from_iterable_dataset` for loading data in such format.
+
+    static_features : pd.DataFrame, str or pathlib.Path, optional
+        An optional dataframe describing the metadata of each individual time series that does not change with time.
+        Can take real-valued or categorical values. For example, if ``TimeSeriesDataFrame`` contains sales of various
+        products, static features may refer to time-independent features like color or brand.
+
+        The index of ``static_features`` must contain a single entry for each item present in the respective
+        ``TimeSeriesDataFrame``. For example, the following ``TimeSeriesDataFrame``::
+
+                               target
+            item_id timestamp
+            A       2019-01-01      0
+                    2019-01-02      1
+                    2019-01-03      2
+            B       2019-01-01      3
+                    2019-01-02      4
+                    2019-01-03      5
+
+        is compatible with the following ``static_features``::
+
+                     feat_1 feat_2
+            item_id
+            A           2.0    bar
+            B           5.0    foo
+
+        ``TimeSeriesDataFrame`` will ensure consistency of static features during serialization/deserialization, copy
+        and slice operations.
+
+        If ``static_features`` are provided during ``fit``, the ``TimeSeriesPredictor`` expects the same metadata to be
+        available at prediction time.
+    id_column : str, optional
+        Name of the ``item_id`` column, if it's different from the default. This argument is only used when
+        constructing a TimeSeriesDataFrame using format 1 (DataFrame without multi-index) or 2 (path to a file).
+    timestamp_column : str, optional
+        Name of the ``timestamp`` column, if it's different from the default. This argument is only used when
+        constructing a TimeSeriesDataFrame using format 1 (DataFrame without multi-index) or 2 (path to a file).
+    num_cpus : int, default = -1
+        Number of CPU cores used to process the iterable dataset in parallel. Set to -1 to use all cores. This argument
+        is only used when constructing a TimeSeriesDataFrame using format 4 (iterable dataset).
+
+    """
+
+    index: pd.MultiIndex
+    _metadata = ["_static_features"]
+
+    def __init__(
+        self,
+        data: Union[pd.DataFrame, str, Path, Iterable],
+        static_features: Optional[Union[pd.DataFrame, str, Path]] = None,
+        id_column: Optional[str] = None,
+        timestamp_column: Optional[str] = None,
+        num_cpus: int = -1,
+        *args,
+        **kwargs,
+    ):
+        if isinstance(data, (BlockManager, ArrayManager)):
+            # necessary for copy constructor to work in pandas <= 2.0.x. In >= 2.1.x this is replaced by _constructor_from_mgr
+            pass
+        elif isinstance(data, pd.DataFrame):
+            if isinstance(data.index, pd.MultiIndex):
+                self._validate_multi_index_data_frame(data)
+            else:
+                data = self._construct_tsdf_from_data_frame(
+                    data, id_column=id_column, timestamp_column=timestamp_column
+                )
+        # elif isinstance(data, (str, Path)):
+        #     data = self._construct_tsdf_from_data_frame(
+        #         load_pd.load(str(data)), id_column=id_column, timestamp_column=timestamp_column
+        #     )
+        elif isinstance(data, Iterable):
+            data = self._construct_tsdf_from_iterable_dataset(data, num_cpus=num_cpus)
+        else:
+            raise ValueError(
+                f"data must be a pd.DataFrame, Iterable, string or Path (received {type(data)})."
+            )
+        super().__init__(data=data, *args, **kwargs)  # type: ignore
+        self._static_features: Optional[pd.DataFrame] = None
+        if static_features is not None:
+            self.static_features = self._construct_static_features(
+                static_features, id_column=id_column
+            )
+
+    @property
+    def _constructor(self) -> Type[TimeSeriesDataFrame]:
+        return TimeSeriesDataFrame
+
+    def _constructor_from_mgr(self, mgr, axes):
+        # Use the default constructor when constructing from _mgr. Otherwise pandas enters an infinite recursion by
+        # repeatedly calling TimeSeriesDataFrame constructor
+        df = self._from_mgr(mgr, axes=axes)
+        df._static_features = self._static_features
+        return df
+
+    @classmethod
+    def _construct_tsdf_from_data_frame(
+        cls,
+        df: pd.DataFrame,
+        id_column: Optional[str] = None,
+        timestamp_column: Optional[str] = None,
+    ) -> pd.DataFrame:
+        df = df.copy()
+        if id_column is not None:
+            assert id_column in df.columns, f"Column '{id_column}' not found!"
+            if id_column != ITEMID and ITEMID in df.columns:
+                logger.warning(
+                    f"Renaming existing column '{ITEMID}' -> '__{ITEMID}' to avoid name collisions."
+                )
+                df.rename(columns={ITEMID: "__" + ITEMID}, inplace=True)
+            df.rename(columns={id_column: ITEMID}, inplace=True)
+
+        if timestamp_column is not None:
+            assert timestamp_column in df.columns, (
+                f"Column '{timestamp_column}' not found!"
+            )
+            if timestamp_column != TIMESTAMP and TIMESTAMP in df.columns:
+                logger.warning(
+                    f"Renaming existing column '{TIMESTAMP}' -> '__{TIMESTAMP}' to avoid name collisions."
+                )
+                df.rename(columns={TIMESTAMP: "__" + TIMESTAMP}, inplace=True)
+            df.rename(columns={timestamp_column: TIMESTAMP}, inplace=True)
+
+        if TIMESTAMP in df.columns:
+            df[TIMESTAMP] = pd.to_datetime(df[TIMESTAMP])
+
+        cls._validate_data_frame(df)
+        return df.set_index([ITEMID, TIMESTAMP])
+
+    @classmethod
+    def _construct_tsdf_from_iterable_dataset(
+        cls, iterable_dataset: Iterable, num_cpus: int = -1
+    ) -> pd.DataFrame:
+        def load_single_item(item_id: int, ts: dict) -> pd.DataFrame:
+            start_timestamp = ts["start"]
+            freq = start_timestamp.freq
+            if isinstance(start_timestamp, pd.Period):
+                start_timestamp = start_timestamp.to_timestamp(how="S")
+            target = ts["target"]
+            datetime_index = tuple(
+                pd.date_range(start_timestamp, periods=len(target), freq=freq)
+            )
+            idx = pd.MultiIndex.from_product(
+                [(item_id,), datetime_index], names=[ITEMID, TIMESTAMP]
+            )
+            return pd.Series(target, name="target", index=idx).to_frame()
+
+        cls._validate_iterable(iterable_dataset)
+        all_ts = Parallel(n_jobs=num_cpus)(
+            delayed(load_single_item)(item_id, ts)
+            for item_id, ts in enumerate(iterable_dataset)
+        )
+        return pd.concat(all_ts)
+
+    @classmethod
+    def _validate_multi_index_data_frame(cls, data: pd.DataFrame):
+        """Validate that a multi-index pd.DataFrame can be converted to TimeSeriesDataFrame"""
+
+        if not isinstance(data, pd.DataFrame):
+            raise ValueError(f"data must be a pd.DataFrame, got {type(data)}")
+        if not isinstance(data.index, pd.MultiIndex):
+            raise ValueError(f"data must have pd.MultiIndex, got {type(data.index)}")
+        if not pd.api.types.is_datetime64_dtype(data.index.dtypes[TIMESTAMP]):
+            raise ValueError(
+                f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`."
+            )
+        if not data.index.names == (f"{ITEMID}", f"{TIMESTAMP}"):
+            raise ValueError(
+                f"data must have index names as ('{ITEMID}', '{TIMESTAMP}'), got {data.index.names}"
+            )
+        item_id_index = data.index.levels[0]
+        if not (
+            pd.api.types.is_integer_dtype(item_id_index)
+            or pd.api.types.is_string_dtype(item_id_index)
+        ):
+            raise ValueError(
+                f"all entries in index `{ITEMID}` must be of integer or string dtype"
+            )
+
+    @classmethod
+    def _validate_data_frame(cls, df: pd.DataFrame):
+        """Validate that a pd.DataFrame with ITEMID and TIMESTAMP columns can be converted to TimeSeriesDataFrame"""
+        if not isinstance(df, pd.DataFrame):
+            raise ValueError(f"data must be a pd.DataFrame, got {type(df)}")
+        if ITEMID not in df.columns:
+            raise ValueError(f"data must have a `{ITEMID}` column")
+        if TIMESTAMP not in df.columns:
+            raise ValueError(f"data must have a `{TIMESTAMP}` column")
+        if df[ITEMID].isnull().any():
+            raise ValueError(f"`{ITEMID}` column can not have nan")
+        if df[TIMESTAMP].isnull().any():
+            raise ValueError(f"`{TIMESTAMP}` column can not have nan")
+        if not pd.api.types.is_datetime64_dtype(df[TIMESTAMP]):
+            raise ValueError(
+                f"for {TIMESTAMP}, the only pandas dtype allowed is `datetime64`."
+            )
+        item_id_column = df[ITEMID]
+        if not (
+            pd.api.types.is_integer_dtype(item_id_column)
+            or pd.api.types.is_string_dtype(item_id_column)
+        ):
+            raise ValueError(
+                f"all entries in column `{ITEMID}` must be of integer or string dtype"
+            )
+
+    @classmethod
+    def _validate_iterable(cls, data: Iterable):
+        if not isinstance(data, Iterable):
+            raise ValueError("data must be of type Iterable.")
+
+        first = next(iter(data), None)
+        if first is None:
+            raise ValueError("data has no time-series.")
+
+        for i, ts in enumerate(itertools.chain([first], data)):
+            if not isinstance(ts, dict):
+                raise ValueError(
+                    f"{i}'th time-series in data must be a dict, got {type(ts)}"
+                )
+            if not ("target" in ts and "start" in ts):
+                raise ValueError(
+                    f"{i}'th time-series in data must have 'target' and 'start', got {ts.keys()}"
+                )
+            if not isinstance(ts["start"], pd.Period):
+                raise ValueError(
+                    f"{i}'th time-series must have a pandas Period as 'start', got {ts['start']}"
+                )
+
+    @classmethod
+    def from_data_frame(
+        cls,
+        df: pd.DataFrame,
+        id_column: Optional[str] = None,
+        timestamp_column: Optional[str] = None,
+        static_features_df: Optional[pd.DataFrame] = None,
+    ) -> TimeSeriesDataFrame:
+        """Construct a ``TimeSeriesDataFrame`` from a pandas DataFrame.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            A pd.DataFrame with 'item_id' and 'timestamp' as columns. For example::
+
+                   item_id  timestamp  target
+                0        0 2019-01-01       0
+                1        0 2019-01-02       1
+                2        0 2019-01-03       2
+                3        1 2019-01-01       3
+                4        1 2019-01-02       4
+                5        1 2019-01-03       5
+                6        2 2019-01-01       6
+                7        2 2019-01-02       7
+                8        2 2019-01-03       8
+        id_column : str, optional
+            Name of the 'item_id' column, if the column name is different from the default.
+        timestamp_column : str, optional
+            Name of the 'timestamp' column, if the column name is different from the default.
+        static_features_df : pd.DataFrame, optional
+            A pd.DataFrame with 'item_id' column that contains the static features for each time series. For example::
+
+                   item_id  feat_1  feat_2
+                0        0     foo     0.5
+                1        1     foo     2.2
+                2        2     bar     0.1
+
+        Returns
+        -------
+        ts_df: TimeSeriesDataFrame
+            A dataframe in TimeSeriesDataFrame format.
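+
+        Examples
+        --------
+        A minimal usage sketch (assumes ``TimeSeriesDataFrame`` is imported from this module)::
+
+            import pandas as pd
+
+            df = pd.DataFrame({
+                "item_id": [0, 0, 1, 1],
+                "timestamp": pd.to_datetime(["2019-01-01", "2019-01-02"] * 2),
+                "target": [0.0, 1.0, 2.0, 3.0],
+            })
+            ts_df = TimeSeriesDataFrame.from_data_frame(df)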
+        """
+        return cls(
+            df,
+            static_features=static_features_df,
+            id_column=id_column,
+            timestamp_column=timestamp_column,
+        )
+
+    @classmethod
+    def from_path(
+        cls,
+        path: Union[str, Path],
+        id_column: Optional[str] = None,
+        timestamp_column: Optional[str] = None,
+        static_features_path: Optional[Union[str, Path]] = None,
+    ) -> TimeSeriesDataFrame:
+        """Construct a ``TimeSeriesDataFrame`` from a CSV or Parquet file.
+
+        Parameters
+        ----------
+        path : str or pathlib.Path
+            Path to a local or remote (e.g., S3) file containing the time series data in CSV or Parquet format.
+            Example file contents::
+
+                item_id,timestamp,target
+                0,2019-01-01,0
+                0,2019-01-02,1
+                0,2019-01-03,2
+                1,2019-01-01,3
+                1,2019-01-02,4
+                1,2019-01-03,5
+                2,2019-01-01,6
+                2,2019-01-02,7
+                2,2019-01-03,8
+
+        id_column : str, optional
+            Name of the 'item_id' column, if the column name is different from the default.
+        timestamp_column : str, optional
+            Name of the 'timestamp' column, if the column name is different from the default.
+        static_features_path : str or pathlib.Path, optional
+            Path to a local or remote (e.g., S3) file containing static features in CSV or Parquet format.
+            Example file contents::
+
+                item_id,feat_1,feat_2
+                0,foo,0.5
+                1,foo,2.2
+                2,bar,0.1
+
+        Returns
+        -------
+        ts_df: TimeSeriesDataFrame
+            A dataframe in TimeSeriesDataFrame format.
+        """
+        return cls(
+            path,
+            static_features=static_features_path,
+            id_column=id_column,
+            timestamp_column=timestamp_column,
+        )
+
+    @classmethod
+    def from_iterable_dataset(
+        cls, iterable_dataset: Iterable, num_cpus: int = -1
+    ) -> TimeSeriesDataFrame:
+        """Construct a ``TimeSeriesDataFrame`` from an Iterable of dictionaries, each of which
+        represents a single time series.
+
+        This function also offers compatibility with GluonTS `ListDataset format <https://ts.gluon.ai/stable/api/gluonts/gluonts.dataset.common.html#gluonts.dataset.common.ListDataset>`_.
+
+        Parameters
+        ----------
+        iterable_dataset: Iterable
+            An iterator over dictionaries, each with a ``target`` field specifying the value of the
+            (univariate) time series, and a ``start`` field with the starting time as a pandas Period.
+            Example::
+
+                iterable_dataset = [
+                    {"target": [0, 1, 2], "start": pd.Period("01-01-2019", freq='D')},
+                    {"target": [3, 4, 5], "start": pd.Period("01-01-2019", freq='D')},
+                    {"target": [6, 7, 8], "start": pd.Period("01-01-2019", freq='D')}
+                ]
+        num_cpus : int, default = -1
+            Number of CPU cores used to process the iterable dataset in parallel. Set to -1 to use all cores.
+
+        Returns
+        -------
+        ts_df: TimeSeriesDataFrame
+            A dataframe in TimeSeriesDataFrame format.
+        """
+        return cls(iterable_dataset, num_cpus=num_cpus)
+
+    @property
+    def item_ids(self) -> pd.Index:
+        """List of unique time series IDs contained in the data set."""
+        return self.index.unique(level=ITEMID)
+
+    @classmethod
+    def _construct_static_features(
+        cls,
+        static_features: Union[pd.DataFrame, str, Path],
+        id_column: Optional[str] = None,
+    ) -> pd.DataFrame:
+        # if isinstance(static_features, (str, Path)):
+        #     static_features = load_pd.load(str(static_features))
+        if not isinstance(static_features, pd.DataFrame):
+            raise ValueError(
+                f"static_features must be a pd.DataFrame, string or Path (received {type(static_features)})"
+            )
+
+        if id_column is not None:
+            assert id_column in static_features.columns, (
+                f"Column '{id_column}' not found in static_features!"
+            )
+            if id_column != ITEMID and ITEMID in static_features.columns:
+                logger.warning(
+                    f"Renaming existing column '{ITEMID}' -> '__{ITEMID}' to avoid name collisions."
+                )
+                static_features.rename(columns={ITEMID: "__" + ITEMID}, inplace=True)
+            static_features.rename(columns={id_column: ITEMID}, inplace=True)
+        return static_features
+
+    @property
+    def static_features(self):
+        return self._static_features
+
+    @static_features.setter
+    def static_features(self, value: Optional[pd.DataFrame]):
+        # if the current item index is not a multiindex, then we are dealing with a single
+        # item slice. this should only happen when the user explicitly requests only a
+        # single item or during `slice_by_timestep`. In this case we do not set static features
+        if not isinstance(self.index, pd.MultiIndex):
+            return
+
+        if value is not None:
+            if isinstance(value, pd.Series):
+                value = value.to_frame()
+            if not isinstance(value, pd.DataFrame):
+                raise ValueError(
+                    f"static_features must be a pandas DataFrame (received object of type {type(value)})"
+                )
+            if isinstance(value.index, pd.MultiIndex):
+                raise ValueError("static_features cannot have a MultiIndex")
+
+            # Avoid modifying static features inplace
+            value = value.copy()
+            if ITEMID in value.columns and value.index.name != ITEMID:
+                value = value.set_index(ITEMID)
+            if value.index.name != ITEMID:
+                value.index.rename(ITEMID, inplace=True)
+            missing_item_ids = self.item_ids.difference(value.index)
+            if len(missing_item_ids) > 0:
+                raise ValueError(
+                    "Following item_ids are missing from the index of static_features: "
+                    f"{reprlib.repr(missing_item_ids.to_list())}"
+                )
+            # if provided static features are a strict superset of the item index, we take a subset to ensure consistency
+            if len(value.index.difference(self.item_ids)) > 0:
+                value = value.reindex(self.item_ids)
+
+        self._static_features = value
+
+    def infer_frequency(
+        self, num_items: Optional[int] = None, raise_if_irregular: bool = False
+    ) -> str:
+        """Infer the time series frequency based on the timestamps of the observations.
+
+        Parameters
+        ----------
+        num_items : int or None, default = None
+            Number of items (individual time series) randomly selected to infer the frequency. Lower values speed up
+            the method, but increase the chance that some items with invalid frequency are missed by subsampling.
+
+            If set to `None`, all items will be used for inferring the frequency.
+        raise_if_irregular : bool, default = False
+            If True, an exception will be raised if some items have an irregular frequency, or if different items have
+            different frequencies.
+
+        Returns
+        -------
+        freq : str
+            If all time series have a regular frequency, returns a pandas-compatible `frequency alias <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
+
+            If some items have an irregular frequency or if different items have different frequencies, returns string
+            `IRREG`.
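+
+        Examples
+        --------
+        An illustrative sketch (hypothetical daily data)::
+
+            ts_df.infer_frequency()              # "D" if all items are sampled daily
+            ts_df.infer_frequency(num_items=5)   # faster, checks only 5 randomly chosen items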
+        """
+        ts_df = self
+        if num_items is not None and ts_df.num_items > num_items:
+            items_subset = ts_df.item_ids.to_series().sample(
+                n=num_items, random_state=123
+            )
+            ts_df = ts_df.loc[items_subset]
+
+        if not ts_df.index.is_monotonic_increasing:
+            ts_df = ts_df.sort_index()
+
+        indptr = ts_df.get_indptr()
+        item_ids = ts_df.item_ids
+        timestamps = ts_df.index.get_level_values(level=1)
+        candidate_freq = ts_df.index.levels[1].freq
+
+        frequencies = []
+        irregular_items = []
+        for i in range(len(indptr) - 1):
+            start, end = indptr[i], indptr[i + 1]
+            item_timestamps = timestamps[start:end]
+            inferred_freq = item_timestamps.inferred_freq
+
+            # Fallback option: maybe original index has a `freq` attribute that pandas fails to infer (e.g., 'SME')
+            if inferred_freq is None and candidate_freq is not None:
+                try:
+                    # If this line does not raise an exception, then candidate_freq is a compatible frequency
+                    item_timestamps.freq = candidate_freq
+                except ValueError:
+                    inferred_freq = None
+                else:
+                    inferred_freq = candidate_freq.freqstr
+
+            if inferred_freq is None:
+                irregular_items.append(item_ids[i])
+            else:
+                frequencies.append(inferred_freq)
+
+        unique_freqs = list(set(frequencies))
+        if len(unique_freqs) != 1 or len(irregular_items) > 0:
+            if raise_if_irregular:
+                if irregular_items:
+                    raise ValueError(
+                        f"Cannot infer frequency. Items with irregular frequency: {reprlib.repr(irregular_items)}"
+                    )
+                else:
+                    raise ValueError(
+                        f"Cannot infer frequency. Multiple frequencies detected: {unique_freqs}"
+                    )
+            else:
+                return IRREGULAR_TIME_INDEX_FREQSTR
+        else:
+            return pd.tseries.frequencies.to_offset(unique_freqs[0]).freqstr
+
+    @property
+    def freq(self):
+        """Inferred pandas-compatible frequency of the timestamps in the dataframe.
+
+        Computed using a random subset of the time series for speed. This may sometimes result in incorrectly inferred
+        values. For reliable results, use :meth:`~autogluon.timeseries.TimeSeriesDataFrame.infer_frequency`.
+        """
+        inferred_freq = self.infer_frequency(num_items=50)
+        return None if inferred_freq == IRREGULAR_TIME_INDEX_FREQSTR else inferred_freq
+
+    @property
+    def num_items(self):
+        """Number of items (time series) in the data set."""
+        return len(self.item_ids)
+
+    def num_timesteps_per_item(self) -> pd.Series:
+        """Number of observations in each time series in the dataframe.
+
+        Returns a `pandas.Series` with item_id as index and number of observations per item as values.
+        """
+        counts = pd.Series(self.index.codes[0]).value_counts(sort=False)
+        counts.index = self.index.levels[0][counts.index]
+        return counts
+
+    def copy(self: TimeSeriesDataFrame, deep: bool = True) -> TimeSeriesDataFrame:
+        """Make a copy of the TimeSeriesDataFrame.
+
+        When ``deep=True`` (default), a new object will be created with a copy of the calling object's data and
+        indices. Modifications to the data or indices of the copy will not be reflected in the original object.
+
+        When ``deep=False``, a new object will be created without copying the calling object's data or index (only
+        references to the data and index are copied). Any changes to the data of the original will be reflected in the
+        shallow copy (and vice versa).
+
+        For more details, see `pandas documentation <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html>`_.
+        """
+        obj = super().copy(deep=deep)
+
+        # also perform a deep copy for static features
+        if deep:
+            for k in obj._metadata:
+                setattr(obj, k, copy.deepcopy(getattr(obj, k)))
+        return obj
+
+    def __finalize__(  # noqa
+        self: TimeSeriesDataFrame, other, method: Optional[str] = None, **kwargs
+    ) -> TimeSeriesDataFrame:
+        super().__finalize__(other=other, method=method, **kwargs)
+        # when finalizing the copy/slice operation, we use the property setter to stay consistent
+        # with the item index
+        if hasattr(other, "_static_features"):
+            self.static_features = other._static_features
+        return self
+
+    def split_by_time(
+        self, cutoff_time: pd.Timestamp
+    ) -> Tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
+        """Split dataframe into two different ``TimeSeriesDataFrame`` objects before and after a certain ``cutoff_time``.
+
+        Parameters
+        ----------
+        cutoff_time: pd.Timestamp
+            The time to split the current dataframe into two dataframes.
+
+        Returns
+        -------
+        data_before: TimeSeriesDataFrame
+            Data frame containing time series before the ``cutoff_time`` (excluding ``cutoff_time``).
+        data_after: TimeSeriesDataFrame
+            Data frame containing time series after the ``cutoff_time`` (including ``cutoff_time``).
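+
+        Examples
+        --------
+        A minimal sketch (``ts_df`` is a hypothetical daily dataframe)::
+
+            before, after = ts_df.split_by_time(pd.Timestamp("2019-01-03"))
+            # `before` holds timestamps < 2019-01-03, `after` holds timestamps >= 2019-01-03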
+        """
+
+        nanosecond_before_cutoff = cutoff_time - pd.Timedelta(nanoseconds=1)
+        data_before = self.loc[(slice(None), slice(None, nanosecond_before_cutoff)), :]
+        data_after = self.loc[(slice(None), slice(cutoff_time, None)), :]
+        before = TimeSeriesDataFrame(data_before, static_features=self.static_features)
+        after = TimeSeriesDataFrame(data_after, static_features=self.static_features)
+        return before, after
+
+    def slice_by_timestep(
+        self, start_index: Optional[int] = None, end_index: Optional[int] = None
+    ) -> TimeSeriesDataFrame:
+        """Select a subsequence from each time series between start (inclusive) and end (exclusive) indices.
+
+        This operation is equivalent to selecting a slice ``[start_index : end_index]`` from each time series, and then
+        combining these slices into a new ``TimeSeriesDataFrame``. See examples below.
+
+        It is recommended to sort the index with `ts_df.sort_index()` before calling this method to take advantage of
+        a fast optimized algorithm.
+
+        Parameters
+        ----------
+        start_index : int or None
+            Start index (inclusive) of the slice for each time series.
+            Negative values are counted from the end of each time series.
+            When set to None, the slice starts from the beginning of each time series.
+        end_index : int or None
+            End index (exclusive) of the slice for each time series.
+            Negative values are counted from the end of each time series.
+            When set to None, the slice includes the end of each time series.
+
+        Returns
+        -------
+        ts_df : TimeSeriesDataFrame
+            A new time series dataframe containing entries of the original time series between start and end indices.
+
+        Examples
+        --------
+        >>> ts_df
+                            target
+        item_id timestamp
+        0       2019-01-01       0
+                2019-01-02       1
+                2019-01-03       2
+        1       2019-01-02       3
+                2019-01-03       4
+                2019-01-04       5
+        2       2019-01-03       6
+                2019-01-04       7
+                2019-01-05       8
+
+        Select the first entry of each time series
+
+        >>> ts_df.slice_by_timestep(0, 1)
+                            target
+        item_id timestamp
+        0       2019-01-01       0
+        1       2019-01-02       3
+        2       2019-01-03       6
+
+        Select the last 2 entries of each time series
+
+        >>> ts_df.slice_by_timestep(-2, None)
+                            target
+        item_id timestamp
+        0       2019-01-02       1
+                2019-01-03       2
+        1       2019-01-03       4
+                2019-01-04       5
+        2       2019-01-04       7
+                2019-01-05       8
+
+        Select all except the last entry of each time series
+
+        >>> ts_df.slice_by_timestep(None, -1)
+                            target
+        item_id timestamp
+        0       2019-01-01       0
+                2019-01-02       1
+        1       2019-01-02       3
+                2019-01-03       4
+        2       2019-01-03       6
+                2019-01-04       7
+
+        Copy the entire dataframe
+
+        >>> ts_df.slice_by_timestep(None, None)
+                            target
+        item_id timestamp
+        0       2019-01-01       0
+                2019-01-02       1
+                2019-01-03       2
+        1       2019-01-02       3
+                2019-01-03       4
+                2019-01-04       5
+        2       2019-01-03       6
+                2019-01-04       7
+                2019-01-05       8
+
+        """
+        if start_index is not None and not isinstance(start_index, int):
+            raise ValueError(
+                f"start_index must be of type int or None (got {type(start_index)})"
+            )
+        if end_index is not None and not isinstance(end_index, int):
+            raise ValueError(
+                f"end_index must be of type int or None (got {type(end_index)})"
+            )
+
+        if start_index is None and end_index is None:
+            # Return a copy to avoid in-place modification.
+            # self.copy() is much faster than self.loc[ones(len(self), dtype=bool)]
+            return self.copy()
+
+        if self.index.is_monotonic_increasing:
+            # Use a fast optimized algorithm if the index is sorted
+            indptr = self.get_indptr()
+            lengths = np.diff(indptr)
+            starts = indptr[:-1]
+
+            slice_start = (
+                np.zeros_like(lengths)
+                if start_index is None
+                else np.clip(
+                    np.where(start_index >= 0, start_index, lengths + start_index),
+                    0,
+                    lengths,
+                )
+            )
+            slice_end = (
+                lengths.copy()
+                if end_index is None
+                else np.clip(
+                    np.where(end_index >= 0, end_index, lengths + end_index), 0, lengths
+                )
+            )
+
+            # Filter out invalid slices where start >= end
+            valid_slices = slice_start < slice_end
+            if not np.any(valid_slices):
+                # Return empty dataframe with same structure
+                return self.loc[np.zeros(len(self), dtype=bool)]
+
+            starts = starts[valid_slices]
+            slice_start = slice_start[valid_slices]
+            slice_end = slice_end[valid_slices]
+
+            # We put 1 at the slice_start index for each item and -1 at the slice_end index for each item.
+            # After we apply cumsum we get the indicator mask selecting values between slice_start and slice_end
+            # cumsum([0, 0, 1, 0, 0, -1, 0]) -> [0, 0, 1, 1, 1, 0, 0]
+            # We need array of size len(self) + 1 in case events[starts + slice_end] tries to access position len(self)
+            events = np.zeros(len(self) + 1, dtype=np.int8)
+            events[starts + slice_start] += 1
+            events[starts + slice_end] -= 1
+            mask = np.cumsum(events)[:-1].astype(bool)
+            # loc[mask] returns a view of the original data - modifying it will produce a SettingWithCopyWarning
+            return self.loc[mask]
+        else:
+            # Fall back to a slow groupby operation
+            result = self.groupby(level=ITEMID, sort=False, as_index=False).nth(
+                slice(start_index, end_index)
+            )
+            result.static_features = self.static_features
+            return result
+
+    def slice_by_time(
+        self, start_time: pd.Timestamp, end_time: pd.Timestamp
+    ) -> TimeSeriesDataFrame:
+        """Select a subsequence from each time series between start (inclusive) and end (exclusive) timestamps.
+
+        Parameters
+        ----------
+        start_time: pd.Timestamp
+            Start time (inclusive) of the slice for each time series.
+        end_time: pd.Timestamp
+            End time (exclusive) of the slice for each time series.
+
+        Returns
+        -------
+        ts_df : TimeSeriesDataFrame
+            A new time series dataframe containing entries of the original time series between start and end timestamps.
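+
+        Examples
+        --------
+        A minimal sketch (``ts_df`` is a hypothetical daily dataframe)::
+
+            ts_df.slice_by_time(pd.Timestamp("2019-01-02"), pd.Timestamp("2019-01-04"))
+            # keeps the observations on 2019-01-02 and 2019-01-03 for every item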
+        """
+
+        if end_time < start_time:
+            raise ValueError(
+                f"end_time {end_time} is earlier than start_time {start_time}"
+            )
+
+        nanosecond_before_end_time = end_time - pd.Timedelta(nanoseconds=1)
+        return TimeSeriesDataFrame(
+            self.loc[(slice(None), slice(start_time, nanosecond_before_end_time)), :],
+            static_features=self.static_features,
+        )
+
+    @classmethod
+    def from_pickle(cls, filepath_or_buffer: Any) -> TimeSeriesDataFrame:
+        """Convenience method to read pickled time series dataframes. If the pickled
+        object is a plain pandas DataFrame, it will be cast to a TimeSeriesDataFrame.
+
+        Parameters
+        ----------
+        filepath_or_buffer: Any
+            Filename provided as a string or an ``IOBuffer`` containing the pickled object.
+
+        Returns
+        -------
+        ts_df : TimeSeriesDataFrame
+            The pickled time series dataframe.
+        """
+        try:
+            data = pd.read_pickle(filepath_or_buffer)
+            return data if isinstance(data, cls) else cls(data)
+        except Exception as err:  # noqa
+            raise IOError(f"Could not load pickled data set due to error: {str(err)}")
+
+    def fill_missing_values(
+        self, method: str = "auto", value: float = 0.0
+    ) -> TimeSeriesDataFrame:
+        """Fill missing values represented by NaN.
+
+        .. note::
+            This method assumes that the index of the TimeSeriesDataFrame is sorted by [item_id, timestamp].
+
+            If the index is not sorted, this method will log a warning and may produce an incorrect result.
+
+        Parameters
+        ----------
+        method : str, default = "auto"
+            Method used to impute missing values.
+
+            - "auto" - first forward fill (to fill the in-between and trailing NaNs), then backward fill (to fill the leading NaNs)
+            - "ffill" or "pad" - propagate last valid observation forward. Note: missing values at the start of the time series are not filled.
+            - "bfill" or "backfill" - use next valid observation to fill gap. Note: this may result in information leakage; missing values at the end of the time series are not filled.
+            - "constant" - replace NaNs with the given constant ``value``.
+            - "interpolate" - fill NaN values using linear interpolation. Note: this may result in information leakage.
+        value : float, default = 0.0
+            Value used by the "constant" imputation method.
+
+        Examples
+        --------
+        >>> ts_df
+                            target
+        item_id timestamp
+        0       2019-01-01     NaN
+                2019-01-02     NaN
+                2019-01-03     1.0
+                2019-01-04     NaN
+                2019-01-05     NaN
+                2019-01-06     2.0
+                2019-01-07     NaN
+        1       2019-02-04     NaN
+                2019-02-05     3.0
+                2019-02-06     NaN
+                2019-02-07     4.0
+
+        >>> ts_df.fill_missing_values(method="auto")
+                            target
+        item_id timestamp
+        0       2019-01-01     1.0
+                2019-01-02     1.0
+                2019-01-03     1.0
+                2019-01-04     1.0
+                2019-01-05     1.0
+                2019-01-06     2.0
+                2019-01-07     2.0
+        1       2019-02-04     3.0
+                2019-02-05     3.0
+                2019-02-06     3.0
+                2019-02-07     4.0
+
+        """
+        # Convert to pd.DataFrame for faster processing
+        df = pd.DataFrame(self)
+
+        # Skip filling if there are no NaNs
+        if not df.isna().any(axis=None):
+            return self
+
+        if not self.index.is_monotonic_increasing:
+            logger.warning(
+                "Trying to fill missing values in an unsorted dataframe. "
+                "It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`"
+            )
+
+        grouped_df = df.groupby(level=ITEMID, sort=False, group_keys=False)
+        if method == "auto":
+            filled_df = grouped_df.ffill()
+            # If necessary, fill missing values at the start of each time series with bfill
+            if filled_df.isna().any(axis=None):
+                filled_df = filled_df.groupby(
+                    level=ITEMID, sort=False, group_keys=False
+                ).bfill()
+        elif method in ["ffill", "pad"]:
+            filled_df = grouped_df.ffill()
+        elif method in ["bfill", "backfill"]:
+            filled_df = grouped_df.bfill()
+        elif method == "constant":
+            filled_df = self.fillna(value=value)
+        elif method == "interpolate":
+            filled_df = grouped_df.apply(lambda ts: ts.interpolate())
+        else:
+            raise ValueError(
+                "Invalid fill method. Expecting one of "
+                "{'auto', 'ffill', 'pad', 'bfill', 'backfill', 'constant', 'interpolate'}. "
+                f"Got {method}"
+            )
+        return TimeSeriesDataFrame(filled_df, static_features=self.static_features)
+
+    def dropna(self, how: str = "any") -> TimeSeriesDataFrame:  # type: ignore[override]
+        """Drop rows containing NaNs.
+
+        Parameters
+        ----------
+        how : {"any", "all"}, default = "any"
+            Determines whether a row is removed from the TimeSeriesDataFrame when it contains at least one NaN
+            or only NaNs.
+
+            - "any" : If any NaN values are present, drop that row.
+            - "all" : If all values are NaN, drop that row.
+        """
+        # We need to cast to a DataFrame first. Calling self.dropna() results in an exception because self.T
+        # (used inside dropna) is not supported for TimeSeriesDataFrame
+        dropped_df = pd.DataFrame(self).dropna(how=how)
+        return TimeSeriesDataFrame(dropped_df, static_features=self.static_features)
+
+    # added for static type checker compatibility
+    def assign(self, **kwargs) -> TimeSeriesDataFrame:
+        """Assign new columns to the time series dataframe. See :meth:`pandas.DataFrame.assign` for details."""
+        return super().assign(**kwargs)  # type: ignore
+
+    # added for static type checker compatibility
+    def sort_index(self, *args, **kwargs) -> TimeSeriesDataFrame:
+        return super().sort_index(*args, **kwargs)  # type: ignore
+
+    def get_model_inputs_for_scoring(
+        self, prediction_length: int, known_covariates_names: Optional[List[str]] = None
+    ) -> Tuple[TimeSeriesDataFrame, Optional[TimeSeriesDataFrame]]:
+        """Prepare model inputs necessary to predict the last ``prediction_length`` time steps of each time series in the dataset.
+
+        Parameters
+        ----------
+        prediction_length : int
+            The forecast horizon, i.e., how many time steps into the future must be predicted.
+        known_covariates_names : List[str], optional
+            Names of the dataframe columns that contain covariates known in the future.
+            See :attr:`known_covariates_names` of :class:`~autogluon.timeseries.TimeSeriesPredictor` for more details.
+
+        Returns
+        -------
+        past_data : TimeSeriesDataFrame
+            Data, where the last ``prediction_length`` time steps have been removed from the end of each time series.
+        known_covariates : TimeSeriesDataFrame or None
+            If ``known_covariates_names`` was provided, dataframe with the values of the known covariates during the
+            forecast horizon. Otherwise, ``None``.
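+
+        Examples
+        --------
+        A minimal sketch (hypothetical dataframe with a known covariate ``promo``)::
+
+            past_data, known_covariates = ts_df.get_model_inputs_for_scoring(
+                prediction_length=7, known_covariates_names=["promo"]
+            )
+            # past_data drops the last 7 steps of each item;
+            # known_covariates holds the "promo" column for those 7 steps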
+        """
+        past_data = self.slice_by_timestep(None, -prediction_length)
+        if known_covariates_names is not None and len(known_covariates_names) > 0:
+            future_data = self.slice_by_timestep(-prediction_length, None)
+            known_covariates = future_data[known_covariates_names]
+        else:
+            known_covariates = None
+        return past_data, known_covariates
+
+    def train_test_split(
+        self,
+        prediction_length: int,
+        end_index: Optional[int] = None,
+        suffix: Optional[str] = None,
+    ) -> Tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
+        """Generate a train/test split from the given dataset.
+
+        This method can be used to generate splits for multi-window backtesting.
+
+        .. note::
+            This method automatically sorts the TimeSeriesDataFrame by [item_id, timestamp].
+
+        Parameters
+        ----------
+        prediction_length : int
+            Number of time steps in a single evaluation window.
+        end_index : int, optional
+            If given, all time series will be shortened up to ``end_index`` before the train/test splitting. In other
+            words, test data will include the slice ``[:end_index]`` of each time series, and train data will include
+            the slice ``[:end_index - prediction_length]``.
+        suffix : str, optional
+            Suffix appended to all entries in the ``item_id`` index level.
+
+        Returns
+        -------
+        train_data : TimeSeriesDataFrame
+            Train portion of the data. Contains the slice ``[:-prediction_length]`` of each time series in ``test_data``.
+        test_data : TimeSeriesDataFrame
+            Test portion of the data. Contains the slice ``[:end_index]`` of each time series in the original dataset.
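+
+        Examples
+        --------
+        A sketch of a two-window backtest (hypothetical horizon of 7 steps)::
+
+            train1, test1 = ts_df.train_test_split(prediction_length=7)
+            train2, test2 = ts_df.train_test_split(prediction_length=7, end_index=-7)
+            # the second window is shifted back by 7 steps relative to the first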
+        """
+        df = self
+        if not df.index.is_monotonic_increasing:
+            logger.warning(
+                "Sorting the dataframe index before generating the train/test split."
+            )
+            df = df.sort_index()
+        test_data = df.slice_by_timestep(None, end_index)
+        train_data = test_data.slice_by_timestep(None, -prediction_length)
+
+        if suffix is not None:
+            for data in [train_data, test_data]:
+                new_item_id = data.index.levels[0].astype(str) + suffix
+                data.index = data.index.set_levels(levels=new_item_id, level=0)
+                if data.static_features is not None:
+                    data.static_features.index = data.static_features.index.astype(str)
+                    data.static_features.index += suffix
+        return train_data, test_data
+
+    def convert_frequency(
+        self,
+        freq: Union[str, pd.DateOffset],
+        agg_numeric: str = "mean",
+        agg_categorical: str = "first",
+        num_cpus: int = -1,
+        chunk_size: int = 100,
+        **kwargs,
+    ) -> TimeSeriesDataFrame:
+        """Convert each time series in the dataframe to the given frequency.
+
+        This method is useful for two purposes:
+
+        1. Converting an irregularly-sampled time series to a regular time index.
+        2. Aggregating time series data by downsampling (e.g., convert daily sales into weekly sales).
+
+        Standard ``df.groupby(...).resample(...)`` can be extremely slow for large datasets, so we parallelize this
+        operation across multiple CPU cores.
+
+        Parameters
+        ----------
+        freq : Union[str, pd.DateOffset]
+            Frequency to which the data should be converted. See `pandas frequency aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
+            for supported values.
+        agg_numeric : {"max", "min", "sum", "mean", "median", "first", "last"}, default = "mean"
+            Aggregation method applied to numeric columns.
+        agg_categorical : {"first", "last"}, default = "first"
+            Aggregation method applied to categorical columns.
+        num_cpus : int, default = -1
+            Number of CPU cores used when resampling in parallel. Set to -1 to use all cores.
+        chunk_size : int, default = 100
+            Number of time series in a chunk assigned to each parallel worker.
+        **kwargs
+            Additional keyword arguments that will be passed to ``pandas.DataFrameGroupBy.resample``.
+
+        Returns
+        -------
+        ts_df : TimeSeriesDataFrame
+            A new time series dataframe with time series resampled at the new frequency. Output may contain missing
+            values represented by ``NaN`` if original data does not have information for the given period.
+
+        Examples
+        --------
+        Convert irregularly-sampled time series data to a regular index
+
+        >>> ts_df
+                            target
+        item_id timestamp
+        0       2019-01-01     NaN
+                2019-01-03     1.0
+                2019-01-06     2.0
+                2019-01-07     NaN
+        1       2019-02-04     3.0
+                2019-02-07     4.0
+        >>> ts_df.convert_frequency(freq="D")
+                            target
+        item_id timestamp
+        0       2019-01-01     NaN
+                2019-01-02     NaN
+                2019-01-03     1.0
+                2019-01-04     NaN
+                2019-01-05     NaN
+                2019-01-06     2.0
+                2019-01-07     NaN
+        1       2019-02-04     3.0
+                2019-02-05     NaN
+                2019-02-06     NaN
+                2019-02-07     4.0
+
+        Downsample quarterly data to yearly frequency
+
+        >>> ts_df
+                            target
+        item_id timestamp
+        0       2020-03-31     1.0
+                2020-06-30     2.0
+                2020-09-30     3.0
+                2020-12-31     4.0
+                2021-03-31     5.0
+                2021-06-30     6.0
+                2021-09-30     7.0
+                2021-12-31     8.0
+        >>> ts_df.convert_frequency("YE")
+                            target
+        item_id timestamp
+        0       2020-12-31     2.5
+                2021-12-31     6.5
+        >>> ts_df.convert_frequency("YE", agg_numeric="sum")
+                            target
+        item_id timestamp
+        0       2020-12-31    10.0
+                2021-12-31    26.0
+        """
+        offset = pd.tseries.frequencies.to_offset(freq)
+
+        # We need to aggregate categorical columns separately because .agg("mean") deletes all non-numeric columns
+        aggregation = {}
+        for col in self.columns:
+            if pd.api.types.is_numeric_dtype(self.dtypes[col]):
+                aggregation[col] = agg_numeric
+            else:
+                aggregation[col] = agg_categorical
+
+        def split_into_chunks(iterable: Iterable, size: int) -> Iterable[Iterable]:
+            # Based on https://stackoverflow.com/a/22045226/5497447
+            iterable = iter(iterable)
+            return iter(lambda: tuple(islice(iterable, size)), ())
+
+        def resample_chunk(chunk: Iterable[Tuple[str, pd.DataFrame]]) -> pd.DataFrame:
+            resampled_dfs = []
+            for item_id, df in chunk:
+                resampled_df = df.resample(offset, level=TIMESTAMP, **kwargs).agg(
+                    aggregation
+                )
+                resampled_dfs.append(pd.concat({item_id: resampled_df}, names=[ITEMID]))
+            return pd.concat(resampled_dfs)
+
+        # Resampling time for 1 item < overhead time for a single parallel job. Therefore, we group items into chunks
+        # so that the speedup from parallelization isn't dominated by the communication costs.
+        df = pd.DataFrame(self)
+        # Make sure that timestamp index has dtype 'datetime64[ns]', otherwise index may contain NaT values.
+        # See https://github.com/autogluon/autogluon/issues/4917
+        df.index = df.index.set_levels(
+            df.index.levels[1].astype("datetime64[ns]"), level=TIMESTAMP
+        )
+        chunks = split_into_chunks(df.groupby(level=ITEMID, sort=False), chunk_size)
+        resampled_chunks = Parallel(n_jobs=num_cpus)(
+            delayed(resample_chunk)(chunk) for chunk in chunks
+        )
+        resampled_df = TimeSeriesDataFrame(pd.concat(resampled_chunks))
+        resampled_df.static_features = self.static_features
+        return resampled_df
+
+    def to_data_frame(self) -> pd.DataFrame:
+        """Convert `TimeSeriesDataFrame` to a `pandas.DataFrame`"""
+        return pd.DataFrame(self)
+
+    def get_indptr(self) -> np.ndarray:
+        """[Advanced] Get a numpy array of shape [num_items + 1] that points to the start and end of each time series.
+
+        This method assumes that the TimeSeriesDataFrame is sorted by [item_id, timestamp].
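+
+        Examples
+        --------
+        An illustrative sketch: for two items with 3 and 2 observations, the result is ``[0, 3, 5]``,
+        so item ``i`` occupies rows ``indptr[i]:indptr[i + 1]``::
+
+            indptr = ts_df.get_indptr()  # e.g., array([0, 3, 5], dtype=int32)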
+        """
+        return np.concatenate(
+            [[0], np.cumsum(self.num_timesteps_per_item().to_numpy())]
+        ).astype(np.int32)
+
+    # inline typing stubs for various overridden methods
+    if TYPE_CHECKING:
+
+        def query(  # type: ignore
+            self, expr: str, *, inplace: bool = False, **kwargs
+        ) -> Self: ...
+
+        def reindex(*args, **kwargs) -> Self: ...  # type: ignore
+
+        @overload
+        def __new__(
+            cls, data: pd.DataFrame, static_features: Optional[pd.DataFrame] = None
+        ) -> Self: ...  # type: ignore
+
+        @overload
+        def __getitem__(self, items: List[str]) -> Self: ...  # type: ignore
+        @overload
+        def __getitem__(self, item: str) -> pd.Series: ...  # type: ignore