autogluon.timeseries 1.0.1b20240304__py3-none-any.whl → 1.4.1b20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of autogluon.timeseries might be problematic. More details are available on the original release page.

Files changed (108)
  1. autogluon/timeseries/configs/__init__.py +3 -2
  2. autogluon/timeseries/configs/hyperparameter_presets.py +62 -0
  3. autogluon/timeseries/configs/predictor_presets.py +84 -0
  4. autogluon/timeseries/dataset/ts_dataframe.py +339 -186
  5. autogluon/timeseries/learner.py +192 -60
  6. autogluon/timeseries/metrics/__init__.py +55 -11
  7. autogluon/timeseries/metrics/abstract.py +96 -25
  8. autogluon/timeseries/metrics/point.py +186 -39
  9. autogluon/timeseries/metrics/quantile.py +47 -20
  10. autogluon/timeseries/metrics/utils.py +6 -6
  11. autogluon/timeseries/models/__init__.py +13 -7
  12. autogluon/timeseries/models/abstract/__init__.py +2 -2
  13. autogluon/timeseries/models/abstract/abstract_timeseries_model.py +533 -273
  14. autogluon/timeseries/models/abstract/model_trial.py +10 -10
  15. autogluon/timeseries/models/abstract/tunable.py +189 -0
  16. autogluon/timeseries/models/autogluon_tabular/__init__.py +2 -0
  17. autogluon/timeseries/models/autogluon_tabular/mlforecast.py +369 -215
  18. autogluon/timeseries/models/autogluon_tabular/per_step.py +513 -0
  19. autogluon/timeseries/models/autogluon_tabular/transforms.py +67 -0
  20. autogluon/timeseries/models/autogluon_tabular/utils.py +3 -51
  21. autogluon/timeseries/models/chronos/__init__.py +4 -0
  22. autogluon/timeseries/models/chronos/chronos2.py +361 -0
  23. autogluon/timeseries/models/chronos/model.py +738 -0
  24. autogluon/timeseries/models/chronos/utils.py +369 -0
  25. autogluon/timeseries/models/ensemble/__init__.py +35 -2
  26. autogluon/timeseries/models/ensemble/{abstract_timeseries_ensemble.py → abstract.py} +50 -26
  27. autogluon/timeseries/models/ensemble/array_based/__init__.py +3 -0
  28. autogluon/timeseries/models/ensemble/array_based/abstract.py +236 -0
  29. autogluon/timeseries/models/ensemble/array_based/models.py +73 -0
  30. autogluon/timeseries/models/ensemble/array_based/regressor/__init__.py +12 -0
  31. autogluon/timeseries/models/ensemble/array_based/regressor/abstract.py +88 -0
  32. autogluon/timeseries/models/ensemble/array_based/regressor/linear_stacker.py +167 -0
  33. autogluon/timeseries/models/ensemble/array_based/regressor/per_quantile_tabular.py +94 -0
  34. autogluon/timeseries/models/ensemble/array_based/regressor/tabular.py +107 -0
  35. autogluon/timeseries/models/ensemble/ensemble_selection.py +167 -0
  36. autogluon/timeseries/models/ensemble/per_item_greedy.py +162 -0
  37. autogluon/timeseries/models/ensemble/weighted/__init__.py +8 -0
  38. autogluon/timeseries/models/ensemble/weighted/abstract.py +40 -0
  39. autogluon/timeseries/models/ensemble/weighted/basic.py +78 -0
  40. autogluon/timeseries/models/ensemble/weighted/greedy.py +57 -0
  41. autogluon/timeseries/models/gluonts/__init__.py +3 -1
  42. autogluon/timeseries/models/gluonts/abstract.py +583 -0
  43. autogluon/timeseries/models/gluonts/dataset.py +109 -0
  44. autogluon/timeseries/models/gluonts/{torch/models.py → models.py} +185 -44
  45. autogluon/timeseries/models/local/__init__.py +1 -10
  46. autogluon/timeseries/models/local/abstract_local_model.py +150 -97
  47. autogluon/timeseries/models/local/naive.py +31 -23
  48. autogluon/timeseries/models/local/npts.py +6 -2
  49. autogluon/timeseries/models/local/statsforecast.py +99 -112
  50. autogluon/timeseries/models/multi_window/multi_window_model.py +99 -40
  51. autogluon/timeseries/models/registry.py +64 -0
  52. autogluon/timeseries/models/toto/__init__.py +3 -0
  53. autogluon/timeseries/models/toto/_internal/__init__.py +9 -0
  54. autogluon/timeseries/models/toto/_internal/backbone/__init__.py +3 -0
  55. autogluon/timeseries/models/toto/_internal/backbone/attention.py +196 -0
  56. autogluon/timeseries/models/toto/_internal/backbone/backbone.py +262 -0
  57. autogluon/timeseries/models/toto/_internal/backbone/distribution.py +70 -0
  58. autogluon/timeseries/models/toto/_internal/backbone/kvcache.py +136 -0
  59. autogluon/timeseries/models/toto/_internal/backbone/rope.py +89 -0
  60. autogluon/timeseries/models/toto/_internal/backbone/rotary_embedding_torch.py +342 -0
  61. autogluon/timeseries/models/toto/_internal/backbone/scaler.py +305 -0
  62. autogluon/timeseries/models/toto/_internal/backbone/transformer.py +333 -0
  63. autogluon/timeseries/models/toto/_internal/dataset.py +165 -0
  64. autogluon/timeseries/models/toto/_internal/forecaster.py +423 -0
  65. autogluon/timeseries/models/toto/dataloader.py +108 -0
  66. autogluon/timeseries/models/toto/hf_pretrained_model.py +118 -0
  67. autogluon/timeseries/models/toto/model.py +236 -0
  68. autogluon/timeseries/predictor.py +826 -305
  69. autogluon/timeseries/regressor.py +253 -0
  70. autogluon/timeseries/splitter.py +10 -31
  71. autogluon/timeseries/trainer/__init__.py +2 -3
  72. autogluon/timeseries/trainer/ensemble_composer.py +439 -0
  73. autogluon/timeseries/trainer/model_set_builder.py +256 -0
  74. autogluon/timeseries/trainer/prediction_cache.py +149 -0
  75. autogluon/timeseries/trainer/trainer.py +1298 -0
  76. autogluon/timeseries/trainer/utils.py +17 -0
  77. autogluon/timeseries/transforms/__init__.py +2 -0
  78. autogluon/timeseries/transforms/covariate_scaler.py +164 -0
  79. autogluon/timeseries/transforms/target_scaler.py +149 -0
  80. autogluon/timeseries/utils/constants.py +10 -0
  81. autogluon/timeseries/utils/datetime/base.py +38 -20
  82. autogluon/timeseries/utils/datetime/lags.py +18 -16
  83. autogluon/timeseries/utils/datetime/seasonality.py +14 -14
  84. autogluon/timeseries/utils/datetime/time_features.py +17 -14
  85. autogluon/timeseries/utils/features.py +317 -53
  86. autogluon/timeseries/utils/forecast.py +31 -17
  87. autogluon/timeseries/utils/timer.py +173 -0
  88. autogluon/timeseries/utils/warning_filters.py +44 -6
  89. autogluon/timeseries/version.py +2 -1
  90. autogluon.timeseries-1.4.1b20251210-py3.11-nspkg.pth +1 -0
  91. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/METADATA +71 -47
  92. autogluon_timeseries-1.4.1b20251210.dist-info/RECORD +103 -0
  93. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/WHEEL +1 -1
  94. autogluon/timeseries/configs/presets_configs.py +0 -11
  95. autogluon/timeseries/evaluator.py +0 -6
  96. autogluon/timeseries/models/ensemble/greedy_ensemble.py +0 -170
  97. autogluon/timeseries/models/gluonts/abstract_gluonts.py +0 -550
  98. autogluon/timeseries/models/gluonts/torch/__init__.py +0 -0
  99. autogluon/timeseries/models/presets.py +0 -325
  100. autogluon/timeseries/trainer/abstract_trainer.py +0 -1144
  101. autogluon/timeseries/trainer/auto_trainer.py +0 -74
  102. autogluon.timeseries-1.0.1b20240304-py3.8-nspkg.pth +0 -1
  103. autogluon.timeseries-1.0.1b20240304.dist-info/RECORD +0 -58
  104. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/LICENSE +0 -0
  105. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/NOTICE +0 -0
  106. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/namespace_packages.txt +0 -0
  107. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/top_level.txt +0 -0
  108. {autogluon.timeseries-1.0.1b20240304.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/zip-safe +0 -0
@@ -1,8 +1,10 @@
1
1
  import logging
2
2
  import reprlib
3
- from dataclasses import dataclass, field
4
- from typing import List, Optional, Tuple
3
+ import time
4
+ from dataclasses import asdict, dataclass, field
5
+ from typing import Any, Literal
5
6
 
7
+ import numpy as np
6
8
  import pandas as pd
7
9
 
8
10
  from autogluon.common.features.types import R_FLOAT, R_INT
@@ -12,7 +14,8 @@ from autogluon.features.generators import (
12
14
  IdentityFeatureGenerator,
13
15
  PipelineFeatureGenerator,
14
16
  )
15
- from autogluon.timeseries import TimeSeriesDataFrame
17
+ from autogluon.timeseries.dataset import TimeSeriesDataFrame
18
+ from autogluon.timeseries.utils.warning_filters import warning_filter
16
19
 
17
20
  logger = logging.getLogger(__name__)
18
21
 
@@ -21,18 +24,60 @@ logger = logging.getLogger(__name__)
21
24
  class CovariateMetadata:
22
25
  """Provides mapping from different covariate types to columns in the dataset."""
23
26
 
24
- static_features_cat: List[str] = field(default_factory=list)
25
- static_features_real: List[str] = field(default_factory=list)
26
- known_covariates_real: List[str] = field(default_factory=list)
27
- known_covariates_cat: List[str] = field(default_factory=list)
28
- past_covariates_real: List[str] = field(default_factory=list)
29
- past_covariates_cat: List[str] = field(default_factory=list)
27
+ static_features_cat: list[str] = field(default_factory=list)
28
+ static_features_real: list[str] = field(default_factory=list)
29
+ known_covariates_real: list[str] = field(default_factory=list)
30
+ known_covariates_cat: list[str] = field(default_factory=list)
31
+ past_covariates_real: list[str] = field(default_factory=list)
32
+ past_covariates_cat: list[str] = field(default_factory=list)
33
+
34
+ @property
35
+ def static_features(self) -> list[str]:
36
+ return self.static_features_cat + self.static_features_real
37
+
38
+ @property
39
+ def known_covariates(self) -> list[str]:
40
+ return self.known_covariates_cat + self.known_covariates_real
41
+
42
+ @property
43
+ def past_covariates(self) -> list[str]:
44
+ return self.past_covariates_cat + self.past_covariates_real
45
+
46
+ @property
47
+ def covariates(self) -> list[str]:
48
+ return self.known_covariates + self.past_covariates
49
+
50
+ @property
51
+ def covariates_real(self) -> list[str]:
52
+ return self.known_covariates_real + self.past_covariates_real
53
+
54
+ @property
55
+ def covariates_cat(self) -> list[str]:
56
+ return self.known_covariates_cat + self.past_covariates_cat
57
+
58
+ @property
59
+ def real_features(self) -> list[str]:
60
+ return self.static_features_real + self.covariates_real
61
+
62
+ @property
63
+ def cat_features(self) -> list[str]:
64
+ return self.static_features_cat + self.covariates_cat
65
+
66
+ @property
67
+ def all_features(self) -> list[str]:
68
+ return self.static_features + self.covariates
69
+
70
+ def to_dict(self) -> dict[str, Any]:
71
+ return asdict(self)
30
72
 
31
73
 
32
74
  class ContinuousAndCategoricalFeatureGenerator(PipelineFeatureGenerator):
33
- """Generates categorical and continuous features for time series models."""
75
+ """Generates categorical and continuous features for time series models.
76
+
77
+ Imputes missing categorical features with the most frequent value in the training set.
78
+ """
34
79
 
35
- def __init__(self, verbosity: int = 0, minimum_cat_count=2, float_dtype: str = "float32", **kwargs):
80
+ def __init__(self, verbosity: int = 0, minimum_cat_count=2, **kwargs):
36
81
  generators = [
37
82
  CategoryFeatureGenerator(minimum_cat_count=minimum_cat_count, fillna="mode"),
38
83
  IdentityFeatureGenerator(infer_features_in_args={"valid_raw_types": [R_INT, R_FLOAT]}),
@@ -43,53 +88,87 @@ class ContinuousAndCategoricalFeatureGenerator(PipelineFeatureGenerator):
43
88
  pre_generators=[AsTypeFeatureGenerator(convert_bool=False)],
44
89
  pre_enforce_types=False,
45
90
  pre_drop_useless=False,
91
+ post_drop_duplicates=True,
92
+ reset_index=False,
46
93
  verbosity=verbosity,
47
94
  **kwargs,
48
95
  )
49
- self.float_dtype = float_dtype
50
-
51
- def _convert_numerical_columns_to_float(self, df: pd.DataFrame) -> pd.DataFrame:
52
- """Convert the dtype of all numerical (float or int) columns to the given float dtype."""
53
- numeric_columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
54
- return df.astype({col: self.float_dtype for col in numeric_columns})
55
96
 
56
97
  def transform(self, X: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
57
- if isinstance(X, TimeSeriesDataFrame):
58
- X = pd.DataFrame(X)
59
- return self._convert_numerical_columns_to_float(super().transform(X, *args, **kwargs))
98
+ return super().transform(X, *args, **kwargs)
60
99
 
61
100
  def fit_transform(self, X: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
62
101
  # PipelineFeatureGenerator does not use transform() inside fit_transform(), so we need to override both methods
63
- if isinstance(X, TimeSeriesDataFrame):
64
- X = pd.DataFrame(X)
65
- return self._convert_numerical_columns_to_float(super().fit_transform(X, *args, **kwargs))
102
+ transformed = super().fit_transform(X, *args, **kwargs)
103
+ # Ignore the '__dummy__' feature generated by PipelineFeatureGenerator if none of the features are informative
104
+ if "__dummy__" in transformed.columns:
105
+ transformed.drop(columns=["__dummy__"], inplace=True)
106
+ return transformed
66
107
 
67
108
 
68
109
  class TimeSeriesFeatureGenerator:
69
110
  """Takes care of preprocessing for static_features and past/known covariates.
70
111
 
71
- All covariates & static features are converted into either float32 or categorical dtype.
112
+ All covariates & static features are converted into either float or categorical dtype.
113
+
114
+ Missing values in the target column are left as-is but missing values in static features & covariates are imputed.
115
+ Imputation logic is as follows:
116
+ 1. For all categorical columns (static, past, known), we fill missing values with the mode of the training set.
117
+ 2. For real static features, we impute missing values with the median of the training set.
118
+ 3. For real covariates (past, known), we ffill + bfill within each time series. If for some time series all
119
+ covariate values are missing, we fill them with the median of the training set.
120
+
121
+ Parameters
122
+ ----------
123
+ target
124
+ Name of the target column.
125
+ known_covariates_names
126
+ Columns that contain covariates that are known into the future.
127
+ float_dtype
128
+ Numpy float dtype to which all numeric columns (float, int, bool) will be converted both in static & dynamic dfs.
129
+ num_samples
130
+ Number of rows sampled from the training dataset to speed up computation of the median (used later for imputation).
131
+ If set to `None`, median will be computed using all rows.
72
132
  """
73
133
 
74
- def __init__(self, target: str, known_covariates_names: List[str], float_dtype: str = "float32"):
134
+ def __init__(
135
+ self,
136
+ target: str,
137
+ known_covariates_names: list[str],
138
+ float_dtype: str = "float32",
139
+ num_samples: int | None = 20_000,
140
+ ):
75
141
  self.target = target
76
142
  self.float_dtype = float_dtype
143
+ self.num_samples = num_samples
144
+
77
145
  self._is_fit = False
78
- self.known_covariates_names = list(known_covariates_names)
79
- self.past_covariates_names = []
146
+ self.known_covariates_names: list[str] = list(known_covariates_names)
147
+ self.past_covariates_names: list[str] = []
80
148
  self.known_covariates_pipeline = ContinuousAndCategoricalFeatureGenerator()
81
149
  self.past_covariates_pipeline = ContinuousAndCategoricalFeatureGenerator()
82
150
  # Cat features with cat_count=1 are fine in static_features since they are repeated for all time steps in a TS
83
151
  self.static_feature_pipeline = ContinuousAndCategoricalFeatureGenerator(minimum_cat_count=1)
84
- self.covariate_metadata: CovariateMetadata = None
152
+ self._covariate_metadata: CovariateMetadata | None = None # type ignore
153
+ self._train_covariates_real_median: pd.Series | None = None
154
+ self._train_static_real_median: pd.Series | None = None
85
155
 
86
156
  @property
87
- def required_column_names(self) -> List[str]:
157
+ def required_column_names(self) -> list[str]:
88
158
  return [self.target] + list(self.known_covariates_names) + list(self.past_covariates_names)
89
159
 
160
+ @property
161
+ def covariate_metadata(self) -> CovariateMetadata:
162
+ assert self._covariate_metadata is not None, "covariate_metadata is not set. Did you call fit?"
163
+ return self._covariate_metadata
164
+
90
165
  def fit(self, data: TimeSeriesDataFrame) -> None:
166
+ self.fit_transform(data)
167
+
168
+ def fit_transform(self, data: TimeSeriesDataFrame) -> TimeSeriesDataFrame:
91
169
  assert not self._is_fit, f"{self.__class__.__name__} has already been fit"
92
170
 
171
+ start_time = time.monotonic()
93
172
  self.past_covariates_names = []
94
173
  for column in data.columns:
95
174
  if column != self.target and column not in self.known_covariates_names:
@@ -99,23 +178,33 @@ class TimeSeriesFeatureGenerator:
99
178
  data, required_column_names=self.required_column_names, data_frame_name="train_data"
100
179
  )
101
180
 
181
+ # Convert to a pd.DataFrame and remove index for faster processing
182
+ df = pd.DataFrame(data)
183
+ index = df.index
184
+ df.reset_index(drop=True, inplace=True)
185
+ df = self._convert_numeric_to_float_dtype(df)
186
+
187
+ dfs_to_concat = [df[[self.target]]]
188
+
102
189
  logger.info("\nProvided data contains following columns:")
103
190
  logger.info(f"\ttarget: '{self.target}'")
104
191
 
105
192
  if len(self.known_covariates_names) > 0:
106
- known_covariates_df = self.known_covariates_pipeline.fit_transform(data[self.known_covariates_names])
193
+ known_covariates_df = self.known_covariates_pipeline.fit_transform(df[self.known_covariates_names])
107
194
  logger.info("\tknown_covariates:")
108
195
  known_covariates_cat, known_covariates_real = self._detect_and_log_column_types(known_covariates_df)
109
196
  self.known_covariates_names = self.known_covariates_pipeline.features_in
197
+ dfs_to_concat.append(known_covariates_df)
110
198
  else:
111
199
  known_covariates_cat = []
112
200
  known_covariates_real = []
113
201
 
114
202
  if len(self.past_covariates_names) > 0:
115
- past_covariates_df = self.past_covariates_pipeline.fit_transform(data[self.past_covariates_names])
203
+ past_covariates_df = self.past_covariates_pipeline.fit_transform(df[self.past_covariates_names])
116
204
  logger.info("\tpast_covariates:")
117
205
  past_covariates_cat, past_covariates_real = self._detect_and_log_column_types(past_covariates_df)
118
206
  self.past_covariates_names = self.past_covariates_pipeline.features_in
207
+ dfs_to_concat.append(past_covariates_df)
119
208
  else:
120
209
  past_covariates_cat = []
121
210
  past_covariates_real = []
@@ -125,14 +214,18 @@ class TimeSeriesFeatureGenerator:
125
214
  )
126
215
 
127
216
  if data.static_features is not None:
128
- static_features_df = self.static_feature_pipeline.fit_transform(data.static_features)
217
+ static_features_df = self.static_feature_pipeline.fit_transform(
218
+ self._convert_numeric_to_float_dtype(data.static_features)
219
+ )
129
220
  logger.info("\tstatic_features:")
130
221
  static_features_cat, static_features_real = self._detect_and_log_column_types(static_features_df)
131
222
  ignored_static_features = data.static_features.columns.difference(self.static_feature_pipeline.features_in)
223
+ self._train_static_real_median = data.static_features[static_features_real].median()
132
224
  else:
133
225
  static_features_cat = []
134
226
  static_features_real = []
135
227
  ignored_static_features = []
228
+ static_features_df = None
136
229
 
137
230
  if len(ignored_covariates) > 0 or len(ignored_static_features) > 0:
138
231
  logger.info("\nAutoGluon will ignore following non-numeric/non-informative columns:")
@@ -146,7 +239,7 @@ class TimeSeriesFeatureGenerator:
146
239
  "\nTo learn how to fix incorrectly inferred types, please see documentation for TimeSeriesPredictor.fit"
147
240
  )
148
241
 
149
- self.covariate_metadata = CovariateMetadata(
242
+ self._covariate_metadata = CovariateMetadata(
150
243
  known_covariates_cat=known_covariates_cat,
151
244
  known_covariates_real=known_covariates_real,
152
245
  past_covariates_cat=past_covariates_cat,
@@ -154,8 +247,47 @@ class TimeSeriesFeatureGenerator:
154
247
  static_features_cat=static_features_cat,
155
248
  static_features_real=static_features_real,
156
249
  )
250
+
251
+ # Median of real-valued covariates will be used for missing value imputation
252
+ if self.num_samples is not None and len(df) > self.num_samples:
253
+ df = df.sample(n=self.num_samples, replace=True)
254
+ self._train_covariates_real_median = df[self.covariate_metadata.covariates_real].median()
255
+
256
+ self.fit_time = time.monotonic() - start_time
157
257
  self._is_fit = True
158
258
 
259
+ df_out = self._concat_dfs(dfs_to_concat)
260
+ df_out.index = index
261
+ ts_df = TimeSeriesDataFrame(df_out, static_features=self._impute_static_features(static_features_df))
262
+ return self._impute_covariates(ts_df, column_names=self.covariate_metadata.covariates_real)
263
+
264
+ @staticmethod
265
+ def _concat_dfs(dfs_to_concat: list[pd.DataFrame]) -> pd.DataFrame:
266
+ if len(dfs_to_concat) == 1:
267
+ return dfs_to_concat[0]
268
+ else:
269
+ return pd.concat(dfs_to_concat, axis=1, copy=False)
270
+
271
+ def _impute_covariates(self, ts_df: TimeSeriesDataFrame, column_names: list[str]) -> TimeSeriesDataFrame:
272
+ """Impute missing values in selected columns with ffill, bfill, and median imputation."""
273
+ if len(column_names) > 0:
274
+ # ffill + bfill covariates that have at least some observed values
275
+ covariates_real = ts_df[column_names].fill_missing_values()
276
+ # If for some items covariates consist completely of NaNs, fill them with median of training data
277
+ if np.isnan(covariates_real.to_numpy()).any():
278
+ covariates_real.fillna(self._train_covariates_real_median, inplace=True)
279
+ ts_df[column_names] = covariates_real
280
+ return ts_df
281
+
282
+ def _impute_static_features(self, static_df: pd.DataFrame | None) -> pd.DataFrame | None:
283
+ """Impute missing values in static features using the median."""
284
+ static_real_names = self.covariate_metadata.static_features_real
285
+ if static_df is not None and static_real_names:
286
+ static_real = static_df[static_real_names]
287
+ if np.isnan(static_real.to_numpy()).any():
288
+ static_df[static_real_names] = static_real.fillna(self._train_static_real_median)
289
+ return static_df
290
+
159
291
  def transform(self, data: TimeSeriesDataFrame, data_frame_name: str = "data") -> TimeSeriesDataFrame:
160
292
  """Transform static features and past/known covariates.
161
293
 
@@ -168,50 +300,61 @@ class TimeSeriesFeatureGenerator:
168
300
  self._check_required_columns_are_present(
169
301
  data, required_column_names=self.required_column_names, data_frame_name=data_frame_name
170
302
  )
171
- dfs = [data[[self.target]]]
303
+ # Convert to a pd.DataFrame and remove index for faster processing
304
+ df = pd.DataFrame(data)
305
+ index = df.index
306
+ df.reset_index(drop=True, inplace=True)
307
+
308
+ dfs_to_concat = [df[[self.target]]]
172
309
 
173
310
  if len(self.known_covariates_names) > 0:
174
- dfs.append(self.known_covariates_pipeline.transform(data[self.known_covariates_names]))
311
+ known_covariates_df = self.known_covariates_pipeline.transform(df[self.known_covariates_names])
312
+ dfs_to_concat.append(known_covariates_df)
175
313
 
176
314
  if len(self.past_covariates_names) > 0:
177
- dfs.append(self.past_covariates_pipeline.transform(data[self.past_covariates_names]))
315
+ past_covariates_df = self.past_covariates_pipeline.transform(df[self.past_covariates_names])
316
+ dfs_to_concat.append(past_covariates_df)
178
317
 
179
318
  if self.static_feature_pipeline.is_fit():
180
319
  if data.static_features is None:
181
320
  raise ValueError(f"Provided {data_frame_name} must contain static_features")
182
- static_features = self.static_feature_pipeline.transform(data.static_features)
321
+ static_features_df = self.static_feature_pipeline.transform(data.static_features)
183
322
  else:
184
- static_features = None
323
+ static_features_df = None
185
324
 
186
- return TimeSeriesDataFrame(pd.concat(dfs, axis=1), static_features=static_features)
325
+ df_out = self._concat_dfs(dfs_to_concat)
326
+ df_out.index = index
327
+ ts_df = TimeSeriesDataFrame(df_out, static_features=self._impute_static_features(static_features_df))
328
+ return self._impute_covariates(ts_df, column_names=self.covariate_metadata.covariates_real)
187
329
 
188
330
  def transform_future_known_covariates(
189
- self, known_covariates: Optional[TimeSeriesDataFrame]
190
- ) -> Optional[TimeSeriesDataFrame]:
331
+ self, known_covariates: TimeSeriesDataFrame | None
332
+ ) -> TimeSeriesDataFrame | None:
191
333
  assert self._is_fit, f"{self.__class__.__name__} has not been fit yet"
192
334
  if len(self.known_covariates_names) > 0:
193
335
  assert known_covariates is not None, "known_covariates must be provided at prediction time"
194
336
  self._check_required_columns_are_present(
195
337
  known_covariates, required_column_names=self.known_covariates_names, data_frame_name="known_covariates"
196
338
  )
197
- return TimeSeriesDataFrame(self.known_covariates_pipeline.transform(known_covariates))
339
+ known_covariates = TimeSeriesDataFrame(
340
+ self.known_covariates_pipeline.transform(pd.DataFrame(known_covariates))
341
+ )
342
+ return self._impute_covariates(
343
+ known_covariates, column_names=self.covariate_metadata.known_covariates_real
344
+ )
198
345
  else:
199
346
  return None
200
347
 
201
- def fit_transform(self, data: TimeSeriesDataFrame, data_frame_name: str = "data") -> TimeSeriesDataFrame:
202
- self.fit(data)
203
- return self.transform(data, data_frame_name=data_frame_name)
204
-
205
348
  @staticmethod
206
- def _detect_and_log_column_types(transformed_df: pd.DataFrame) -> Tuple[List[str], List[str]]:
349
+ def _detect_and_log_column_types(transformed_df: pd.DataFrame) -> tuple[list[str], list[str]]:
207
350
  """Log & return names of categorical and real-valued columns in the DataFrame."""
208
- cat_column_names = []
209
- real_column_names = []
351
+ cat_column_names: list[str] = []
352
+ real_column_names: list[str] = []
210
353
  for column_name, column_dtype in transformed_df.dtypes.items():
211
354
  if isinstance(column_dtype, pd.CategoricalDtype):
212
- cat_column_names.append(column_name)
355
+ cat_column_names.append(str(column_name))
213
356
  elif pd.api.types.is_numeric_dtype(column_dtype):
214
- real_column_names.append(column_name)
357
+ real_column_names.append(str(column_name))
215
358
 
216
359
  logger.info(f"\t\tcategorical: {reprlib.repr(cat_column_names)}")
217
360
  logger.info(f"\t\tcontinuous (float): {reprlib.repr(real_column_names)}")
@@ -219,10 +362,131 @@ class TimeSeriesFeatureGenerator:
219
362
 
220
363
  @staticmethod
221
364
  def _check_required_columns_are_present(
222
- data: TimeSeriesDataFrame, required_column_names: List[str], data_frame_name: str
365
+ data: TimeSeriesDataFrame, required_column_names: list[str], data_frame_name: str
223
366
  ) -> None:
224
- missing_columns = pd.Index(required_column_names).difference(data.columns)
367
+ missing_columns = pd.Index(required_column_names).difference(data.columns) # type: ignore
225
368
  if len(missing_columns) > 0:
226
369
  raise ValueError(
227
370
  f"{len(missing_columns)} columns are missing from {data_frame_name}: {reprlib.repr(missing_columns.to_list())}"
228
371
  )
372
+
373
+ def _convert_numeric_to_float_dtype(self, df: pd.DataFrame) -> pd.DataFrame:
374
+ """Convert the dtype of all numeric (float, int or bool) columns to self.float_dtype."""
375
+ numeric_columns = [
376
+ col for col, dtype in df.dtypes.items() if pd.api.types.is_numeric_dtype(dtype) and col != self.target
377
+ ]
378
+ if len(numeric_columns) > 0:
379
+ df = df.astype({col: self.float_dtype for col in numeric_columns}, copy=False)
380
+ return df
381
+
382
+
383
+ class AbstractFeatureImportanceTransform:
384
+ """Abstract class for transforms that replace a given feature with dummy or shuffled values,
385
+ for use in feature importance operations.
386
+ """
387
+
388
+ def __init__(
389
+ self,
390
+ covariate_metadata: CovariateMetadata,
391
+ prediction_length: int,
392
+ **kwargs,
393
+ ):
394
+ self.covariate_metadata: CovariateMetadata = covariate_metadata
395
+ self.prediction_length: int = prediction_length
396
+
397
+ def _transform_static_series(self, feature_data: pd.Series, is_categorical: bool) -> Any:
398
+ """Transforms a series with the same index as the pandas DataFrame"""
399
+ raise NotImplementedError
400
+
401
+ def _transform_series(self, feature_data: pd.Series, is_categorical: bool) -> pd.Series:
402
+ """Transforms a series with the same index as the pandas DataFrame"""
403
+ raise NotImplementedError
404
+
405
+ def transform(self, data: TimeSeriesDataFrame, feature_name: str, **kwargs) -> TimeSeriesDataFrame:
406
+ if feature_name not in self.covariate_metadata.all_features:
407
+ raise ValueError(f"Target feature {feature_name} not found in covariate metadata")
408
+
409
+ # feature transform works on a shallow copy of the main time series dataframe
410
+ # but a deep copy of the static features.
411
+ data = data.copy(deep=False)
412
+
413
+ is_categorical = feature_name in self.covariate_metadata.cat_features
414
+
415
+ if feature_name in self.covariate_metadata.past_covariates:
416
+ # we'll have to work on the history of the data alone
417
+ data[feature_name] = data[feature_name].copy()
418
+ feature_data = (
419
+ data[feature_name].groupby(level=TimeSeriesDataFrame.ITEMID, sort=False).head(-self.prediction_length)
420
+ )
421
+ # Silence spurious FutureWarning raised by DataFrame.update https://github.com/pandas-dev/pandas/issues/57124
422
+ with warning_filter():
423
+ data[feature_name].update(self._transform_series(feature_data, is_categorical=is_categorical))
424
+ elif feature_name in self.covariate_metadata.static_features:
425
+ assert data.static_features is not None
426
+ feature_data = data.static_features[feature_name].copy()
427
+ feature_data.reset_index(drop=True, inplace=True)
428
+ data.static_features[feature_name] = self._transform_static_series(
429
+ feature_data, is_categorical=is_categorical
430
+ )
431
+ else: # known covariates
432
+ data[feature_name] = self._transform_series(data[feature_name], is_categorical=is_categorical)
433
+
434
+ return data
435
+
436
+
437
+ class PermutationFeatureImportanceTransform(AbstractFeatureImportanceTransform):
438
+ """Naively shuffles a given feature."""
439
+
440
+ def __init__(
441
+ self,
442
+ covariate_metadata: CovariateMetadata,
443
+ prediction_length: int,
444
+ random_seed: int | None = None,
445
+ shuffle_type: Literal["itemwise", "naive"] = "itemwise",
446
+ **kwargs,
447
+ ):
448
+ super().__init__(covariate_metadata, prediction_length, **kwargs)
449
+ self.shuffle_type = shuffle_type
450
+ self.random_seed = random_seed
451
+
452
+ def _transform_static_series(self, feature_data: pd.Series, is_categorical: bool) -> Any:
453
+ return feature_data.sample(frac=1, random_state=self.random_seed).values
454
+
455
+ def _transform_series(self, feature_data: pd.Series, is_categorical: bool) -> pd.Series:
456
+ # set random state once to shuffle 'independently' for different items
457
+ rng = np.random.RandomState(self.random_seed)
458
+
459
+ if self.shuffle_type == "itemwise":
460
+ return feature_data.groupby(level=TimeSeriesDataFrame.ITEMID, sort=False).transform(
461
+ lambda x: x.sample(frac=1, random_state=rng).values
462
+ )
463
+ elif self.shuffle_type == "naive":
464
+ return pd.Series(feature_data.sample(frac=1, random_state=rng).values, index=feature_data.index)
465
+ else:
466
+ raise ValueError(f"Unknown shuffle_type: {self.shuffle_type}")
467
+
468
+
469
+ class ConstantReplacementFeatureImportanceTransform(AbstractFeatureImportanceTransform):
470
+ """Replaces a target feature with the median if it's a real-valued feature, and the mode if it's a
471
+ categorical feature."""
472
+
473
+ def __init__(
474
+ self,
475
+ covariate_metadata: CovariateMetadata,
476
+ prediction_length: int,
477
+ real_value_aggregation: Literal["mean", "median"] = "mean",
478
+ **kwargs,
479
+ ):
480
+ super().__init__(covariate_metadata, prediction_length, **kwargs)
481
+ self.real_value_aggregation = real_value_aggregation
482
+
483
+ def _transform_static_series(self, feature_data: pd.Series, is_categorical: bool) -> Any:
484
+ return feature_data.mode()[0] if is_categorical else feature_data.agg(self.real_value_aggregation)
485
+
486
+ def _transform_series(self, feature_data: pd.Series, is_categorical: bool) -> pd.Series:
487
+ if is_categorical:
488
+ return feature_data.groupby(level=TimeSeriesDataFrame.ITEMID, sort=False).transform(lambda x: x.mode()[0])
489
+ else:
490
+ return feature_data.groupby(level=TimeSeriesDataFrame.ITEMID, sort=False).transform(
491
+ self.real_value_aggregation
492
+ ) # type: ignore
@@ -3,33 +3,47 @@ import warnings
3
3
  import numpy as np
4
4
  import pandas as pd
5
5
 
6
- from autogluon.timeseries.dataset.ts_dataframe import ITEMID, TIMESTAMP, TimeSeriesDataFrame
6
+ from autogluon.common.utils.deprecated_utils import Deprecated
7
+ from autogluon.timeseries.dataset import TimeSeriesDataFrame
7
8
 
8
9
 
9
10
def get_forecast_horizon_index_single_time_series(
    past_timestamps: pd.DatetimeIndex, freq: str, prediction_length: int
) -> pd.DatetimeIndex:
    """Build the index of the ``prediction_length`` timestamps that follow ``past_timestamps``.

    The first returned timestamp lies one ``freq`` step after the latest past timestamp.

    Raises
    ------
    ValueError
        If ``freq`` cannot be converted to a pandas offset.
    """
    step = pd.tseries.frequencies.to_offset(freq)
    if step is None:
        raise ValueError(f"Invalid frequency: {freq}")
    first_future_ts = past_timestamps.max() + step
    return pd.date_range(
        start=first_future_ts,
        periods=prediction_length,
        freq=freq,
        name=TimeSeriesDataFrame.TIMESTAMP,
    )
15
19
 
16
20
 
17
- def get_forecast_horizon_index_ts_dataframe(
18
- ts_dataframe: TimeSeriesDataFrame, prediction_length: int
19
- ) -> pd.MultiIndex:
20
- """For each item in the dataframe, get timestamps for the next prediction_length many time steps into the future.
21
@Deprecated(
    min_version_to_warn="1.3", min_version_to_error="2.0", new="TimeSeriesPredictor.forecast_horizon_data_frame"
)
def get_forecast_horizon_index_ts_dataframe(*args, **kwargs) -> pd.MultiIndex:
    """Deprecated: return the forecast horizon as a ``pandas.MultiIndex``.

    All arguments are forwarded unchanged to ``make_future_data_frame``; the columns of the
    resulting data frame become the levels of the returned index.
    """
    future_frame = make_future_data_frame(*args, **kwargs)
    return pd.MultiIndex.from_frame(future_frame)
21
26
 
22
- Returns a pandas.MultiIndex, where
23
- - level 0 ("item_id") contains the same item_ids as the input ts_dataframe.
24
- - level 1 ("timestamp") contains the next prediction_length time steps starting from the end of each time series.
25
- """
26
- last = ts_dataframe.reset_index()[[ITEMID, TIMESTAMP]].groupby(by=ITEMID, sort=False, as_index=False).last()
27
- item_ids = np.repeat(last[ITEMID], prediction_length)
28
27
 
29
- offset = pd.tseries.frequencies.to_offset(ts_dataframe.freq)
30
- last_ts = pd.DatetimeIndex(last[TIMESTAMP])
28
+ def make_future_data_frame(
29
+ ts_dataframe: TimeSeriesDataFrame,
30
+ prediction_length: int,
31
+ freq: str | None = None,
32
+ ) -> pd.DataFrame:
33
+ """For each item in the dataframe, get timestamps for the next `prediction_length` time steps into the future.
34
+
35
+ Returns a pandas.DataFrame, with columns "item_id" and "timestamp" corresponding to the forecast horizon.
36
+ """
37
+ indptr = ts_dataframe.get_indptr()
38
+ last = ts_dataframe.index[indptr[1:] - 1].to_frame(index=False)
39
+ item_ids = np.repeat(last[TimeSeriesDataFrame.ITEMID].to_numpy(), prediction_length)
40
+
41
+ if freq is None:
42
+ freq = ts_dataframe.freq
43
+ offset = pd.tseries.frequencies.to_offset(freq)
44
+ last_ts = pd.DatetimeIndex(last[TimeSeriesDataFrame.TIMESTAMP])
31
45
  # Non-vectorized offsets like BusinessDay may produce a PerformanceWarning - we filter them
32
46
  with warnings.catch_warnings():
33
47
  warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
34
- timestamps = np.dstack([last_ts + step * offset for step in range(1, prediction_length + 1)]).ravel()
35
- return pd.MultiIndex.from_arrays([item_ids, timestamps], names=[ITEMID, TIMESTAMP])
48
+ timestamps = np.dstack([last_ts + step * offset for step in range(1, prediction_length + 1)]).ravel() # type: ignore[operator]
49
+ return pd.DataFrame({TimeSeriesDataFrame.ITEMID: item_ids, TimeSeriesDataFrame.TIMESTAMP: timestamps})