autogluon.timeseries 1.0.1b20240329__tar.gz → 1.0.1b20240330__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of autogluon.timeseries might be problematic.
Files changed (62)
  1. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/PKG-INFO +1 -1
  2. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/dataset/ts_dataframe.py +11 -3
  3. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/abstract/abstract_timeseries_model.py +26 -3
  4. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/autogluon_tabular/mlforecast.py +25 -3
  5. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/chronos/model.py +3 -0
  6. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/gluonts/abstract_gluonts.py +3 -2
  7. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/local/abstract_local_model.py +67 -22
  8. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/local/naive.py +18 -14
  9. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/local/npts.py +3 -0
  10. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/local/statsforecast.py +2 -0
  11. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/multi_window/multi_window_model.py +3 -1
  12. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/predictor.py +35 -39
  13. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/features.py +62 -4
  14. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/version.py +1 -1
  15. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon.timeseries.egg-info/PKG-INFO +1 -1
  16. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon.timeseries.egg-info/requires.txt +4 -4
  17. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/setup.cfg +0 -0
  18. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/setup.py +0 -0
  19. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/__init__.py +0 -0
  20. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/configs/__init__.py +0 -0
  21. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/configs/presets_configs.py +0 -0
  22. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/dataset/__init__.py +0 -0
  23. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/evaluator.py +0 -0
  24. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/learner.py +0 -0
  25. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/metrics/__init__.py +0 -0
  26. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/metrics/abstract.py +0 -0
  27. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/metrics/point.py +0 -0
  28. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/metrics/quantile.py +0 -0
  29. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/metrics/utils.py +0 -0
  30. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/__init__.py +0 -0
  31. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/abstract/__init__.py +0 -0
  32. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/abstract/model_trial.py +0 -0
  33. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/autogluon_tabular/__init__.py +0 -0
  34. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/autogluon_tabular/utils.py +0 -0
  35. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/chronos/__init__.py +0 -0
  36. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/chronos/pipeline.py +0 -0
  37. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/ensemble/__init__.py +0 -0
  38. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/ensemble/abstract_timeseries_ensemble.py +0 -0
  39. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/ensemble/greedy_ensemble.py +0 -0
  40. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/gluonts/__init__.py +0 -0
  41. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/gluonts/torch/__init__.py +0 -0
  42. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/gluonts/torch/models.py +0 -0
  43. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/local/__init__.py +0 -0
  44. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/multi_window/__init__.py +0 -0
  45. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/models/presets.py +0 -0
  46. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/splitter.py +0 -0
  47. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/trainer/__init__.py +0 -0
  48. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/trainer/abstract_trainer.py +0 -0
  49. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/trainer/auto_trainer.py +0 -0
  50. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/__init__.py +0 -0
  51. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/datetime/__init__.py +0 -0
  52. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/datetime/base.py +0 -0
  53. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/datetime/lags.py +0 -0
  54. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/datetime/seasonality.py +0 -0
  55. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/datetime/time_features.py +0 -0
  56. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/forecast.py +0 -0
  57. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon/timeseries/utils/warning_filters.py +0 -0
  58. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon.timeseries.egg-info/SOURCES.txt +0 -0
  59. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon.timeseries.egg-info/dependency_links.txt +0 -0
  60. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon.timeseries.egg-info/namespace_packages.txt +0 -0
  61. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon.timeseries.egg-info/top_level.txt +0 -0
  62. {autogluon.timeseries-1.0.1b20240329 → autogluon.timeseries-1.0.1b20240330}/src/autogluon.timeseries.egg-info/zip-safe +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: autogluon.timeseries
- Version: 1.0.1b20240329
+ Version: 1.0.1b20240330
  Summary: AutoML for Image, Text, and Tabular Data
  Home-page: https://github.com/autogluon/autogluon
  Author: AutoGluon Community
src/autogluon/timeseries/dataset/ts_dataframe.py
@@ -765,11 +765,19 @@ class TimeSeriesDataFrame(pd.DataFrame, TimeSeriesDataFrameDeprecatedMixin):
                  "(for example, using the `convert_frequency` method)."
              )

-         grouped_df = pd.DataFrame(self).groupby(level=ITEMID, sort=False, group_keys=False)
+         # Convert to pd.DataFrame for faster processing
+         df = pd.DataFrame(self)
+
+         # Skip filling if there are no NaNs
+         if not df.isna().any(axis=None):
+             return self
+
+         grouped_df = df.groupby(level=ITEMID, sort=False, group_keys=False)
          if method == "auto":
              filled_df = grouped_df.ffill()
-             # Fill missing values at the start of each time series with bfill
-             filled_df = filled_df.groupby(level=ITEMID, sort=False, group_keys=False).bfill()
+             # If necessary, fill missing values at the start of each time series with bfill
+             if filled_df.isna().any(axis=None):
+                 filled_df = filled_df.groupby(level=ITEMID, sort=False, group_keys=False).bfill()
          elif method in ["ffill", "pad"]:
              filled_df = grouped_df.ffill()
          elif method in ["bfill", "backfill"]:
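The change above short-circuits `fill_missing_values` when the frame contains no NaNs and only runs the backward fill when leading NaNs survive the forward fill. Below is a minimal pandas sketch of that "auto" strategy; the index and column names are illustrative, not the library's API.

# Minimal sketch of the "auto" fill strategy shown in the diff (illustrative names only).
import numpy as np
import pandas as pd

ITEMID = "item_id"  # assumption: name of the index level that identifies each series

df = pd.DataFrame(
    {"target": [np.nan, 1.0, np.nan, 3.0, np.nan, 5.0]},
    index=pd.MultiIndex.from_product(
        [["A", "B"], pd.date_range("2024-01-01", periods=3)], names=[ITEMID, "timestamp"]
    ),
)

if not df.isna().any(axis=None):
    filled = df  # nothing to fill, skip the groupby entirely
else:
    filled = df.groupby(level=ITEMID, sort=False, group_keys=False).ffill()
    if filled.isna().any(axis=None):  # only needed when a series starts with NaN
        filled = filled.groupby(level=ITEMID, sort=False, group_keys=False).bfill()

print(filled)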
src/autogluon/timeseries/models/abstract/abstract_timeseries_model.py
@@ -201,7 +201,9 @@ class AbstractTimeSeriesModel(AbstractModel):
          }
          return info

-     def fit(self, **kwargs) -> "AbstractTimeSeriesModel":
+     def fit(
+         self, train_data: TimeSeriesDataFrame, val_data: Optional[TimeSeriesDataFrame] = None, **kwargs
+     ) -> "AbstractTimeSeriesModel":
          """Fit timeseries model.

          Models should not override the `fit` method, but instead override the `_fit` method which
@@ -235,7 +237,10 @@ class AbstractTimeSeriesModel(AbstractModel):
          model: AbstractTimeSeriesModel
              The fitted model object
          """
-         return super().fit(**kwargs)
+         train_data = self.preprocess(train_data, is_train=True)
+         if self._get_tags()["can_use_val_data"] and val_data is not None:
+             val_data = self.preprocess(val_data, is_train=False)
+         return super().fit(train_data=train_data, val_data=val_data, **kwargs)

      def _fit(
          self,
@@ -290,6 +295,7 @@ class AbstractTimeSeriesModel(AbstractModel):
              data is given as a separate forecast item in the dictionary, keyed by the `item_id`s
              of input items.
          """
+         data = self.preprocess(data, is_train=False)
          predictions = self._predict(data=data, known_covariates=known_covariates, **kwargs)
          logger.debug(f"Predicting with model {self.name}")
          # "0.5" might be missing from the quantiles if self is a wrapper (MultiWindowBacktestingModel or ensemble)
@@ -488,7 +494,7 @@ class AbstractTimeSeriesModel(AbstractModel):

          return hpo_models, analysis

-     def preprocess(self, data: Any, **kwargs) -> Any:
+     def preprocess(self, data: TimeSeriesDataFrame, is_train: bool = False, **kwargs) -> Any:
          return data

      def get_memory_size(self, **kwargs) -> Optional[int]:
@@ -506,3 +512,20 @@ class AbstractTimeSeriesModel(AbstractModel):
              return {}
          else:
              return self._user_params.copy()
+
+     def _more_tags(self) -> dict:
+         """Encode model properties using tags, similar to sklearn & autogluon.tabular.
+
+         For more details, see `autogluon.core.models.abstract.AbstractModel._get_tags()` and https://scikit-learn.org/stable/_sources/developers/develop.rst.txt.
+
+         List of currently supported tags:
+         - allow_nan: Can the model handle data with missing values represented by np.nan?
+         - can_refit_full: Does it make sense to retrain the model without validation data?
+           See `autogluon.core.models.abstract._tags._DEFAULT_TAGS` for more details.
+         - can_use_val_data: Can model use val_data if it's provided to model.fit()?
+         """
+         return {
+             "allow_nan": False,
+             "can_refit_full": False,
+             "can_use_val_data": False,
+         }
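The `_more_tags` / `_get_tags` pair added above follows the sklearn-style tag mechanism: the base class declares conservative defaults and each subclass overrides only the tags it changes. The sketch below illustrates that pattern with hypothetical classes; it is not the actual autogluon.core implementation.

# Illustrative sketch of an sklearn-style tag mechanism; class names and the merge
# strategy are assumptions for demonstration only.
class BaseModel:
    def _more_tags(self) -> dict:
        return {"allow_nan": False, "can_refit_full": False, "can_use_val_data": False}

    def _get_tags(self) -> dict:
        # Walk the MRO so that subclasses only need to override the tags they change.
        tags = {}
        for cls in reversed(type(self).__mro__):
            if hasattr(cls, "_more_tags"):
                tags.update(cls._more_tags(self))
        return tags


class GluonTSLikeModel(BaseModel):
    def _more_tags(self) -> dict:
        # Deep-learning models in this diff declare that they tolerate NaNs and use val_data.
        return {"allow_nan": True, "can_use_val_data": True}


model = GluonTSLikeModel()
assert model._get_tags() == {"allow_nan": True, "can_refit_full": False, "can_use_val_data": True}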
src/autogluon/timeseries/models/autogluon_tabular/mlforecast.py
@@ -85,6 +85,21 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
          self._scaler: Optional[BaseTargetTransform] = None
          self._residuals_std_per_item: Optional[pd.Series] = None
          self._avg_residuals_std: Optional[float] = None
+         self._train_target_median: Optional[float] = None
+
+     def preprocess(self, data: TimeSeriesDataFrame, is_train: bool = False, **kwargs) -> Any:
+         if is_train:
+             # All-NaN series are removed; partially-NaN series in train_data are handled inside _generate_train_val_dfs
+             all_nan_items = data.item_ids[data[self.target].isna().groupby(ITEMID, sort=False).all()]
+             if len(all_nan_items):
+                 data = data.query("item_id not in @all_nan_items")
+             return data
+         else:
+             data = data.fill_missing_values()
+             # Fill time series consisting of all NaNs with the median of target in train_data
+             if data.isna().any(axis=None):
+                 data[self.target] = data[self.target].fillna(value=self._train_target_median)
+             return data

      def _get_extra_tabular_init_kwargs(self) -> dict:
          raise NotImplementedError
@@ -98,8 +113,6 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
          return model_params

      def _get_mlforecast_init_args(self, train_data: TimeSeriesDataFrame, model_params: dict) -> dict:
-         # TODO: Support lag generation for all pandas frequencies
-         # TODO: Support date_feature generation for all pandas frequencies
          from mlforecast.target_transforms import Differences

          from .utils import MeanAbsScaler, StandardScaler
@@ -181,6 +194,10 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
              items_to_keep = data.item_ids.to_series().sample(n=int(max_num_items))  # noqa: F841
              data = data.query("item_id in @items_to_keep")

+         # MLForecast.preprocess does not support missing values, but we will exclude them later from the training set
+         missing_entries = data.index[data[self.target].isna()]
+         data = data.fill_missing_values()
+
          num_items = data.num_items
          mlforecast_df = self._to_mlforecast_df(data, data.static_features)

@@ -197,6 +214,10 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):

          df = self._mask_df(df)

+         # We remove originally missing values filled via imputation from the training set
+         if len(missing_entries):
+             df = df.set_index(["unique_id", "ds"]).drop(missing_entries, errors="ignore").reset_index()
+
          if max_num_samples is not None and len(df) > max_num_samples:
              df = df.sample(n=max_num_samples)

@@ -246,6 +267,7 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):

          self._check_fit_params()
          fit_start_time = time.time()
+         self._train_target_median = train_data[self.target].median()
          # TabularEstimator is passed to MLForecast later to include tuning_data
          model_params = self._get_model_params()

@@ -355,7 +377,7 @@ class AbstractMLForecastModel(AbstractTimeSeriesModel):
          return predictions

      def _more_tags(self) -> dict:
-         return {"can_refit_full": True}
+         return {"allow_nan": True, "can_refit_full": True}


  class DirectTabularModel(AbstractMLForecastModel):
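The new `preprocess` above drops series whose target is entirely NaN at train time and median-fills remaining gaps at prediction time. A small, self-contained pandas sketch of the all-NaN filter follows; the index and column names are illustrative, not the model's internals.

# Sketch of filtering out items whose target is NaN at every timestep.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([["A", "B"], range(3)], names=["item_id", "t"])
data = pd.DataFrame({"target": [np.nan, np.nan, np.nan, 1.0, np.nan, 2.0]}, index=idx)

# True for items with no observed target values at all (item "A" here)
all_nan_mask = data["target"].isna().groupby(level="item_id", sort=False).all()
all_nan_items = all_nan_mask.index[all_nan_mask]

filtered = data.query("item_id not in @all_nan_items")
print(filtered)  # only item "B" remains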
src/autogluon/timeseries/models/chronos/model.py
@@ -363,3 +363,6 @@ class ChronosModel(AbstractTimeSeriesModel):
          )

          return TimeSeriesDataFrame(df)
+
+     def _more_tags(self) -> Dict:
+         return {"allow_nan": True}
src/autogluon/timeseries/models/gluonts/abstract_gluonts.py
@@ -328,8 +328,6 @@ class AbstractGluonTSModel(AbstractTimeSeriesModel):

          if self.num_feat_static_real > 0:
              feat_static_real = time_series_df.static_features[self.metadata.static_features_real]
-             if feat_static_real.isna().values.any():
-                 feat_static_real = feat_static_real.fillna(feat_static_real.mean())
          else:
              feat_static_real = None

@@ -548,3 +546,6 @@ class AbstractGluonTSModel(AbstractTimeSeriesModel):

          forecast_df.index = forecast_index
          return TimeSeriesDataFrame(forecast_df)
+
+     def _more_tags(self) -> dict:
+         return {"allow_nan": True, "can_use_val_data": True}
src/autogluon/timeseries/models/local/abstract_local_model.py
@@ -1,7 +1,7 @@
  import logging
  import time
  from multiprocessing import TimeoutError, cpu_count
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

  import numpy as np
  import pandas as pd
@@ -85,6 +85,12 @@ class AbstractLocalModel(AbstractTimeSeriesModel):
          self._local_model_args: Dict[str, Any] = None
          self._seasonal_period: Optional[int] = None
          self.time_limit: Optional[float] = None
+         self._dummy_forecast: Optional[pd.DataFrame] = None
+
+     def preprocess(self, data: TimeSeriesDataFrame, is_train: bool = False, **kwargs) -> Any:
+         if not self._get_tags()["allow_nan"]:
+             data = data.fill_missing_values()
+         return data

      def _fit(self, train_data: TimeSeriesDataFrame, time_limit: Optional[int] = None, **kwargs):
          self._check_fit_params()
@@ -115,8 +121,16 @@ class AbstractLocalModel(AbstractTimeSeriesModel):

          self._local_model_args = self._update_local_model_args(local_model_args=local_model_args)
          self.time_limit = time_limit
+
+         self._dummy_forecast = self._get_dummy_forecast(train_data)
          return self

+     def _get_dummy_forecast(self, train_data: TimeSeriesDataFrame) -> pd.DataFrame:
+         agg_functions = ["mean"] + [get_quantile_function(q) for q in self.quantile_levels]
+         stats_marginal = train_data[self.target].agg(agg_functions)
+         stats_repeated = np.tile(stats_marginal.values, [self.prediction_length, 1])
+         return pd.DataFrame(stats_repeated, columns=stats_marginal.index)
+
      def _update_local_model_args(self, local_model_args: Dict[str, Any]) -> Dict[str, Any]:
          return local_model_args

@@ -164,25 +178,30 @@ class AbstractLocalModel(AbstractTimeSeriesModel):
      def _predict_wrapper(self, time_series: pd.Series, end_time: Optional[float] = None) -> Tuple[pd.DataFrame, bool]:
          if end_time is not None and time.time() >= end_time:
              raise TimeLimitExceeded
-         try:
-             result = self._predict_with_local_model(
-                 time_series=time_series,
-                 local_model_args=self._local_model_args.copy(),
-             )
-             if not np.isfinite(result.values).all():
-                 raise RuntimeError("Forecast contains NaN or Inf values.")
-             model_failed = False
-         except Exception:
-             if self.use_fallback_model:
-                 result = seasonal_naive_forecast(
-                     target=time_series.values.ravel(),
-                     prediction_length=self.prediction_length,
-                     quantile_levels=self.quantile_levels,
-                     seasonal_period=self._seasonal_period,
+
+         if time_series.isna().all():
+             result = self._dummy_forecast.copy()
+             model_failed = True
+         else:
+             try:
+                 result = self._predict_with_local_model(
+                     time_series=time_series,
+                     local_model_args=self._local_model_args.copy(),
                  )
-                 model_failed = True
-             else:
-                 raise
+                 if not np.isfinite(result.values).all():
+                     raise RuntimeError("Forecast contains NaN or Inf values.")
+                 model_failed = False
+             except Exception:
+                 if self.use_fallback_model:
+                     result = seasonal_naive_forecast(
+                         target=time_series.values.ravel(),
+                         prediction_length=self.prediction_length,
+                         quantile_levels=self.quantile_levels,
+                         seasonal_period=self._seasonal_period,
+                     )
+                     model_failed = True
+                 else:
+                     raise
          return result, model_failed

      def _predict_with_local_model(
@@ -197,25 +216,51 @@ def seasonal_naive_forecast(
      target: np.ndarray, prediction_length: int, quantile_levels: List[float], seasonal_period: int
  ) -> pd.DataFrame:
      """Generate seasonal naive forecast, predicting the last observed value from the same period."""
+
+     def numpy_ffill(arr: np.ndarray) -> np.ndarray:
+         """Fast implementation of forward fill in numpy."""
+         idx = np.arange(len(arr))
+         mask = np.isnan(arr)
+         idx[mask] = 0
+         return arr[np.maximum.accumulate(idx)]
+
      forecast = {}
+     # Convert to float64 since std computation can be unstable in float32
+     target = target.astype(np.float64)
      # At least seasonal_period + 2 values are required to compute sigma for seasonal naive
      if len(target) > seasonal_period + 1 and seasonal_period > 1:
+         if np.isnan(target[-(seasonal_period + 2) :]).any():
+             target = numpy_ffill(target)
+
          indices = [len(target) - seasonal_period + k % seasonal_period for k in range(prediction_length)]
          forecast["mean"] = target[indices]
          residuals = target[seasonal_period:] - target[:-seasonal_period]

-         sigma = np.sqrt(np.mean(np.square(residuals)))
+         sigma = np.sqrt(np.nanmean(np.square(residuals)))
          num_full_seasons = np.arange(1, prediction_length + 1) // seasonal_period
          sigma_per_timestep = sigma * np.sqrt(num_full_seasons + 1)
      else:
          # Fall back to naive forecast
-         forecast["mean"] = np.full(shape=[prediction_length], fill_value=target[-1])
+         last_observed_value = target[np.isfinite(target)][-1]
+         forecast["mean"] = np.full(shape=[prediction_length], fill_value=last_observed_value)
          residuals = target[1:] - target[:-1]

-         sigma = np.sqrt(np.mean(np.square(residuals)))
+         sigma = np.sqrt(np.nanmean(np.square(residuals)))
+         if np.isnan(sigma):  # happens if there are no two consecutive non-nan observations
+             sigma = 0.0
          sigma_per_timestep = sigma * np.sqrt(np.arange(1, prediction_length + 1))

      for q in quantile_levels:
          forecast[str(q)] = forecast["mean"] + norm.ppf(q) * sigma_per_timestep

      return pd.DataFrame(forecast)
+
+
+ def get_quantile_function(q: float) -> Callable:
+     """Returns a function with name "q" that computes the q'th quantile of a pandas.Series."""
+
+     def quantile_fn(x: pd.Series) -> pd.Series:
+         return x.quantile(q)
+
+     quantile_fn.__name__ = str(q)
+     return quantile_fn
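`get_quantile_function` exists because `pandas.Series.agg` labels each result with the callable's `__name__`, which is also how `_get_dummy_forecast` above builds a constant forecast for all-NaN series. A short standalone sketch with illustrative values (not the model code):

# Sketch: named quantile aggregations plus the "dummy" constant forecast pattern.
import numpy as np
import pandas as pd


def get_quantile_function(q: float):
    def quantile_fn(x: pd.Series) -> pd.Series:
        return x.quantile(q)

    quantile_fn.__name__ = str(q)  # distinct name -> distinct row in .agg() output
    return quantile_fn


series = pd.Series(np.arange(1, 101, dtype=float))
agg_functions = ["mean"] + [get_quantile_function(q) for q in (0.1, 0.5, 0.9)]
stats_marginal = series.agg(agg_functions)
print(stats_marginal)  # index: mean, 0.1, 0.5, 0.9

# Repeating the marginal statistics for every forecast step yields a constant forecast.
prediction_length = 3
dummy_forecast = pd.DataFrame(
    np.tile(stats_marginal.values, [prediction_length, 1]), columns=stats_marginal.index
)
print(dummy_forecast)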
src/autogluon/timeseries/models/local/naive.py
@@ -1,9 +1,11 @@
- from typing import Callable
-
  import numpy as np
  import pandas as pd

- from autogluon.timeseries.models.local.abstract_local_model import AbstractLocalModel, seasonal_naive_forecast
+ from autogluon.timeseries.models.local.abstract_local_model import (
+     AbstractLocalModel,
+     get_quantile_function,
+     seasonal_naive_forecast,
+ )


  class NaiveModel(AbstractLocalModel):
@@ -36,6 +38,9 @@ class NaiveModel(AbstractLocalModel):
              seasonal_period=1,
          )

+     def _more_tags(self) -> dict:
+         return {"allow_nan": True}
+

  class SeasonalNaiveModel(AbstractLocalModel):
      """Baseline model that sets the forecast equal to the last observed value from the same season.
@@ -75,15 +80,8 @@ class SeasonalNaiveModel(AbstractLocalModel):
              seasonal_period=local_model_args["seasonal_period"],
          )

-
- def _get_quantile_function(q: float) -> Callable:
-     """Returns a function with name "q" that computes the q'th quantile of a pandas.Series."""
-
-     def quantile_fn(x: pd.Series) -> pd.Series:
-         return x.quantile(q)
-
-     quantile_fn.__name__ = str(q)
-     return quantile_fn
+     def _more_tags(self) -> dict:
+         return {"allow_nan": True}


  class AverageModel(AbstractLocalModel):
@@ -109,11 +107,14 @@ class AverageModel(AbstractLocalModel):
          time_series: pd.Series,
          local_model_args: dict,
      ) -> pd.DataFrame:
-         agg_functions = ["mean"] + [_get_quantile_function(q) for q in self.quantile_levels]
+         agg_functions = ["mean"] + [get_quantile_function(q) for q in self.quantile_levels]
          stats_marginal = time_series.agg(agg_functions)
          stats_repeated = np.tile(stats_marginal.values, [self.prediction_length, 1])
          return pd.DataFrame(stats_repeated, columns=stats_marginal.index)

+     def _more_tags(self) -> dict:
+         return {"allow_nan": True}
+

  class SeasonalAverageModel(AbstractLocalModel):
      """Baseline model that sets the forecast equal to the historic average or quantile in the same season.
@@ -146,7 +147,7 @@ class SeasonalAverageModel(AbstractLocalModel):
          local_model_args: dict,
      ) -> pd.DataFrame:
          seasonal_period = local_model_args["seasonal_period"]
-         agg_functions = ["mean"] + [_get_quantile_function(q) for q in self.quantile_levels]
+         agg_functions = ["mean"] + [get_quantile_function(q) for q in self.quantile_levels]

          # Compute mean & quantiles for each season
          ts_df = time_series.reset_index(drop=True).to_frame()
@@ -162,3 +163,6 @@ class SeasonalAverageModel(AbstractLocalModel):
          stats_marginal = time_series.agg(agg_functions)
          result = result.fillna(stats_marginal)
          return result
+
+     def _more_tags(self) -> dict:
+         return {"allow_nan": True}
src/autogluon/timeseries/models/local/npts.py
@@ -88,3 +88,6 @@ class NPTSModel(AbstractLocalModel):
          for q in self.quantile_levels:
              forecast_dict[str(q)] = forecast.quantile(q)
          return pd.DataFrame(forecast_dict)
+
+     def _more_tags(self) -> dict:
+         return {"allow_nan": True}
src/autogluon/timeseries/models/local/statsforecast.py
@@ -204,6 +204,8 @@ class ARIMAModel(AbstractProbabilisticStatsForecastModel):
      This significantly speeds up fitting and usually leads to no change in accuracy.
      """

+     # TODO: This model requires statsforecast >= 1.5.0, so it will only be available after we upgrade the dependency
+
      allowed_local_model_args = [
          "order",
          "seasonal_order",
src/autogluon/timeseries/models/multi_window/multi_window_model.py
@@ -243,4 +243,6 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
          return refit_model

      def _more_tags(self) -> dict:
-         return self.most_recent_model._get_tags()
+         tags = self.model_base._get_tags()
+         tags["can_use_val_data"] = False
+         return tags
src/autogluon/timeseries/predictor.py
@@ -276,7 +276,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
          data: Union[TimeSeriesDataFrame, pd.DataFrame, Path, str],
          name: str = "data",
      ) -> TimeSeriesDataFrame:
-         """Ensure that TimeSeriesDataFrame has a sorted index, valid frequency, and contains no missing values.
+         """Ensure that TimeSeriesDataFrame has a sorted index and a valid frequency.

          If self.freq is None, then self.freq of the predictor will be set to the frequency of the data.

@@ -314,18 +314,6 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
              if df.freq != self.freq:
                  logger.warning(f"{name} with frequency '{df.freq}' has been resampled to frequency '{self.freq}'.")
                  df = df.convert_frequency(freq=self.freq)
-
-         # Fill missing values
-         if df.isna().values.any():
-             # FIXME: Do not automatically fill NaNs here, handle missing values at the level of individual models.
-             # FIXME: Current solution leads to incorrect metric computation if missing values are present
-             logger.warning(
-                 f"{name} contains missing values represented by NaN. "
-                 f"They have been filled by carrying forward the last valid observation."
-             )
-             df = df.fill_missing_values()
-             if df.isna().values.any():
-                 raise ValueError(f"Some time series in {name} consist completely of NaN values. Please remove them.")
          return df

      def _check_data_for_evaluation(self, data: TimeSeriesDataFrame, name: str = "data"):
@@ -337,15 +325,19 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
                  f"all time series have length > prediction_length (at least {self.prediction_length + 1})"
              )

-     @staticmethod
-     def _get_dataset_stats(data: TimeSeriesDataFrame) -> str:
+     def _get_dataset_stats(self, data: TimeSeriesDataFrame) -> str:
          ts_lengths = data.num_timesteps_per_item()
-         median_length = int(ts_lengths.median())
+         median_length = ts_lengths.median()
          min_length = ts_lengths.min()
          max_length = ts_lengths.max()
+         missing_value_fraction = data[self.target].isna().mean()
+         if missing_value_fraction > 0:
+             missing_value_fraction_str = f" (NaN fraction={missing_value_fraction:.1%})"
+         else:
+             missing_value_fraction_str = ""
          return (
-             f"{len(data)} rows, {data.num_items} time series. "
-             f"Median time series length is {median_length} (min={min_length}, max={max_length}). "
+             f"{len(data)} rows{missing_value_fraction_str}, {data.num_items} time series. "
+             f"Median time series length is {median_length:.0f} (min={min_length}, max={max_length}). "
          )

      def _reduce_num_val_windows_if_necessary(
@@ -374,41 +366,45 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
          )
          return new_num_val_windows

-     def _filter_short_series(
+     def _filter_useless_train_data(
          self,
          train_data: TimeSeriesDataFrame,
          num_val_windows: int,
          val_step_size: int,
      ) -> Tuple[TimeSeriesDataFrame, Optional[TimeSeriesDataFrame]]:
-         """Remove time series from train_data that are too short for chosen prediction_length and validation settings.
+         """Remove time series from train_data that either contain all NaNs or are too short for chosen settings.

-         This method ensures that for each validation fold, all train series have length >= max(prediction_length + 1, 5).
+         This method ensures that 1) no time series consist of all NaN values and 2) for each validation fold, all train
+         series have length >= max(prediction_length + 1, 5).

-         In other words, this method removes from train_data all time series with length less than
+         In other words, this method removes from train_data all time series with only NaN values or length less than
          min_train_length + prediction_length + (num_val_windows - 1) * val_step_size
          """
          min_length = self._min_train_length + self.prediction_length + (num_val_windows - 1) * val_step_size
-
          train_lengths = train_data.num_timesteps_per_item()
-         train_items_to_drop = train_lengths.index[train_lengths < min_length]
-         if len(train_items_to_drop) > 0:
+         too_short_items = train_lengths.index[train_lengths < min_length]
+
+         if len(too_short_items) > 0:
              logger.info(
-                 f"\tRemoving {len(train_items_to_drop)} short time series from train_data. Only series with length "
+                 f"\tRemoving {len(too_short_items)} short time series from train_data. Only series with length "
                  f">= {min_length} will be used for training."
              )
-             filtered_train_data = train_data.query("item_id not in @train_items_to_drop")
-             if len(filtered_train_data) == 0:
-                 raise ValueError(
-                     f"At least some time series in train_data must have length >= {min_length}. Please provide longer "
-                     f"time series as train_data or reduce prediction_length, num_val_windows, or val_step_size."
-                 )
-             logger.info(
-                 f"\tAfter removing short series, train_data has {self._get_dataset_stats(filtered_train_data)}"
-             )
-         else:
-             filtered_train_data = train_data
+             train_data = train_data.query("item_id not in @too_short_items")
+
+         all_nan_items = train_data.item_ids[train_data[self.target].isna().groupby(ITEMID, sort=False).all()]
+         if len(all_nan_items) > 0:
+             logger.info(f"\tRemoving {len(all_nan_items)} time series consisting of only NaN values from train_data.")
+             train_data = train_data.query("item_id not in @all_nan_items")

-         return filtered_train_data
+         if len(too_short_items) or len(all_nan_items):
+             logger.info(f"\tAfter filtering, train_data has {self._get_dataset_stats(train_data)}")
+
+         if len(train_data) == 0:
+             raise ValueError(
+                 f"At least some time series in train_data must have >= {min_length} observations. Please provide "
+                 f"longer time series as train_data or reduce prediction_length, num_val_windows, or val_step_size."
+             )
+         return train_data

      @apply_presets(TIMESERIES_PRESETS_CONFIGS)
      def fit(
@@ -722,7 +718,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
              raise ValueError("Please set num_val_windows >= 1 or provide custom tuning_data")

          if not skip_model_selection:
-             train_data = self._filter_short_series(
+             train_data = self._filter_useless_train_data(
                  train_data, num_val_windows=num_val_windows, val_step_size=val_step_size
              )
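For reference, the minimum-length rule quoted in the docstring of `_filter_useless_train_data` works out as follows; the numbers are illustrative, and `min_train_length` is taken to equal max(prediction_length + 1, 5) as stated above.

# Worked example of the series-length threshold used when filtering train_data.
prediction_length = 4
min_train_length = max(prediction_length + 1, 5)  # = 5
num_val_windows = 3
val_step_size = 2

min_length = min_train_length + prediction_length + (num_val_windows - 1) * val_step_size
print(min_length)  # 5 + 4 + 2 * 2 = 13 observations required per series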
src/autogluon/timeseries/utils/features.py
@@ -28,9 +28,32 @@ class CovariateMetadata:
      past_covariates_real: List[str] = field(default_factory=list)
      past_covariates_cat: List[str] = field(default_factory=list)

+     @property
+     def known_covariates(self) -> List[str]:
+         return self.known_covariates_cat + self.known_covariates_real
+
+     @property
+     def past_covariates(self) -> List[str]:
+         return self.past_covariates_cat + self.past_covariates_real
+
+     @property
+     def covariates(self) -> List[str]:
+         return self.known_covariates + self.past_covariates
+
+     @property
+     def covariates_real(self) -> List[str]:
+         return self.known_covariates_real + self.past_covariates_real
+
+     @property
+     def covariates_cat(self) -> List[str]:
+         return self.known_covariates_cat + self.past_covariates_cat
+

  class ContinuousAndCategoricalFeatureGenerator(PipelineFeatureGenerator):
-     """Generates categorical and continuous features for time series models."""
+     """Generates categorical and continuous features for time series models.
+
+     Imputes missing categorical features with the most frequent value in the training set.
+     """

      def __init__(self, verbosity: int = 0, minimum_cat_count=2, float_dtype: str = "float32", **kwargs):
          generators = [
@@ -62,13 +85,22 @@ class ContinuousAndCategoricalFeatureGenerator(PipelineFeatureGenerator):
          # PipelineFeatureGenerator does not use transform() inside fit_transform(), so we need to override both methods
          if isinstance(X, TimeSeriesDataFrame):
              X = pd.DataFrame(X)
-         return self._convert_numerical_columns_to_float(super().fit_transform(X, *args, **kwargs))
+         transformed = self._convert_numerical_columns_to_float(super().fit_transform(X, *args, **kwargs))
+         # Ignore the '__dummy__' feature generated by PipelineFeatureGenerator if none of the features are informative
+         return transformed.drop(columns=["__dummy__"], errors="ignore")


  class TimeSeriesFeatureGenerator:
      """Takes care of preprocessing for static_features and past/known covariates.

      All covariates & static features are converted into either float32 or categorical dtype.
+
+     Missing values in the target column are left as-is but missing values in static features & covariates are imputed.
+     Imputation logic is as follows:
+     1. For all categorical columns (static, past, known), we fill missing values with the mode of the training set.
+     2. For real static features, we impute missing values with the median of the training set.
+     3. For real covariates (past, known), we ffill + bfill within each time series. If for some time series all
+        covariate values are missing, we fill them with the median of the training set.
      """

      def __init__(self, target: str, known_covariates_names: List[str], float_dtype: str = "float32"):
@@ -82,6 +114,8 @@ class TimeSeriesFeatureGenerator:
          # Cat features with cat_count=1 are fine in static_features since they are repeated for all time steps in a TS
          self.static_feature_pipeline = ContinuousAndCategoricalFeatureGenerator(minimum_cat_count=1)
          self.covariate_metadata: CovariateMetadata = None
+         self._train_covariates_real_median: Optional[pd.Series] = None
+         self._train_static_real_median: Optional[pd.Series] = None

      @property
      def required_column_names(self) -> List[str]:
@@ -129,6 +163,7 @@ class TimeSeriesFeatureGenerator:
              logger.info("\tstatic_features:")
              static_features_cat, static_features_real = self._detect_and_log_column_types(static_features_df)
              ignored_static_features = data.static_features.columns.difference(self.static_feature_pipeline.features_in)
+             self._train_static_real_median = data.static_features[static_features_real].median()
          else:
              static_features_cat = []
              static_features_real = []
@@ -154,6 +189,7 @@ class TimeSeriesFeatureGenerator:
              static_features_cat=static_features_cat,
              static_features_real=static_features_real,
          )
+         self._train_covariates_real_median = data[self.covariate_metadata.covariates_real].median()
          self._is_fit = True

      def transform(self, data: TimeSeriesDataFrame, data_frame_name: str = "data") -> TimeSeriesDataFrame:
@@ -180,10 +216,26 @@ class TimeSeriesFeatureGenerator:
              if data.static_features is None:
                  raise ValueError(f"Provided {data_frame_name} must contain static_features")
              static_features = self.static_feature_pipeline.transform(data.static_features)
+             static_real_names = self.covariate_metadata.static_features_real
+             # Fill missing static_features_real with the median of the training set
+             if static_real_names and static_features[static_real_names].isna().any(axis=None):
+                 static_features[static_real_names] = static_features[static_real_names].fillna(
+                     self._train_static_real_median
+                 )
          else:
              static_features = None

-         return TimeSeriesDataFrame(pd.concat(dfs, axis=1), static_features=static_features)
+         ts_df = TimeSeriesDataFrame(pd.concat(dfs, axis=1), static_features=static_features)
+
+         covariates_names = self.covariate_metadata.covariates
+         if len(covariates_names) > 0:
+             # ffill + bfill covariates that have at least some observed values
+             ts_df[covariates_names] = ts_df[covariates_names].fill_missing_values()
+             # If for some items covariates consist completely of NaNs, fill them with median of training data
+             if ts_df[covariates_names].isna().any(axis=None):
+                 ts_df[covariates_names] = ts_df[covariates_names].fillna(self._train_covariates_real_median)
+
+         return ts_df

      def transform_future_known_covariates(
          self, known_covariates: Optional[TimeSeriesDataFrame]
@@ -194,7 +246,13 @@ class TimeSeriesFeatureGenerator:
              self._check_required_columns_are_present(
                  known_covariates, required_column_names=self.known_covariates_names, data_frame_name="known_covariates"
              )
-             return TimeSeriesDataFrame(self.known_covariates_pipeline.transform(known_covariates))
+             known_covariates = TimeSeriesDataFrame(self.known_covariates_pipeline.transform(known_covariates))
+             # ffill + bfill covariates that have at least some observed values
+             known_covariates = known_covariates.fill_missing_values()
+             # If for some items covariates consist completely of NaNs, fill them with median of training data
+             if known_covariates.isna().any(axis=None):
+                 known_covariates = known_covariates.fillna(self._train_covariates_real_median)
+             return known_covariates
          else:
              return None
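The imputation order documented above for real covariates (per-series ffill + bfill first, then the training-set median for covariates that are entirely missing) can be sketched in plain pandas; names are illustrative, and the grouped ffill/bfill below plays the role of `fill_missing_values` in the diff.

# Sketch of the covariate imputation order with assumed index/column names.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([["A", "B"], range(3)], names=["item_id", "t"])
covariates = pd.DataFrame({"price": [np.nan, 2.0, np.nan, np.nan, np.nan, np.nan]}, index=idx)

train_covariates_median = pd.Series({"price": 2.0})  # stored at fit time in the diff

covariates = covariates.groupby(level="item_id", sort=False, group_keys=False).ffill()
covariates = covariates.groupby(level="item_id", sort=False, group_keys=False).bfill()
if covariates.isna().any(axis=None):  # item "B" has no observed values at all
    covariates = covariates.fillna(train_covariates_median)
print(covariates)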
src/autogluon/timeseries/version.py
@@ -1,3 +1,3 @@
  """This is the autogluon version file."""
- __version__ = '1.0.1b20240329'
+ __version__ = '1.0.1b20240330'
  __lite__ = False
src/autogluon.timeseries.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: autogluon.timeseries
- Version: 1.0.1b20240329
+ Version: 1.0.1b20240330
  Summary: AutoML for Image, Text, and Tabular Data
  Home-page: https://github.com/autogluon/autogluon
  Author: AutoGluon Community
src/autogluon.timeseries.egg-info/requires.txt
@@ -16,13 +16,13 @@ utilsforecast<0.0.11,>=0.0.10
  tqdm<5,>=4.38
  orjson~=3.9
  tensorboard<3,>=2.9
- autogluon.core[raytune]==1.0.1b20240329
- autogluon.common==1.0.1b20240329
- autogluon.tabular[catboost,lightgbm,xgboost]==1.0.1b20240329
+ autogluon.core[raytune]==1.0.1b20240330
+ autogluon.common==1.0.1b20240330
+ autogluon.tabular[catboost,lightgbm,xgboost]==1.0.1b20240330

  [all]
- optimum[onnxruntime]<1.18,>=1.17
  optimum[nncf,openvino]<1.18,>=1.17
+ optimum[onnxruntime]<1.18,>=1.17

  [chronos-onnx]
  optimum[onnxruntime]<1.18,>=1.17