autogluon.timeseries 1.0.1b20240405__py3-none-any.whl → 1.0.1b20240406__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (19)
  1. autogluon/timeseries/learner.py +70 -1
  2. autogluon/timeseries/models/abstract/abstract_timeseries_model.py +14 -4
  3. autogluon/timeseries/models/autogluon_tabular/mlforecast.py +7 -1
  4. autogluon/timeseries/models/gluonts/abstract_gluonts.py +213 -63
  5. autogluon/timeseries/models/gluonts/torch/models.py +13 -0
  6. autogluon/timeseries/models/multi_window/multi_window_model.py +12 -0
  7. autogluon/timeseries/predictor.py +133 -2
  8. autogluon/timeseries/trainer/abstract_trainer.py +161 -8
  9. autogluon/timeseries/utils/features.py +118 -2
  10. autogluon/timeseries/version.py +1 -1
  11. {autogluon.timeseries-1.0.1b20240405.dist-info → autogluon.timeseries-1.0.1b20240406.dist-info}/METADATA +4 -4
  12. {autogluon.timeseries-1.0.1b20240405.dist-info → autogluon.timeseries-1.0.1b20240406.dist-info}/RECORD +19 -19
  13. /autogluon.timeseries-1.0.1b20240405-py3.8-nspkg.pth → /autogluon.timeseries-1.0.1b20240406-py3.8-nspkg.pth +0 -0
  14. {autogluon.timeseries-1.0.1b20240405.dist-info → autogluon.timeseries-1.0.1b20240406.dist-info}/LICENSE +0 -0
  15. {autogluon.timeseries-1.0.1b20240405.dist-info → autogluon.timeseries-1.0.1b20240406.dist-info}/NOTICE +0 -0
  16. {autogluon.timeseries-1.0.1b20240405.dist-info → autogluon.timeseries-1.0.1b20240406.dist-info}/WHEEL +0 -0
  17. {autogluon.timeseries-1.0.1b20240405.dist-info → autogluon.timeseries-1.0.1b20240406.dist-info}/namespace_packages.txt +0 -0
  18. {autogluon.timeseries-1.0.1b20240405.dist-info → autogluon.timeseries-1.0.1b20240406.dist-info}/top_level.txt +0 -0
  19. {autogluon.timeseries-1.0.1b20240405.dist-info → autogluon.timeseries-1.0.1b20240406.dist-info}/zip-safe +0 -0
@@ -570,7 +570,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         Valid preset values:
 
         * "auto": Performs HPO via bayesian optimization search on GluonTS-backed neural forecasting models and
-          random search on other models using local scheduler.
+          random search on other models using local scheduler.
         * "random": Performs HPO via random search.
 
         You can also provide a dict to specify searchers and schedulers
@@ -893,6 +893,137 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         logger.info(json.dumps(scores_dict, indent=4))
         return scores_dict
 
+    def feature_importance(
+        self,
+        data: Optional[Union[TimeSeriesDataFrame, pd.DataFrame, Path, str]] = None,
+        model: Optional[str] = None,
+        metric: Optional[Union[str, TimeSeriesScorer]] = None,
+        features: Optional[List[str]] = None,
+        time_limit: Optional[float] = None,
+        method: Literal["naive", "permutation"] = "permutation",
+        subsample_size: int = 50,
+        num_iterations: Optional[int] = None,
+        random_seed: Optional[int] = 123,
+        relative_scores: bool = False,
+        include_confidence_band: bool = True,
+        confidence_level: float = 0.99,
+    ):
+        """
+        Calculates feature importance scores for the given model by replacing each feature with a shuffled version of itself
+        (also known as permutation feature importance) or with a constant value representing the median or mode of the feature,
+        and computing the relative decrease in the model's predictive performance.
+
+        A feature's importance score represents the performance drop that results when the model makes predictions on a perturbed
+        copy of the data where this feature's values have been randomly shuffled across rows. A feature score of 0.01 would indicate
+        that the predictive performance dropped by 0.01 when the feature was randomly shuffled or replaced. The higher a feature's
+        score, the more important the feature is to the model's performance.
+
+        If a feature has a negative score, this means that the feature is likely harmful to the final model, and a model trained with
+        the feature removed would be expected to achieve better predictive performance. Note that calculating feature importance can
+        be a computationally expensive process, particularly if the model uses many features. In many cases, this can take longer than
+        the original model training. Roughly, the runtime equals the number of features in the data, multiplied by ``num_iterations``
+        (or 1 when ``method="naive"``), multiplied by the time taken when ``evaluate()`` is called on a dataset with
+        ``subsample_size`` items.
+
+        Parameters
+        ----------
+        data : TimeSeriesDataFrame, pd.DataFrame, Path or str, optional
+            The data to evaluate feature importances on. The last ``prediction_length`` time steps of each item in the data set
+            will be held out for prediction, and forecast accuracy will be calculated on these time steps.
+            More accurate feature importances will be obtained from new data that was held out during ``fit()``.
+
+            If ``known_covariates_names`` were specified when creating the predictor, ``data`` must include the columns
+            listed in ``known_covariates_names`` with the covariate values aligned with the target time series.
+            This data must contain the label column with the same column name as specified during ``fit()``.
+
+            If ``train_data`` used to train the predictor contained past covariates or static features, then ``data``
+            must also include them (with the same column names and dtypes).
+
+            If the provided data is an instance of pandas DataFrame, AutoGluon will attempt to automatically convert it
+            to a ``TimeSeriesDataFrame``. If a str or Path is passed, ``data`` will be loaded using the str value as the file path.
+
+            If ``data`` is not provided, then the validation (tuning) data provided during training (or the held-out data used for
+            validation if ``tuning_data`` was not explicitly provided to ``fit()``) will be used.
+        model : str, optional
+            Name of the model to evaluate. By default, the best model found during training
+            (the one with the highest validation score) will be used.
+        metric : str or TimeSeriesScorer, optional
+            Metric to be used for computing feature importance. If None, the ``eval_metric`` specified during initialization of
+            the ``TimeSeriesPredictor`` will be used.
+        features : List[str], optional
+            List of feature names for which feature importances are calculated and returned. By default, all feature importances
+            will be returned.
+        method : {"permutation", "naive"}, default = "permutation"
+            Method to be used for computing feature importance.
+
+            * ``naive``: computes feature importance by replacing the values of each feature with a constant value and computing
+              feature importances as the relative improvement in the evaluation metric. The constant value is the median for
+              real-valued features and the mode for categorical features, for both covariates and static features, obtained from
+              the feature values in the provided ``data``.
+            * ``permutation``: computes feature importance by naively shuffling the values of the feature across different items
+              and time steps. Each feature is shuffled ``num_iterations`` times and feature importances are computed as the
+              relative improvement in the evaluation metric. Refer to https://explained.ai/rf-importance/ for an explanation of
+              permutation importance.
+
+        subsample_size : int, default = 50
+            The number of items to sample from ``data`` when computing feature importance. Larger values increase the accuracy of
+            the feature importance scores. Runtime scales linearly with ``subsample_size``.
+        time_limit : float, optional
+            Time in seconds to limit the calculation of feature importance. If None, feature importance will be computed without
+            early stopping. If ``method="permutation"``, a minimum of one full shuffle set will always be evaluated: if evaluating
+            a single shuffle set takes longer than ``time_limit``, the method will run for the duration of one shuffle set
+            evaluation regardless of ``time_limit``.
+        num_iterations : int, optional
+            The number of different iterations of the data that are evaluated. If ``method="permutation"``, this will be interpreted
+            as the number of shuffle sets (equivalent to ``num_shuffle_sets`` in :meth:`TabularPredictor.feature_importance`). If
+            ``method="naive"``, the constant replacement approach is repeated ``num_iterations`` times, and a different subsample of
+            data (of size ``subsample_size``) will be taken in each iteration.
+            Default is 1 for ``method="naive"`` and 5 for ``method="permutation"``. The value will be ignored if ``method="naive"``
+            and the subsample size is greater than the number of items in ``data``, as additional iterations would be redundant.
+            Larger values will increase the quality of the importance evaluation.
+            It is generally recommended to increase ``subsample_size`` before increasing ``num_iterations``.
+            Runtime scales linearly with ``num_iterations``.
+        random_seed : int or None, default = 123
+            If provided, fixes the seed of the random number generator for all models. This guarantees reproducible
+            results for feature importance.
+        relative_scores : bool, default = False
+            By default, this method will return the expected average *absolute* improvement in the eval metric due to the feature.
+            If True, the statistics will instead be computed over the *relative* (percentage) improvements.
+        include_confidence_band : bool, default = True
+            If True, the returned DataFrame will include two additional columns specifying a confidence interval for the true
+            underlying importance value of each feature. Increasing ``subsample_size`` and ``num_iterations`` will tighten the
+            confidence interval.
+        confidence_level : float, default = 0.99
+            This argument is only considered when ``include_confidence_band=True``, and can be used to specify the confidence level
+            used for constructing confidence intervals. For example, if ``confidence_level`` is set to 0.99, then the returned
+            DataFrame will include columns ``p99_high`` and ``p99_low``, which indicate that the true feature importance will be
+            between ``p99_high`` and ``p99_low`` 99% of the time (99% confidence interval). More generally, if
+            ``confidence_level`` = 0.XX, then the columns containing the XX% confidence interval will be named ``pXX_high`` and
+            ``pXX_low``.
+
+        Returns
+        -------
+        :class:`pd.DataFrame` of feature importance scores with 2 columns:
+            index: The feature name.
+            'importance': The estimated feature importance score.
+            'stdev': The standard deviation of the feature importance score. NaN if ``num_iterations`` was too small to
+            compute a standard deviation.
+        """
+        if data is not None:
+            data = self._check_and_prepare_data_frame(data)
+            self._check_data_for_evaluation(data)
+
+        fi_df = self._learner.get_feature_importance(
+            data=data,
+            model=model,
+            metric=metric,
+            features=features,
+            time_limit=time_limit,
+            method=method,
+            subsample_size=subsample_size,
+            num_iterations=num_iterations,
+            random_seed=random_seed,
+            relative_scores=relative_scores,
+            include_confidence_band=include_confidence_band,
+            confidence_level=confidence_level,
+        )
+        return fi_df
+
     @classmethod
     def _load_version_file(cls, path: str) -> str:
         version_file_path = os.path.join(path, cls._predictor_version_file_name)
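
Note: a minimal usage sketch of the new ``feature_importance`` API (the file name and argument values below are illustrative placeholders, not part of this diff):

    from autogluon.timeseries import TimeSeriesPredictor

    # "train.csv" is a placeholder; fit() also accepts a TimeSeriesDataFrame directly.
    predictor = TimeSeriesPredictor(prediction_length=24).fit("train.csv")

    # Permutation importance over up to 50 sampled items, with 99% confidence bands.
    fi = predictor.feature_importance(method="permutation", subsample_size=50)
    print(fi)  # index: feature name; columns include importance, stdev, p99_low, p99_high

If ``data`` is omitted, as here, the held-out validation data from training is used.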
@@ -1227,7 +1358,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         }
 
         past_data, known_covariates = test_data.get_model_inputs_for_scoring(
-            prediction_length=self.prediction_length, known_covariates_names=trainer.metadata.known_covariates_real
+            prediction_length=self.prediction_length, known_covariates_names=trainer.metadata.known_covariates
         )
         pred_proba_dict_test: Dict[str, TimeSeriesDataFrame] = trainer.get_model_pred_dict(
             base_models, data=past_data, known_covariates=known_covariates
@@ -23,7 +23,11 @@ from autogluon.timeseries.models.abstract import AbstractTimeSeriesModel
 from autogluon.timeseries.models.ensemble import AbstractTimeSeriesEnsembleModel, TimeSeriesGreedyEnsemble
 from autogluon.timeseries.models.presets import contains_searchspace
 from autogluon.timeseries.splitter import AbstractWindowSplitter, ExpandingWindowSplitter
-from autogluon.timeseries.utils.features import CovariateMetadata
+from autogluon.timeseries.utils.features import (
+    ConstantReplacementFeatureImportanceTransform,
+    CovariateMetadata,
+    PermutationFeatureImportanceTransform,
+)
 from autogluon.timeseries.utils.warning_filters import disable_tqdm
 
 logger = logging.getLogger("autogluon.timeseries.trainer")
@@ -242,6 +246,9 @@ class SimpleAbstractTrainer:
 class AbstractTimeSeriesTrainer(SimpleAbstractTrainer):
     _cached_predictions_filename = "cached_predictions.pkl"
 
+    max_rel_importance_score: float = 1e5
+    eps_abs_importance_score: float = 1e-5
+
     def __init__(
         self,
         path: str,
@@ -763,7 +770,7 @@ class AbstractTimeSeriesTrainer(SimpleAbstractTrainer):
 
         if data is not None:
             past_data, known_covariates = data.get_model_inputs_for_scoring(
-                prediction_length=self.prediction_length, known_covariates_names=self.metadata.known_covariates_real
+                prediction_length=self.prediction_length, known_covariates_names=self.metadata.known_covariates
             )
             logger.info(
                 "Additional data provided, testing on additional data. Resulting leaderboard "
@@ -849,7 +856,9 @@ class AbstractTimeSeriesTrainer(SimpleAbstractTrainer):
                 unpersisted_models.append(model)
         return unpersisted_models
 
-    def _get_model_for_prediction(self, model: Optional[Union[str, AbstractTimeSeriesModel]] = None) -> str:
+    def _get_model_for_prediction(
+        self, model: Optional[Union[str, AbstractTimeSeriesModel]] = None, verbose: bool = True
+    ) -> str:
         """Given an optional identifier or model object, return the name of the model with which to predict.
 
         If the model is not provided, this method will default to the best model according to the validation score.
@@ -858,10 +867,11 @@ class AbstractTimeSeriesTrainer(SimpleAbstractTrainer):
            if self.model_best is None:
                best_model_name: str = self.get_model_best()
                self.model_best = best_model_name
-                logger.info(
-                    f"Model not specified in predict, will default to the model with the "
-                    f"best validation score: {self.model_best}",
-                )
+                if verbose:
+                    logger.info(
+                        f"Model not specified in predict, will default to the model with the "
+                        f"best validation score: {self.model_best}",
+                    )
            return self.model_best
        else:
            if isinstance(model, AbstractTimeSeriesModel):
@@ -923,7 +933,7 @@ class AbstractTimeSeriesTrainer(SimpleAbstractTrainer):
         use_cache: bool = True,
     ) -> Dict[str, float]:
         past_data, known_covariates = data.get_model_inputs_for_scoring(
-            prediction_length=self.prediction_length, known_covariates_names=self.metadata.known_covariates_real
+            prediction_length=self.prediction_length, known_covariates_names=self.metadata.known_covariates
         )
         predictions = self.predict(data=past_data, known_covariates=known_covariates, model=model, use_cache=use_cache)
         if not isinstance(metrics, list):  # a single metric is provided
@@ -936,6 +946,149 @@ class AbstractTimeSeriesTrainer(SimpleAbstractTrainer):
             )
         return scores_dict
 
+    def get_feature_importance(
+        self,
+        data: TimeSeriesDataFrame,
+        features: List[str],
+        model: Optional[Union[str, AbstractTimeSeriesModel]] = None,
+        metric: Optional[Union[str, TimeSeriesScorer]] = None,
+        time_limit: Optional[float] = None,
+        method: Literal["naive", "permutation"] = "permutation",
+        subsample_size: int = 50,
+        num_iterations: int = 1,
+        random_seed: Optional[int] = None,
+        relative_scores: bool = False,
+        include_confidence_band: bool = True,
+        confidence_level: float = 0.99,
+    ) -> pd.DataFrame:
+        assert method in ["naive", "permutation"], f"Invalid feature importance method {method}."
+        metric = check_get_evaluation_metric(metric) if metric is not None else self.eval_metric
+
+        logger.info("Computing feature importance")
+
+        # seed everything if random_seed is provided
+        if random_seed is not None:
+            seed_everything(random_seed)
+
+        # start timer and cap subsample size if it's greater than the number of items in the provided data set
+        time_start = time.time()
+        if subsample_size > data.num_items:
+            logger.info(
+                f"Subsample_size {subsample_size} is larger than the number of items in the data and will be ignored"
+            )
+            subsample_size = data.num_items
+
+        # set default number of iterations and cap iterations if the number of items in the data is smaller
+        # than the subsample size for the naive method
+        num_iterations = num_iterations or (5 if method == "permutation" else 1)
+        if method == "naive" and data.num_items <= subsample_size:
+            num_iterations = 1
+
+        # initialize the importance transform
+        importance_transform_type = {
+            "permutation": PermutationFeatureImportanceTransform,
+            "naive": ConstantReplacementFeatureImportanceTransform,
+        }.get(method)
+        importance_transform = importance_transform_type(
+            covariate_metadata=self.metadata,
+            prediction_length=self.prediction_length,
+            random_seed=random_seed,
+        )
+
+        # if model is not provided, use the best model according to the validation score
+        model = self._get_model_for_prediction(model, verbose=False)
+
+        # persist trainer to speed up repeated inference
+        persisted_models = self.persist(model_names=[model], with_ancestors=True)
+
+        importance_samples = defaultdict(list)
+        for n in range(num_iterations):
+            if subsample_size < data.num_items:
+                item_ids_sampled = data.item_ids.to_series().sample(subsample_size)  # noqa
+                data_sample = data.query("item_id in @item_ids_sampled")
+            else:
+                data_sample = data
+
+            base_score = self.evaluate(data=data_sample, model=model, metrics=metric, use_cache=False)[metric.name]
+
+            for feature in features:
+                # override importance for unused features
+                if not self._model_uses_feature(model, feature):
+                    continue
+                else:
+                    data_sample_replaced = importance_transform.transform(data_sample, feature_name=feature)
+                    score = self.evaluate(data=data_sample_replaced, model=model, metrics=metric, use_cache=False)[
+                        metric.name
+                    ]
+
+                    importance = base_score - score
+                    if relative_scores:
+                        importance /= np.abs(base_score - self.eps_abs_importance_score)
+                        importance = min(self.max_rel_importance_score, importance)
+
+                    importance_samples[feature].append(importance)
+
+            if time_limit is not None and time.time() - time_start > time_limit:
+                logger.info(f"Time limit reached, stopping feature importance computation after {n} iterations")
+                break
+
+        self.unpersist(model_names=persisted_models)
+
+        importance_df = (
+            (
+                pd.DataFrame(importance_samples)
+                .agg(["mean", "std", "count"])
+                .T.rename(columns={"mean": "importance", "std": "stdev", "count": "n"})
+            )
+            if len(importance_samples) > 0
+            else pd.DataFrame(columns=["importance", "stdev", "n"])
+        )
+
+        if include_confidence_band:
+            importance_df = self._add_ci_to_feature_importance(importance_df, confidence_level=confidence_level)
+
+        return importance_df
+
+    def _model_uses_feature(self, model: Optional[Union[str, AbstractTimeSeriesModel]], feature: str) -> bool:
+        """Check if the given model uses the given feature."""
+        models_with_ancestors = set(self.get_minimum_model_set(model))
+
+        if feature in self.metadata.static_features:
+            return any(self.load_model(m).supports_static_features for m in models_with_ancestors)
+        elif feature in self.metadata.known_covariates:
+            return any(self.load_model(m).supports_known_covariates for m in models_with_ancestors)
+        elif feature in self.metadata.past_covariates:
+            return any(self.load_model(m).supports_past_covariates for m in models_with_ancestors)
+
+        return False
+
+    def _add_ci_to_feature_importance(
+        self, importance_df: pd.DataFrame, confidence_level: float = 0.99
+    ) -> pd.DataFrame:
+        """Add confidence intervals to the feature importance."""
+        import scipy.stats
+
+        if confidence_level <= 0.5 or confidence_level >= 1.0:
+            raise ValueError("confidence_level must lie between 0.5 and 1.0")
+        ci_str = "{:.0f}".format(confidence_level * 100)
+
+        alpha = 1 - confidence_level
+        importance_df[f"p{ci_str}_low"] = np.nan
+        importance_df[f"p{ci_str}_high"] = np.nan
+
+        for i in importance_df.index:
+            r = importance_df.loc[i]
+            importance, stdev, n = r["importance"], r["stdev"], r["n"]
+            if np.isnan(importance) or np.isnan(stdev) or np.isnan(n) or n <= 1:
+                continue
+
+            t_crit = scipy.stats.t.ppf(1 - alpha / 2, df=n - 1)
+
+            importance_df.loc[i, f"p{ci_str}_low"] = importance - t_crit * stdev / np.sqrt(n)
+            importance_df.loc[i, f"p{ci_str}_high"] = importance + t_crit * stdev / np.sqrt(n)
+
+        return importance_df
+
     def _predict_model(
         self,
         model: Union[str, AbstractTimeSeriesModel],
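
Note: the confidence band added in ``_add_ci_to_feature_importance`` above is a standard two-sided Student-t interval over the per-iteration importance samples. A self-contained sketch of the same arithmetic (the sample values are made up):

    import numpy as np
    import scipy.stats

    samples = np.array([0.012, 0.018, 0.009, 0.021, 0.015])  # importance scores from 5 shuffle sets
    n, mean, stdev = len(samples), samples.mean(), samples.std(ddof=1)

    alpha = 1 - 0.99  # confidence_level = 0.99
    t_crit = scipy.stats.t.ppf(1 - alpha / 2, df=n - 1)
    half_width = t_crit * stdev / np.sqrt(n)
    print(f"p99_low={mean - half_width:.4f}, p99_high={mean + half_width:.4f}")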
@@ -1,8 +1,9 @@
 import logging
 import reprlib
 from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
+from typing import Any, List, Literal, Optional, Tuple
 
+import numpy as np
 import pandas as pd
 
 from autogluon.common.features.types import R_FLOAT, R_INT
@@ -12,7 +13,7 @@ from autogluon.features.generators import (
     IdentityFeatureGenerator,
     PipelineFeatureGenerator,
 )
-from autogluon.timeseries import TimeSeriesDataFrame
+from autogluon.timeseries.dataset.ts_dataframe import ITEMID, TimeSeriesDataFrame
 
 logger = logging.getLogger(__name__)
 
@@ -28,6 +29,10 @@ class CovariateMetadata:
     past_covariates_real: List[str] = field(default_factory=list)
     past_covariates_cat: List[str] = field(default_factory=list)
 
+    @property
+    def static_features(self) -> List[str]:
+        return self.static_features_cat + self.static_features_real
+
     @property
     def known_covariates(self) -> List[str]:
         return self.known_covariates_cat + self.known_covariates_real
@@ -48,6 +53,18 @@ class CovariateMetadata:
     def covariates_cat(self) -> List[str]:
         return self.known_covariates_cat + self.past_covariates_cat
 
+    @property
+    def real_features(self) -> List[str]:
+        return self.static_features_real + self.covariates_real
+
+    @property
+    def cat_features(self) -> List[str]:
+        return self.static_features_cat + self.covariates_cat
+
+    @property
+    def all_features(self) -> List[str]:
+        return self.static_features + self.covariates
+
 
 class ContinuousAndCategoricalFeatureGenerator(PipelineFeatureGenerator):
     """Generates categorical and continuous features for time series models.
@@ -284,3 +301,102 @@ class TimeSeriesFeatureGenerator:
         raise ValueError(
             f"{len(missing_columns)} columns are missing from {data_frame_name}: {reprlib.repr(missing_columns.to_list())}"
         )
+
+
+class AbstractFeatureImportanceTransform:
+    """Abstract class for transforms that replace a given feature with dummy or shuffled values,
+    for use in feature importance operations.
+    """
+
+    def __init__(
+        self,
+        covariate_metadata: CovariateMetadata,
+        prediction_length: int,
+        **kwargs,
+    ):
+        self.covariate_metadata: CovariateMetadata = covariate_metadata
+        self.prediction_length: int = prediction_length
+
+    def _transform_series(self, data: pd.Series, is_categorical: bool, **kwargs) -> TimeSeriesDataFrame:
+        """Transforms a series with the same index as the pandas DataFrame"""
+        raise NotImplementedError
+
+    def transform(self, data: TimeSeriesDataFrame, feature_name: str, **kwargs) -> TimeSeriesDataFrame:
+        if feature_name not in self.covariate_metadata.all_features:
+            raise ValueError(f"Target feature {feature_name} not found in covariate metadata")
+
+        # feature transform works on a shallow copy of the main time series data frame
+        # but a deep copy of the static features.
+        data = data.copy(deep=False)
+
+        is_categorical = feature_name in self.covariate_metadata.cat_features
+
+        if feature_name in self.covariate_metadata.past_covariates:
+            # we'll have to work on the history of the data alone
+            data[feature_name] = data[feature_name].copy()
+            feature_data = data[feature_name].groupby(level=ITEMID, sort=False).head(-self.prediction_length)
+            data[feature_name].update(self._transform_series(feature_data, is_categorical=is_categorical))
+        elif feature_name in self.covariate_metadata.static_features:
+            feature_data = data.static_features[feature_name].copy()
+            feature_data.reset_index(drop=True, inplace=True)
+            data.static_features[feature_name] = self._transform_static_series(
+                feature_data, is_categorical=is_categorical
+            )
+        else:  # known covariates
+            data[feature_name] = self._transform_series(data[feature_name], is_categorical=is_categorical)
+
+        return data
+
+
+class PermutationFeatureImportanceTransform(AbstractFeatureImportanceTransform):
+    """Naively shuffles a given feature."""
+
+    def __init__(
+        self,
+        covariate_metadata: CovariateMetadata,
+        prediction_length: int,
+        random_seed: Optional[int] = None,
+        shuffle_type: Literal["itemwise", "naive"] = "itemwise",
+        **kwargs,
+    ):
+        super().__init__(covariate_metadata, prediction_length, **kwargs)
+        self.shuffle_type = shuffle_type
+        self.random_seed = random_seed
+
+    def _transform_static_series(self, feature_data: pd.Series, is_categorical: bool) -> Any:
+        return feature_data.sample(frac=1, random_state=self.random_seed).values
+
+    def _transform_series(self, feature_data: pd.Series, is_categorical: bool) -> pd.Series:
+        # set random state once to shuffle 'independently' for different items
+        rng = np.random.RandomState(self.random_seed)
+
+        if self.shuffle_type == "itemwise":
+            return feature_data.groupby(level=ITEMID, sort=False).transform(
+                lambda x: x.sample(frac=1, random_state=rng).values
+            )
+        elif self.shuffle_type == "naive":
+            return pd.Series(feature_data.sample(frac=1, random_state=rng).values, index=feature_data.index)
+
+
+class ConstantReplacementFeatureImportanceTransform(AbstractFeatureImportanceTransform):
+    """Replaces a target feature with the median if it's a real-valued feature, and the mode if it's a
+    categorical feature."""
+
+    def __init__(
+        self,
+        covariate_metadata: CovariateMetadata,
+        prediction_length: int,
+        real_value_aggregation: Literal["mean", "median"] = "mean",
+        **kwargs,
+    ):
+        super().__init__(covariate_metadata, prediction_length, **kwargs)
+        self.real_value_aggregation = real_value_aggregation
+
+    def _transform_static_series(self, feature_data: pd.Series, is_categorical: bool) -> Any:
+        return feature_data.mode()[0] if is_categorical else feature_data.agg(self.real_value_aggregation)
+
+    def _transform_series(self, feature_data: pd.Series, is_categorical: bool) -> pd.Series:
+        if is_categorical:
+            return feature_data.groupby(level=ITEMID, sort=False).transform(lambda x: x.mode()[0])
+        else:
+            return feature_data.groupby(level=ITEMID, sort=False).transform(self.real_value_aggregation)
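
Note: the ``itemwise`` shuffle in ``PermutationFeatureImportanceTransform`` permutes a covariate within each item while leaving the (item_id, timestamp) index untouched. A self-contained illustration with toy data (the item and column names are made up):

    import numpy as np
    import pandas as pd

    idx = pd.MultiIndex.from_product(
        [["A", "B"], pd.date_range("2024-01-01", periods=4)], names=["item_id", "timestamp"]
    )
    feature = pd.Series(np.arange(8.0), index=idx, name="covariate")

    rng = np.random.RandomState(123)
    shuffled = feature.groupby(level="item_id", sort=False).transform(
        lambda x: x.sample(frac=1, random_state=rng).values
    )
    print(shuffled)  # values permuted within each item; index and item boundaries preserved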
@@ -1,3 +1,3 @@
 """This is the autogluon version file."""
-__version__ = '1.0.1b20240405'
+__version__ = '1.0.1b20240406'
 __lite__ = False
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: autogluon.timeseries
-Version: 1.0.1b20240405
+Version: 1.0.1b20240406
 Summary: AutoML for Image, Text, and Tabular Data
 Home-page: https://github.com/autogluon/autogluon
 Author: AutoGluon Community
@@ -52,9 +52,9 @@ Requires-Dist: utilsforecast <0.0.11,>=0.0.10
 Requires-Dist: tqdm <5,>=4.38
 Requires-Dist: orjson ~=3.9
 Requires-Dist: tensorboard <3,>=2.9
-Requires-Dist: autogluon.core[raytune] ==1.0.1b20240405
-Requires-Dist: autogluon.common ==1.0.1b20240405
-Requires-Dist: autogluon.tabular[catboost,lightgbm,xgboost] ==1.0.1b20240405
+Requires-Dist: autogluon.core[raytune] ==1.0.1b20240406
+Requires-Dist: autogluon.common ==1.0.1b20240406
+Requires-Dist: autogluon.tabular[catboost,lightgbm,xgboost] ==1.0.1b20240406
 Provides-Extra: all
 Requires-Dist: optimum[nncf,openvino] <1.18,>=1.17 ; extra == 'all'
 Requires-Dist: optimum[onnxruntime] <1.18,>=1.17 ; extra == 'all'
@@ -1,10 +1,10 @@
-autogluon.timeseries-1.0.1b20240405-py3.8-nspkg.pth,sha256=cQGwpuGPqg1GXscIwt-7PmME1OnSpD-7ixkikJ31WAY,554
+autogluon.timeseries-1.0.1b20240406-py3.8-nspkg.pth,sha256=cQGwpuGPqg1GXscIwt-7PmME1OnSpD-7ixkikJ31WAY,554
 autogluon/timeseries/__init__.py,sha256=_CrLLc1fkjen7UzWoO0Os8WZoHOgvZbHKy46I8v_4k4,304
 autogluon/timeseries/evaluator.py,sha256=l642tYfTHsl8WVIq_vV6qhgAFVFr9UuZD7gLra3A_Kc,250
-autogluon/timeseries/learner.py,sha256=PpIemiI9rukHjH0J4i6qMjt6ejfj9sPiQhKwveIenlk,10820
-autogluon/timeseries/predictor.py,sha256=Z-pnKJ5zdrYw-TNEVR_FhC-yhZaDklWatEgVmCjx30k,71538
+autogluon/timeseries/learner.py,sha256=fPIV2p0BMWcZr5fwOkNsJrk8RxK-IYUH_VON3_YXKOQ,13750
+autogluon/timeseries/predictor.py,sha256=CbtYjj0XOHzl86gmz4NlF-C-AumwJrF_cdsKT6M6ql0,81011
 autogluon/timeseries/splitter.py,sha256=eghGwAAN2_cxGk5aJBILgjGWtLzjxJcytMy49gg_q18,3061
-autogluon/timeseries/version.py,sha256=MYBesC3NyWbXyZpk4dbUqmTB7gO6F6qdhjUR-_Mm8hQ,90
+autogluon/timeseries/version.py,sha256=hLs5RwjSILRXf-EmvoO1fvxeoOqWXitaB0VO3Ptt9-c,90
 autogluon/timeseries/configs/__init__.py,sha256=BTtHIPCYeGjqgOcvqb8qPD4VNX-ICKOg6wnkew1cPOE,98
 autogluon/timeseries/configs/presets_configs.py,sha256=ZVV8BsnGnnHPgjBtJBqF-H35MYUdzRBQ8FP7zA3_11g,1949
 autogluon/timeseries/dataset/__init__.py,sha256=UvnhAN5tjgxXTHoZMQDy64YMDj4Xxa68yY7NP4vAw0o,81
@@ -17,10 +17,10 @@ autogluon/timeseries/metrics/utils.py,sha256=eJ63TCR-UwbeJ1c2Qm7B2q-8B3sFthPgioo
 autogluon/timeseries/models/__init__.py,sha256=HFjDOYKQWaGlgQWiLlOvfwE2dH0uDmeKJFC8GDL987c,1271
 autogluon/timeseries/models/presets.py,sha256=p36ROcuOnixgGsI1zBdr9VM-MH2pKCiJCS2Ofb4xT8o,11243
 autogluon/timeseries/models/abstract/__init__.py,sha256=wvDsQAZIV0N3AwBeMaGItoQ82trEfnT-nol2AAOIxBg,102
-autogluon/timeseries/models/abstract/abstract_timeseries_model.py,sha256=VipAL3qqrG-s5-SqEtYPtubYznBDdkfLyfPbgSXjfZk,23009
+autogluon/timeseries/models/abstract/abstract_timeseries_model.py,sha256=aUXlX1ozc5XghinR5ahGIX94MkhBmmYvgmqmMib5BhU,23391
 autogluon/timeseries/models/abstract/model_trial.py,sha256=ENPg_7nsdxIvaNM0o0UShZ3x8jFlRmwRc5m0fGPC0TM,3720
 autogluon/timeseries/models/autogluon_tabular/__init__.py,sha256=r9i6jWcyeLHYClkcMSKRVsfrkBUMxpDrTATNTBc_qgQ,136
-autogluon/timeseries/models/autogluon_tabular/mlforecast.py,sha256=kRddJeP3WP05E0dp-NIVQ9VU_lkRVgOAPNhL8qDjHV0,30834
+autogluon/timeseries/models/autogluon_tabular/mlforecast.py,sha256=9gNuCWf8vVfVPiXppwG5l_3mLbZZ6i5pHKTM-rSk5Ww,30977
 autogluon/timeseries/models/autogluon_tabular/utils.py,sha256=4-gTrBtizxeMVQlsuscugPqw9unaXWXhS1TVVssfzYY,2125
 autogluon/timeseries/models/chronos/__init__.py,sha256=wT77HzTtmQxW3sw2k0mA5Ot6PSHivX-Uvn5fjM05EU4,60
 autogluon/timeseries/models/chronos/model.py,sha256=5DqxDrm2zO2lShmTviZOtlKUsjpcZSsIac2G-sVSfDI,14873
@@ -29,21 +29,21 @@ autogluon/timeseries/models/ensemble/__init__.py,sha256=kFr11Gmt7lQJu9Rr8HuIPphQ
 autogluon/timeseries/models/ensemble/abstract_timeseries_ensemble.py,sha256=tifETwmiEGt-YtQ9eNK7ojJ3fBvtFMUJvisbfkIJ7gw,3393
 autogluon/timeseries/models/ensemble/greedy_ensemble.py,sha256=5HvZuW5osgsZg3V69k82nKEOy_YgeH1JTfQa7F3cU7s,7220
 autogluon/timeseries/models/gluonts/__init__.py,sha256=M8PV9ZE4WpteScMobXM6RH1Udb1AZiHHtj2g5GQL3TU,329
-autogluon/timeseries/models/gluonts/abstract_gluonts.py,sha256=oFvykluISe43EZWtTddoF00hg66iZJYsYg_rUZz_ISs,25546
+autogluon/timeseries/models/gluonts/abstract_gluonts.py,sha256=X1l_MexAoyBNMGiJrWreHQHLDSmZV_OSrhjhJ7MA0JM,34348
 autogluon/timeseries/models/gluonts/torch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-autogluon/timeseries/models/gluonts/torch/models.py,sha256=UXxGBNAYQySLoLw95ZtbwH7R9-K3A5nh38KroW95wc0,19217
+autogluon/timeseries/models/gluonts/torch/models.py,sha256=PVDns7CnZtJTbPiCw-FJxahKrDjC-wj0VkwIGsodYY0,19930
 autogluon/timeseries/models/local/__init__.py,sha256=JyckWWgMG1BTIWJqFTW6e1O-eb0LPPOwtXwmb1ErohQ,756
 autogluon/timeseries/models/local/abstract_local_model.py,sha256=lota8MNpfgYC1PftM7sKcjx2gVCVq3K_D_dovBGqksg,11692
 autogluon/timeseries/models/local/naive.py,sha256=iwRcFMFmJKPWPbD9TWaIUS51oav69F_VAp6-jb_5SUE,7249
 autogluon/timeseries/models/local/npts.py,sha256=Bp74doKnfpGE8ywP4FWOCI_RwRMsmgocYDfGtq764DA,4143
 autogluon/timeseries/models/local/statsforecast.py,sha256=oDYKKM2LZXEQLhPLEgZZWhvSEC1iE1wBexpl8P-Cxwc,32991
 autogluon/timeseries/models/multi_window/__init__.py,sha256=Bq7AT2Jxdd4WNqmjTdzeqgNiwn1NCyWp4tBIWaM-zfI,60
-autogluon/timeseries/models/multi_window/multi_window_model.py,sha256=M_8G0UkB4TjcB03Q2-eA1FqIdNkqUK_Rp9YsK-wxZCI,10974
+autogluon/timeseries/models/multi_window/multi_window_model.py,sha256=Thge05cLytJoOpShE7g1MuNa-qlZWUrSvaO0aCbKQbA,11348
 autogluon/timeseries/trainer/__init__.py,sha256=lxiOT-Gc6BEnr_yWQqra85kEngeM_wtH2SCaRbmC_qE,170
-autogluon/timeseries/trainer/abstract_trainer.py,sha256=ulPCnBtXZhZLIcM6Z2epzlzSBcUYzEO2jV6a6UI8Hnc,52432
+autogluon/timeseries/trainer/abstract_trainer.py,sha256=2nPLskmbOGRzkj6ttX0tHVkj9h2Y72MHaZy7L78MBZQ,59100
 autogluon/timeseries/trainer/auto_trainer.py,sha256=psJFZBwWWPlLjNwAgvO4OUJXsRW1sTN2YS9a4pdoeoE,3344
 autogluon/timeseries/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-autogluon/timeseries/utils/features.py,sha256=K_b5c-Vv4vCVuzdid5-JWW7Hy7lYFhtM6e57Zw8dV1E,14206
+autogluon/timeseries/utils/features.py,sha256=OvBxLIWKR7fPOIlifonVKXUdaWazH_WbdLssJtFCpGs,19261
 autogluon/timeseries/utils/forecast.py,sha256=Thjt6yTPSe3V4s5cQ9UbW3ysTJb1lkqxtZiCqgBSt3w,1776
 autogluon/timeseries/utils/warning_filters.py,sha256=ngjmfv21zIwTG-7VNZT-NkaSR7ssnoNtUwcXCXANZ4A,2076
 autogluon/timeseries/utils/datetime/__init__.py,sha256=bTMR8jLh1LW55vHjbOr1zvWRMF_PqbvxpS-cUcNIDWI,173
@@ -51,11 +51,11 @@ autogluon/timeseries/utils/datetime/base.py,sha256=MsqIHY14m3QMjSwwtE7Uo1oNwepWU
 autogluon/timeseries/utils/datetime/lags.py,sha256=kcU4liKbHj7KP2ajNU-KLZ8OYSU35EgT4kJjZNSw0Zg,5875
 autogluon/timeseries/utils/datetime/seasonality.py,sha256=kgK_ukw2wCviEB7CZXRVC5HZpBJZu9IsRrvCJ9E_rOE,755
 autogluon/timeseries/utils/datetime/time_features.py,sha256=pROkYyxETQ8rHKfPGhf2paB73C7rWJ2Ui0cCswLqbBg,2562
-autogluon.timeseries-1.0.1b20240405.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
-autogluon.timeseries-1.0.1b20240405.dist-info/METADATA,sha256=Lg0FRQjWuVAeQ3Ye6q7HkwGPzd2WhCOYYcPty1hYd50,12524
-autogluon.timeseries-1.0.1b20240405.dist-info/NOTICE,sha256=7nPQuj8Kp-uXsU0S5so3-2dNU5EctS5hDXvvzzehd7E,114
-autogluon.timeseries-1.0.1b20240405.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-autogluon.timeseries-1.0.1b20240405.dist-info/namespace_packages.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
-autogluon.timeseries-1.0.1b20240405.dist-info/top_level.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
-autogluon.timeseries-1.0.1b20240405.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-autogluon.timeseries-1.0.1b20240405.dist-info/RECORD,,
+autogluon.timeseries-1.0.1b20240406.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
+autogluon.timeseries-1.0.1b20240406.dist-info/METADATA,sha256=YG12p0tq6vbCCp-72ac1u7YDqgMBes37Ki8YTOBtnH8,12524
+autogluon.timeseries-1.0.1b20240406.dist-info/NOTICE,sha256=7nPQuj8Kp-uXsU0S5so3-2dNU5EctS5hDXvvzzehd7E,114
+autogluon.timeseries-1.0.1b20240406.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+autogluon.timeseries-1.0.1b20240406.dist-info/namespace_packages.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
+autogluon.timeseries-1.0.1b20240406.dist-info/top_level.txt,sha256=giERA4R78OkJf2ijn5slgjURlhRPzfLr7waIcGkzYAo,10
+autogluon.timeseries-1.0.1b20240406.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+autogluon.timeseries-1.0.1b20240406.dist-info/RECORD,,