PyPI - autogluon.timeseries - Versions diffs - 1.0.1b20240304__py3-none-any.whl → 1.4.1b20251210__py3-none-any.whl - Mend - Supply Chain Defender

autogluon.timeseries 1.0.1b20240304__py3-none-any.whl → 1.4.1b20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of autogluon.timeseries might be problematic. Click here for more details.

Files changed (108) hide show

autogluon/timeseries/predictor.py CHANGED Viewed

@@ -5,46 +5,33 @@ import os
 import pprint
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Literal, Type, cast, overload
 import numpy as np
 import pandas as pd
-from autogluon.common.utils.deprecated_utils import Deprecated
-from autogluon.common.utils.log_utils import add_log_to_file, set_logger_verbosity
+from autogluon.common.utils.log_utils import (
+    add_log_to_file,
+    set_logger_verbosity,
+    warn_if_mlflow_autologging_is_enabled,
+)
 from autogluon.common.utils.system_info import get_ag_system_info
 from autogluon.common.utils.utils import check_saved_predictor_version, setup_outputdir
 from autogluon.core.utils.decorators import apply_presets
 from autogluon.core.utils.loaders import load_pkl, load_str
 from autogluon.core.utils.savers import save_pkl, save_str
 from autogluon.timeseries import __version__ as current_ag_version
-from autogluon.timeseries.configs import TIMESERIES_PRESETS_CONFIGS
-from autogluon.timeseries.dataset.ts_dataframe import ITEMID, TimeSeriesDataFrame
-from autogluon.timeseries.learner import AbstractLearner, TimeSeriesLearner
+from autogluon.timeseries.configs import get_predictor_presets
+from autogluon.timeseries.dataset import TimeSeriesDataFrame
+from autogluon.timeseries.learner import TimeSeriesLearner
 from autogluon.timeseries.metrics import TimeSeriesScorer, check_get_evaluation_metric
-from autogluon.timeseries.splitter import ExpandingWindowSplitter
-from autogluon.timeseries.trainer import AbstractTimeSeriesTrainer
+from autogluon.timeseries.trainer import TimeSeriesTrainer
+from autogluon.timeseries.utils.forecast import make_future_data_frame
 logger = logging.getLogger("autogluon.timeseries")
-class TimeSeriesPredictorDeprecatedMixin:
-    """Contains deprecated methods from TimeSeriesPredictor that shouldn't show up in API documentation."""
-    @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="evaluate")
-    def score(self, *args, **kwargs):
-        return self.evaluate(*args, **kwargs)
-    @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_best")
-    def get_model_best(self) -> str:
-        return self.model_best
-    @Deprecated(min_version_to_warn="0.8.3", min_version_to_error="1.2", version_to_remove="1.2", new="model_names")
-    def get_model_names(self) -> str:
-        return self.model_names()
-class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
+class TimeSeriesPredictor:
     """AutoGluon ``TimeSeriesPredictor`` predicts future values of multiple related time series.
     ``TimeSeriesPredictor`` provides probabilistic (quantile) multi-step-ahead forecasts for univariate time series.
@@ -69,7 +56,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         models that predict up to 3 days into the future from the most recent observation.
     freq : str, optional
         Frequency of the time series data (see `pandas documentation <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
-        for available frequencies). For example, ``"D"`` for daily data or ``"H"`` for hourly data.
+        for available frequencies). For example, ``"D"`` for daily data or ``"h"`` for hourly data.
         By default, the predictor will attempt to automatically infer the frequency from the data. This argument should
         only be set in two cases:
@@ -79,7 +66,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         If ``freq`` is provided when creating the predictor, all data passed to the predictor will be automatically
         resampled at this frequency.
-    eval_metric : Union[str, TimeSeriesScorer], default = "WQL"
+    eval_metric : str | TimeSeriesScorer, default = "WQL"
         Metric by which predictions will be ultimately evaluated on future test data. AutoGluon tunes hyperparameters
         in order to improve this metric on validation data, and ranks models (on validation data) according to this
         metric.
@@ -105,23 +92,29 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
     eval_metric_seasonal_period : int, optional
         Seasonal period used to compute some evaluation metrics such as mean absolute scaled error (MASE). Defaults to
         ``None``, in which case the seasonal period is computed based on the data frequency.
-    known_covariates_names: List[str], optional
+    horizon_weight : list[float], optional
+        Weight assigned to each time step in the forecast horizon when computing the ``eval_metric``. If provided, this
+        must be a list with ``prediction_length`` non-negative values, where at least some values are greater than zero.
+        AutoGluon will automatically normalize the weights so that they sum up to ``prediction_length``. By default, all
+        time steps in the forecast horizon have the same weight, which is equivalent to setting ``horizon_weight = [1] * prediction_length``.
+        This parameter only affects model selection and ensemble construction; it has no effect on the loss function of
+        the individual forecasting models.
+    known_covariates_names: list[str], optional
         Names of the covariates that are known in advance for all time steps in the forecast horizon. These are also
         known as dynamic features, exogenous variables, additional regressors or related time series. Examples of such
         covariates include holidays, promotions or weather forecasts.
-        Currently, only numeric (float of integer dtype) are supported.
         If ``known_covariates_names`` are provided, then:
-        - :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit`, :meth:`~autogluon.timeseries.TimeSeriesPredictor.evaluate`, and :meth:`~autogluon.timeseries.TimeSeriesPredictor.leaderboard` will expect a data frame with columns listed in ``known_covariates_names`` (in addition to the ``target`` column).
+        - :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit`, :meth:`~autogluon.timeseries.TimeSeriesPredictor.evaluate`, and :meth:`~autogluon.timeseries.TimeSeriesPredictor.leaderboard` will expect a dataframe with columns listed in ``known_covariates_names`` (in addition to the ``target`` column).
         - :meth:`~autogluon.timeseries.TimeSeriesPredictor.predict` will expect an additional keyword argument ``known_covariates`` containing the future values of the known covariates in ``TimeSeriesDataFrame`` format.
-    quantile_levels : List[float], optional
+    quantile_levels : list[float], optional
         List of increasing decimals that specifies which quantiles should be estimated when making distributional
         forecasts. Defaults to ``[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]``.
     path : str or pathlib.Path, optional
-        Path to the directory where models and intermediate outputs will be saved. Defaults to a timestamped folder
+        Path to the local directory where models and intermediate outputs will be saved. Defaults to a timestamped folder
         ``AutogluonModels/ag-[TIMESTAMP]`` that will be created in the working directory.
     verbosity : int, default = 2
         Verbosity levels range from 0 to 4 and control how much information is printed to stdout. Higher levels
@@ -131,10 +124,10 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         debug messages from AutoGluon and all logging in dependencies (GluonTS, PyTorch Lightning, AutoGluon-Tabular, etc.)
     log_to_file: bool, default = True
         Whether to save the logs into a file for later reference
-    log_file_path: Union[str, Path], default = "auto"
+    log_file_path: str | Path, default = "auto"
         File path to save the logs.
-        If auto, logs will be saved under `predictor_path/logs/predictor_log.txt`.
-        Will be ignored if `log_to_file` is set to False
+        If auto, logs will be saved under ``predictor_path/logs/predictor_log.txt``.
+        Will be ignored if ``log_to_file`` is set to False
     cache_predictions : bool, default = True
         If True, the predictor will cache and reuse the predictions made by individual models whenever
         :meth:`~autogluon.timeseries.TimeSeriesPredictor.predict`, :meth:`~autogluon.timeseries.TimeSeriesPredictor.leaderboard`,
@@ -145,32 +138,37 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         Alias for :attr:`target`.
     """
+    _learner_type = TimeSeriesLearner
     predictor_file_name = "predictor.pkl"
-    _predictor_version_file_name = "__version__"
+    _predictor_version_file_name = "version.txt"
     _predictor_log_file_name = "predictor_log.txt"
     def __init__(
         self,
-        target: Optional[str] = None,
-        known_covariates_names: Optional[List[str]] = None,
+        target: str | None = None,
+        known_covariates_names: list[str] | None = None,
         prediction_length: int = 1,
-        freq: str = None,
-        eval_metric: Union[str, TimeSeriesScorer, None] = None,
-        eval_metric_seasonal_period: Optional[int] = None,
-        path: Optional[Union[str, Path]] = None,
+        freq: str | None = None,
+        eval_metric: str | TimeSeriesScorer | None = None,
+        eval_metric_seasonal_period: int | None = None,
+        horizon_weight: list[float] | None = None,
+        path: str | Path | None = None,
         verbosity: int = 2,
         log_to_file: bool = True,
-        log_file_path: Union[str, Path] = "auto",
-        quantile_levels: Optional[List[float]] = None,
+        log_file_path: str | Path = "auto",
+        quantile_levels: list[float] | None = None,
         cache_predictions: bool = True,
-        learner_type: Optional[Type[AbstractLearner]] = None,
-        learner_kwargs: Optional[dict] = None,
-        label: Optional[str] = None,
+        label: str | None = None,
         **kwargs,
     ):
         self.verbosity = verbosity
         set_logger_verbosity(self.verbosity, logger=logger)
         self.path = setup_outputdir(path)
+        if self.path.lower().startswith("s3://"):
+            logger.warning(
+                "Warning: S3 paths are not supported for the `path` argument in TimeSeriesPredictor. "
+                "Use a local path and upload the trained predictor to S3 manually if needed"
+            )
         self._setup_log_to_file(log_to_file=log_to_file, log_file_path=log_file_path)
         self.cache_predictions = cache_predictions
@@ -190,59 +188,56 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             raise ValueError(f"Target column {self.target} cannot be one of the known covariates.")
         self.known_covariates_names = list(known_covariates_names)
-        self.prediction_length = prediction_length
+        self.prediction_length = int(prediction_length)
         # For each validation fold, all time series in training set must have length >= _min_train_length
         self._min_train_length = max(self.prediction_length + 1, 5)
         self.freq = freq
         if self.freq is not None:
-            # Standardize frequency string (e.g., "min" -> "T", "Y" -> "A-DEC")
-            std_freq = pd.tseries.frequencies.to_offset(self.freq).freqstr
+            # Standardize frequency string (e.g., "T" -> "min", "Y" -> "YE")
+            offset = pd.tseries.frequencies.to_offset(self.freq)
+            assert offset is not None
+            std_freq = offset.freqstr
             if std_freq != str(self.freq):
                 logger.info(f"Frequency '{self.freq}' stored as '{std_freq}'")
             self.freq = std_freq
-        self.eval_metric = check_get_evaluation_metric(eval_metric)
-        self.eval_metric_seasonal_period = eval_metric_seasonal_period
+        self.eval_metric: TimeSeriesScorer = check_get_evaluation_metric(
+            eval_metric,
+            prediction_length=prediction_length,
+            seasonal_period=eval_metric_seasonal_period,
+            horizon_weight=horizon_weight,
+        )
         if quantile_levels is None:
             quantile_levels = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
         self.quantile_levels = sorted(quantile_levels)
-        if learner_kwargs is None:
-            learner_kwargs = {}
-        learner_kwargs = learner_kwargs.copy()
-        learner_kwargs.update(
-            dict(
-                path_context=self.path,
-                eval_metric=eval_metric,
-                eval_metric_seasonal_period=eval_metric_seasonal_period,
-                target=self.target,
-                known_covariates_names=self.known_covariates_names,
-                prediction_length=self.prediction_length,
-                quantile_levels=self.quantile_levels,
-                cache_predictions=self.cache_predictions,
-            )
+        self._learner: TimeSeriesLearner = self._learner_type(
+            path_context=self.path,
+            eval_metric=self.eval_metric,
+            target=self.target,
+            known_covariates_names=self.known_covariates_names,
+            prediction_length=self.prediction_length,
+            quantile_levels=self.quantile_levels,
+            cache_predictions=self.cache_predictions,
+            ensemble_model_type=kwargs.pop("ensemble_model_type", None),
         )
-        # Using `TimeSeriesLearner` as default argument breaks doc generation with Sphnix
-        if learner_type is None:
-            learner_type = TimeSeriesLearner
-        self._learner: AbstractLearner = learner_type(**learner_kwargs)
-        self._learner_type = type(self._learner)
-        if "ignore_time_index" in kwargs:
-            raise TypeError(
-                "`ignore_time_index` argument to TimeSeriesPredictor.__init__() has been deprecated.\n"
-                "If your data has irregular timestamps, please either 1) specify the desired regular frequency when "
-                "creating the predictor as `TimeSeriesPredictor(freq=...)` or 2) manually convert timestamps to "
-                "regular frequency with `data.convert_frequency(freq=...)`."
-            )
         if len(kwargs) > 0:
             for key in kwargs:
                 raise TypeError(f"TimeSeriesPredictor.__init__() got an unexpected keyword argument '{key}'")
     @property
-    def _trainer(self) -> AbstractTimeSeriesTrainer:
+    def _trainer(self) -> TimeSeriesTrainer:
         return self._learner.load_trainer()  # noqa
-    def _setup_log_to_file(self, log_to_file: bool, log_file_path: Union[str, Path]) -> None:
+    @property
+    def is_fit(self) -> bool:
+        return self._learner.is_fit
+    def _assert_is_fit(self, method_name: str) -> None:
+        """Check if predictor is fit and raise AssertionError with informative message if not."""
+        if not self.is_fit:
+            raise AssertionError(f"Predictor is not fit. Call `.fit` before calling `.{method_name}`. ")
+    def _setup_log_to_file(self, log_to_file: bool, log_file_path: str | Path) -> None:
         if log_to_file:
             if log_file_path == "auto":
                 log_file_path = os.path.join(self.path, "logs", self._predictor_log_file_name)
@@ -252,14 +247,14 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
     def _to_data_frame(
         self,
-        data: Union[TimeSeriesDataFrame, pd.DataFrame, str],
+        data: TimeSeriesDataFrame | pd.DataFrame | Path | str,
         name: str = "data",
-    ) -> "TimeSeriesDataFrame":
+    ) -> TimeSeriesDataFrame:
         if isinstance(data, TimeSeriesDataFrame):
             return data
-        elif isinstance(data, (pd.DataFrame, str)):
+        elif isinstance(data, (pd.DataFrame, Path, str)):
             try:
-                data = TimeSeriesDataFrame(data)
+                data = TimeSeriesDataFrame(data)  # type: ignore
             except:
                 raise ValueError(
                     f"Provided {name} of type {type(data)} cannot be automatically converted to a TimeSeriesDataFrame."
@@ -267,23 +262,23 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             return data
         else:
             raise TypeError(
-                f"{name} must be a TimeSeriesDataFrame or pandas.DataFrame or string (path to data) "
+                f"{name} must be a TimeSeriesDataFrame, pandas.DataFrame, pathlib.Path or string (path to data) "
                 f"but received an object of type {type(data)}."
             )
     def _check_and_prepare_data_frame(
         self,
-        data: Union[TimeSeriesDataFrame, pd.DataFrame, str],
+        data: TimeSeriesDataFrame | pd.DataFrame | Path | str,
         name: str = "data",
     ) -> TimeSeriesDataFrame:
-        """Ensure that TimeSeriesDataFrame has a sorted index, valid frequency, and contains no missing values.
+        """Ensure that TimeSeriesDataFrame has a sorted index and a valid frequency.
         If self.freq is None, then self.freq of the predictor will be set to the frequency of the data.
         Parameters
         ----------
-        data : Union[TimeSeriesDataFrame, pd.DataFrame, str]
-            Data as a data frame or path to file storing the data.
+        data : TimeSeriesDataFrame | pd.DataFrame | Path | str
+            Data as a dataframe or path to file storing the data.
         name : str
             Name of the data that will be used in log messages (e.g., 'train_data', 'tuning_data', or 'data').
@@ -292,60 +287,77 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         df : TimeSeriesDataFrame
             Preprocessed data in TimeSeriesDataFrame format.
         """
-        df = self._to_data_frame(data, name=name)
-        df = df.astype({self.target: "float32"})
+        df: TimeSeriesDataFrame = self._to_data_frame(data, name=name)
+        if not pd.api.types.is_numeric_dtype(df[self.target]):
+            raise ValueError(f"Target column {name}['{self.target}'] has a non-numeric dtype {df[self.target].dtype}")
+        # Assign makes a copy, so future operations can be performed in-place
+        df = df.assign(**{self.target: df[self.target].astype("float64")})
+        df.replace(to_replace=[float("-inf"), float("inf")], value=float("nan"), inplace=True)
         # MultiIndex.is_monotonic_increasing checks if index is sorted by ["item_id", "timestamp"]
         if not df.index.is_monotonic_increasing:
             df = df.sort_index()
-            df._cached_freq = None  # in case frequency was incorrectly cached as IRREGULAR_TIME_INDEX_FREQSTR
         # Ensure that data has a regular frequency that matches the predictor frequency
         if self.freq is None:
-            if df.freq is None:
+            try:
+                # Use all items for inferring the frequency
+                data_freq = df.infer_frequency(num_items=None, raise_if_irregular=True)
+            except ValueError:
                 raise ValueError(
                     f"Frequency of {name} is not provided and cannot be inferred. Please set the expected data "
                     f"frequency when creating the predictor with `TimeSeriesPredictor(freq=...)` or ensure that "
                     f"the data has a regular time index with `{name}.convert_frequency(freq=...)`"
                 )
             else:
-                self.freq = df.freq
-                logger.info(f"Inferred time series frequency: '{df.freq}'")
+                self.freq = data_freq
+                logger.info(f"Inferred time series frequency: '{data_freq}'")
         else:
-            if df.freq != self.freq:
-                logger.warning(f"{name} with frequency '{df.freq}' has been resampled to frequency '{self.freq}'.")
+            data_freq = df.infer_frequency(num_items=None)
+            if data_freq != self.freq:
+                logger.warning(f"{name} with frequency '{data_freq}' has been resampled to frequency '{self.freq}'.")
                 df = df.convert_frequency(freq=self.freq)
-        # Fill missing values
-        if df.isna().values.any():
-            # FIXME: Do not automatically fill NaNs here, handle missing values at the level of individual models.
-            # FIXME: Current solution leads to incorrect metric computation if missing values are present
-            logger.warning(
-                f"{name} contains missing values represented by NaN. "
-                f"They have been filled by carrying forward the last valid observation."
-            )
-            df = df.fill_missing_values()
-            if df.isna().values.any():
-                raise ValueError(f"Some time series in {name} consist completely of NaN values. Please remove them.")
         return df
-    def _check_data_for_evaluation(self, data: TimeSeriesDataFrame, name: str = "data"):
-        """Make sure that provided evaluation data includes both historic and future time series values."""
-        if data.num_timesteps_per_item().min() <= self.prediction_length:
+    def _check_and_prepare_data_frame_for_evaluation(
+        self, data: TimeSeriesDataFrame, cutoff: int | None = None, name: str = "data"
+    ) -> TimeSeriesDataFrame:
+        """
+        Make sure that provided evaluation data includes both historical and future time series values.
+        Slices the dataframe based on cutoff, if needed.
+        """
+        cutoff = -1 * self.prediction_length if cutoff is None else cutoff
+        if not (isinstance(cutoff, int) and cutoff <= -self.prediction_length):
+            raise ValueError(f"`cutoff` should be a negative integer <= -prediction_length, got: {cutoff=}")
+        expected_length = -cutoff
+        if data.num_timesteps_per_item().min() <= expected_length:
+            var_name = "-cutoff" if expected_length > self.prediction_length else "prediction_length"
             raise ValueError(
-                f"Cannot reserve last prediction_length={self.prediction_length} time steps for evaluation in some "
-                f"time series in {name}. Please make sure that {name} includes both historic and future data, and that"
-                f"all time series have length > prediction_length (at least {self.prediction_length + 1})"
+                f"Cannot reserve last {expected_length} time steps for evaluation in some "
+                f"time series in {name}. Please make sure that {name} includes both historical and future data, and that"
+                f"all time series have length > {var_name} (at least {expected_length + 1})"
             )
-    @staticmethod
-    def _get_dataset_stats(data: TimeSeriesDataFrame) -> str:
+        if cutoff < -self.prediction_length:
+            data = data.slice_by_timestep(None, cutoff + self.prediction_length)
+        return data
+    def _get_dataset_stats(self, data: TimeSeriesDataFrame) -> str:
         ts_lengths = data.num_timesteps_per_item()
-        median_length = int(ts_lengths.median())
+        median_length = ts_lengths.median()
         min_length = ts_lengths.min()
         max_length = ts_lengths.max()
+        missing_value_fraction = data[self.target].isna().mean()
+        if missing_value_fraction > 0:
+            missing_value_fraction_str = f" (NaN fraction={missing_value_fraction:.1%})"
+        else:
+            missing_value_fraction_str = ""
         return (
-            f"{len(data)} rows, {data.num_items} time series. "
-            f"Median time series length is {median_length} (min={min_length}, max={max_length}). "
+            f"{len(data)} rows{missing_value_fraction_str}, {data.num_items} time series. "
+            f"Median time series length is {median_length:.0f} (min={min_length}, max={max_length}). "
         )
     def _reduce_num_val_windows_if_necessary(
@@ -374,65 +386,72 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             )
         return new_num_val_windows
-    def _filter_short_series(
+    def _filter_useless_train_data(
         self,
         train_data: TimeSeriesDataFrame,
         num_val_windows: int,
         val_step_size: int,
-    ) -> Tuple[TimeSeriesDataFrame, Optional[TimeSeriesDataFrame]]:
-        """Remove time series from train_data that are too short for chosen prediction_length and validation settings.
+    ) -> TimeSeriesDataFrame:
+        """Remove time series from train_data that either contain all NaNs or are too short for chosen settings.
-        This method ensures that for each validation fold, all train series have length >= max(prediction_length + 1, 5).
+        This method ensures that 1) no time series consist of all NaN values and 2) for each validation fold, all train
+        series have length >= max(prediction_length + 1, 5).
-        In other words, this method removes from train_data all time series with length less than
+        In other words, this method removes from train_data all time series with only NaN values or length less than
         min_train_length + prediction_length + (num_val_windows - 1) * val_step_size
         """
         min_length = self._min_train_length + self.prediction_length + (num_val_windows - 1) * val_step_size
         train_lengths = train_data.num_timesteps_per_item()
-        train_items_to_drop = train_lengths.index[train_lengths < min_length]
-        if len(train_items_to_drop) > 0:
+        too_short_items = train_lengths.index[train_lengths < min_length]
+        if len(too_short_items) > 0:
             logger.info(
-                f"\tRemoving {len(train_items_to_drop)} short time series from train_data. Only series with length "
+                f"\tRemoving {len(too_short_items)} short time series from train_data. Only series with length "
                 f">= {min_length} will be used for training."
             )
-            filtered_train_data = train_data.query("item_id not in @train_items_to_drop")
-            if len(filtered_train_data) == 0:
-                raise ValueError(
-                    f"At least some time series in train_data must have length >= {min_length}. Please provide longer "
-                    f"time series as train_data or reduce prediction_length, num_val_windows, or val_step_size."
-                )
-            logger.info(
-                f"\tAfter removing short series, train_data has {self._get_dataset_stats(filtered_train_data)}"
-            )
-        else:
-            filtered_train_data = train_data
+            train_data = train_data.query("item_id not in @too_short_items")
-        return filtered_train_data
+        all_nan_items = train_data.item_ids[
+            train_data[self.target].isna().groupby(TimeSeriesDataFrame.ITEMID, sort=False).all()
+        ]
+        if len(all_nan_items) > 0:
+            logger.info(f"\tRemoving {len(all_nan_items)} time series consisting of only NaN values from train_data.")
+            train_data = train_data.query("item_id not in @all_nan_items")
+        if len(too_short_items) or len(all_nan_items):
+            logger.info(f"\tAfter filtering, train_data has {self._get_dataset_stats(train_data)}")
-    @apply_presets(TIMESERIES_PRESETS_CONFIGS)
+        if len(train_data) == 0:
+            raise ValueError(
+                f"At least some time series in train_data must have >= {min_length} observations. Please provide "
+                f"longer time series as train_data or reduce prediction_length, num_val_windows, or val_step_size."
+            )
+        return train_data
+    @apply_presets(get_predictor_presets())
     def fit(
         self,
-        train_data: Union[TimeSeriesDataFrame, pd.DataFrame, str],
-        tuning_data: Optional[Union[TimeSeriesDataFrame, pd.DataFrame, str]] = None,
-        time_limit: Optional[int] = None,
-        presets: Optional[str] = None,
-        hyperparameters: Dict[Union[str, Type], Any] = None,
-        hyperparameter_tune_kwargs: Optional[Union[str, Dict]] = None,
-        excluded_model_types: Optional[List[str]] = None,
+        train_data: TimeSeriesDataFrame | pd.DataFrame | Path | str,
+        tuning_data: TimeSeriesDataFrame | pd.DataFrame | Path | str | None = None,
+        time_limit: int | None = None,
+        presets: str | None = None,
+        hyperparameters: str | dict[str | Type, Any] | None = None,
+        hyperparameter_tune_kwargs: str | dict | None = None,
+        excluded_model_types: list[str] | None = None,
         num_val_windows: int = 1,
-        val_step_size: Optional[int] = None,
-        refit_every_n_windows: int = 1,
+        val_step_size: int | None = None,
+        refit_every_n_windows: int | None = 1,
         refit_full: bool = False,
         enable_ensemble: bool = True,
-        random_seed: Optional[int] = 123,
-        verbosity: Optional[int] = None,
+        skip_model_selection: bool = False,
+        random_seed: int | None = 123,
+        verbosity: int | None = None,
     ) -> "TimeSeriesPredictor":
         """Fit probabilistic forecasting models to the given time series dataset.
         Parameters
         ----------
-        train_data : Union[TimeSeriesDataFrame, pd.DataFrame, str]
+        train_data : TimeSeriesDataFrame | pd.DataFrame | Path | str
             Training data in the :class:`~autogluon.timeseries.TimeSeriesDataFrame` format.
             Time series with length ``<= (num_val_windows + 1) * prediction_length`` will be ignored during training.
@@ -440,44 +459,39 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             If ``known_covariates_names`` were specified when creating the predictor, ``train_data`` must include the
             columns listed in ``known_covariates_names`` with the covariates values aligned with the target time series.
-            The known covariates must have a numeric (float or integer) dtype.
             Columns of ``train_data`` except ``target`` and those listed in ``known_covariates_names`` will be
             interpreted as ``past_covariates`` - covariates that are known only in the past.
-            If ``train_data`` has static features (i.e., ``train_data.static_features`` is a pandas DataFrame), the
-            predictor will interpret columns with ``int`` and ``float`` dtypes as continuous (real-valued) features,
-            columns with ``object`` and ``str`` dtypes as categorical features, and will ignore the rest of columns.
+            If ``train_data`` contains covariates or static features, they will be interpreted as follows:
-            For example, to ensure that column "store_id" with dtype ``int`` is interpreted as a category,
-            we need to change its type to ``category``::
+            * columns with ``int``, ``bool`` and ``float`` dtypes are interpreted as continuous (real-valued) features
+            * columns with ``object``, ``str`` and ``category`` dtypes are interpreted as categorical features
+            * columns with other dtypes are ignored
-                data.static_features["store_id"] = data.static_features["store_id"].astype("category")
+            To ensure that the column type is interpreted correctly, please convert it to one of the above dtypes.
+            For example, to ensure that column "store_id" with dtype ``int`` is interpreted as a category, change
+            its dtype to ``category``::
-            If provided data is an instance of pandas DataFrame, AutoGluon will attempt to automatically convert it
-            to a ``TimeSeriesDataFrame``.
+                data.static_features["store_id"] = data.static_features["store_id"].astype("category")
-        tuning_data : Union[TimeSeriesDataFrame, pd.DataFrame, str], optional
+            If provided data is a ``pandas.DataFrame``, AutoGluon will attempt to convert it to a ``TimeSeriesDataFrame``.
+            If a ``str`` or a ``Path`` is provided, AutoGluon will attempt to load this file.
+        tuning_data : TimeSeriesDataFrame | pd.DataFrame | Path | str, optional
             Data reserved for model selection and hyperparameter tuning, rather than training individual models. Also
             used to compute the validation scores. Note that only the last ``prediction_length`` time steps of each
             time series are used for computing the validation score.
             If ``tuning_data`` is provided, multi-window backtesting on training data will be disabled, the
-            :attr:`num_val_windows` will be set to ``0``, and :attr:`refit_full` will be set to ``False``.
+            ``num_val_windows`` will be set to ``0``, and ``refit_full`` will be set to ``False``.
             Leaving this argument empty and letting AutoGluon automatically generate the validation set from
             ``train_data`` is a good default.
-            If ``known_covariates_names`` were specified when creating the predictor, ``tuning_data`` must also include
-            the columns listed in ``known_covariates_names`` with the covariates values aligned with the target time
-            series.
-            If ``train_data`` has past covariates or static features, ``tuning_data`` must have also include them (with
-            same columns names and dtypes).
-            If provided data is an instance of pandas DataFrame, AutoGluon will attempt to automatically convert it
-            to a ``TimeSeriesDataFrame``.
+            The names and dtypes of columns and static features in ``tuning_data`` must match the ``train_data``.
+            If provided data is a ``pandas.DataFrame``, AutoGluon will attempt to convert it to a ``TimeSeriesDataFrame``.
+            If a ``str`` or a ``Path`` is provided, AutoGluon will attempt to load this file.
         time_limit : int, optional
             Approximately how long :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit` will run (wall-clock time in
             seconds). If not specified, :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit` will run until all models
@@ -496,14 +510,22 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             Available presets:
-            - ``"fast_training"``: fit simple statistical models (``ETS``, ``Theta``, ``Naive``, ``SeasonalNaive``) + fast tree-based models ``RecursiveTabular`` and ``DirectTabular``. These models are fast to train but may not be very accurate.
-            - ``"medium_quality"``: all models mentioned above + deep learning model ``TemporalFusionTransformer``. Default setting that produces good forecasts with reasonable training time.
-            - ``"high_quality"``: All ML models available in AutoGluon + additional statistical models (``NPTS``, ``AutoETS``, ``AutoARIMA``, ``CrostonSBA``, ``DynamicOptimizedTheta``). Much more accurate than ``medium_quality``, but takes longer to train.
-            - ``"best_quality"``: Same models as in ``"high_quality"`, but performs validation with multiple backtests. Usually better than ``high_quality``, but takes even longer to train.
+            - ``"fast_training"``: Simple statistical and tree-based ML models. These models are fast to train but may not be very accurate.
+            - ``"medium_quality"``: Same models as above, plus deep learning models ``TemporalFusionTransformer`` and Chronos-Bolt (small). Produces good forecasts with reasonable training time.
+            - ``"high_quality"``: A mix of multiple DL, ML and statistical forecasting models available in AutoGluon that offers the best forecast accuracy. Much more accurate than ``medium_quality``, but takes longer to train.
+            - ``"best_quality"``: Same models as in ``"high_quality"``, but performs validation with multiple backtests. Usually better than ``high_quality``, but takes even longer to train.
+            Available presets with the `Chronos-Bolt <https://github.com/amazon-science/chronos-forecasting>`_ model:
+            - ``"bolt_{model_size}"``: where model size is one of ``tiny,mini,small,base``. Uses the Chronos-Bolt pretrained model for zero-shot forecasting.
+              See the documentation for ``ChronosModel`` or see `Hugging Face <https://huggingface.co/collections/amazon/chronos-models-65f1791d630a8d57cb718444>`_ for more information.
-            Details for these presets can be found in ``autogluon/timeseries/configs/presets_configs.py``. If not
-            provided, user-provided values for ``hyperparameters`` and ``hyperparameter_tune_kwargs`` will be used
-            (defaulting to their default values specified below).
+            Exact definitions of these presets can be found in the source code
+            [`1 <https://github.com/autogluon/autogluon/blob/stable/timeseries/src/autogluon/timeseries/configs/presets_configs.py>`_,
+            `2 <https://github.com/autogluon/autogluon/blob/stable/timeseries/src/autogluon/timeseries/models/presets.py>`_].
+            If no ``presets`` are selected, user-provided values for ``hyperparameters`` will be used (defaulting to their
+            default values specified below).
         hyperparameters : str or dict, optional
             Determines what models are trained and what hyperparameters are used by each model.
@@ -561,7 +583,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             Valid preset values:
             * "auto": Performs HPO via bayesian optimization search on GluonTS-backed neural forecasting models and
-                random search on other models using local scheduler.
+              random search on other models using local scheduler.
             * "random": Performs HPO via random search.
             You can also provide a dict to specify searchers and schedulers
@@ -569,7 +591,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             * "num_trials": How many HPO trials to run
             * "scheduler": Which scheduler to use. Valid values:
-                * "local": Local shceduler that schedules trials FIFO
+                * "local": Local scheduler that schedules trials FIFO
             * "searcher": Which searching algorithm to use. Valid values:
                 * "local_random": Uses the "random" searcher
                 * "random": Perform random search
@@ -588,7 +610,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
                         "scheduler": "local",
                     },
                 )
-        excluded_model_types: List[str], optional
+        excluded_model_types: list[str], optional
             Banned subset of model types to avoid training during ``fit()``, even if present in ``hyperparameters``.
             For example, the following code will train all models included in the ``high_quality`` presets except ``DeepAR``::
@@ -603,7 +625,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             of time series in ``train_data`` are long enough for the chosen number of backtests.
             Increasing this parameter increases the training time roughly by a factor of ``num_val_windows // refit_every_n_windows``.
-            See :attr:`refit_every_n_windows` and :attr:`val_step_size`: for details.
+            See ``refit_every_n_windows`` and ``val_step_size`` for details.
             For example, for ``prediction_length=2``, ``num_val_windows=3`` and ``val_step_size=1`` the folds are::
@@ -622,7 +644,11 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             This argument has no effect if ``tuning_data`` is provided.
         refit_every_n_windows: int or None, default = 1
             When performing cross validation, each model will be retrained every ``refit_every_n_windows`` validation
-            windows. If set to ``None``, model will only be fit once for the first validation window.
+            windows, where the number of validation windows is specified by ``num_val_windows``. Note that in the
+            default setting where ``num_val_windows=1``, this argument has no effect.
+            If set to ``None``, models will only be fit once for the first (oldest) validation window. By default,
+            ``refit_every_n_windows=1``, i.e., all models will be refit for each validation window.
         refit_full : bool, default = False
             If True, after training is complete, AutoGluon will attempt to re-train all models using all of training
             data (including the data initially reserved for validation). This argument has no effect if ``tuning_data``
@@ -630,6 +656,10 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         enable_ensemble : bool, default = True
             If True, the ``TimeSeriesPredictor`` will fit a simple weighted ensemble on top of the models specified via
             ``hyperparameters``.
+        skip_model_selection : bool, default = False
+            If True, predictor will not compute the validation score. For example, this argument is useful if we want
+            to use the predictor as a wrapper for a single pre-trained model. If set to True, then the ``hyperparameters``
+            dict must contain exactly one model without hyperparameter search spaces or an exception will be raised.
         random_seed : int or None, default = 123
             If provided, fixes the seed of the random number generator for all models. This guarantees reproducible
             results for most models (except those trained on GPU because of the non-determinism of GPU operations).
@@ -639,12 +669,15 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         """
         time_start = time.time()
-        if self._learner.is_fit:
-            raise AssertionError("Predictor is already fit! To fit additional models create a new `Predictor`.")
+        if self.is_fit:
+            raise AssertionError(
+                "Predictor is already fit! To fit additional models create a new `TimeSeriesPredictor`."
+            )
         if verbosity is None:
             verbosity = self.verbosity
         set_logger_verbosity(verbosity, logger=logger)
+        warn_if_mlflow_autologging_is_enabled(logger=logger)
         logger.info("Beginning AutoGluon training..." + (f" Time limit = {time_limit}s" if time_limit else ""))
         logger.info(f"AutoGluon will save models to '{self.path}'")
@@ -658,7 +691,8 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             target=self.target,
             known_covariates_names=self.known_covariates_names,
             eval_metric=self.eval_metric,
-            eval_metric_seasonal_period=self.eval_metric_seasonal_period,
+            eval_metric_seasonal_period=self.eval_metric.seasonal_period,
+            horizon_weight=self.eval_metric.horizon_weight,
             quantile_levels=self.quantile_levels,
             freq=self.freq,
             time_limit=time_limit,
@@ -669,6 +703,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             val_step_size=val_step_size,
             refit_every_n_windows=refit_every_n_windows,
             refit_full=refit_full,
+            skip_model_selection=skip_model_selection,
             enable_ensemble=enable_ensemble,
             random_seed=random_seed,
             verbosity=verbosity,
@@ -691,37 +726,44 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         if tuning_data is not None:
             tuning_data = self._check_and_prepare_data_frame(tuning_data, name="tuning_data")
-            self._check_data_for_evaluation(tuning_data, name="tuning_data")
-            logger.info(f"Provided tuning_data has {self._get_dataset_stats(train_data)}")
+            tuning_data = self._check_and_prepare_data_frame_for_evaluation(tuning_data, name="tuning_data")
+            logger.info(f"Provided tuning_data has {self._get_dataset_stats(tuning_data)}")
             # TODO: Use num_val_windows to perform multi-window backtests on tuning_data
-            if num_val_windows > 0:
+            if num_val_windows > 1:
                 logger.warning(
                     "\tSetting num_val_windows = 0 (disabling backtesting on train_data) because tuning_data is provided."
                 )
-                num_val_windows = 0
+                num_val_windows = 1
         if num_val_windows == 0 and tuning_data is None:
             raise ValueError("Please set num_val_windows >= 1 or provide custom tuning_data")
-        train_data = self._filter_short_series(
-            train_data, num_val_windows=num_val_windows, val_step_size=val_step_size
-        )
+        if num_val_windows <= 1 and refit_every_n_windows is not None and refit_every_n_windows > 1:
+            logger.warning(
+                f"\trefit_every_n_windows provided as {refit_every_n_windows} but num_val_windows is set to {num_val_windows}."
+                " Refit_every_n_windows will have no effect."
+            )
-        val_splitter = ExpandingWindowSplitter(
-            prediction_length=self.prediction_length, num_val_windows=num_val_windows, val_step_size=val_step_size
-        )
+        if not skip_model_selection:
+            train_data = self._filter_useless_train_data(
+                train_data,
+                num_val_windows=0 if tuning_data is not None else num_val_windows,
+                val_step_size=val_step_size,
+            )
         time_left = None if time_limit is None else time_limit - (time.time() - time_start)
         self._learner.fit(
             train_data=train_data,
-            val_data=tuning_data,
             hyperparameters=hyperparameters,
+            val_data=tuning_data,
             hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
             excluded_model_types=excluded_model_types,
             time_limit=time_left,
             verbosity=verbosity,
-            val_splitter=val_splitter,
+            num_val_windows=(num_val_windows,) if isinstance(num_val_windows, int) else num_val_windows,
+            val_step_size=val_step_size,
             refit_every_n_windows=refit_every_n_windows,
+            skip_model_selection=skip_model_selection,
             enable_ensemble=enable_ensemble,
             random_seed=random_seed,
         )
@@ -734,40 +776,41 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         self.save()
         return self
-    def model_names(self) -> List[str]:
+    def model_names(self) -> list[str]:
         """Returns the list of model names trained by this predictor object."""
+        self._assert_is_fit("model_names")
         return self._trainer.get_model_names()
     def predict(
         self,
-        data: Union[TimeSeriesDataFrame, pd.DataFrame, str],
-        known_covariates: Optional[Union[TimeSeriesDataFrame, pd.DataFrame, str]] = None,
-        model: Optional[str] = None,
+        data: TimeSeriesDataFrame | pd.DataFrame | Path | str,
+        known_covariates: TimeSeriesDataFrame | pd.DataFrame | Path | str | None = None,
+        model: str | None = None,
         use_cache: bool = True,
-        random_seed: Optional[int] = 123,
+        random_seed: int | None = 123,
     ) -> TimeSeriesDataFrame:
         """Return quantile and mean forecasts for the given dataset, starting from the end of each time series.
         Parameters
         ----------
-        data : Union[TimeSeriesDataFrame, pd.DataFrame, str]
-            Time series data to forecast with.
+        data : TimeSeriesDataFrame | pd.DataFrame | Path | str
+            Historical time series data for which the forecast needs to be made.
-            If ``known_covariates_names`` were specified when creating the predictor, ``data`` must include the columns
-            listed in ``known_covariates_names`` with the covariates values aligned with the target time series.
+            The names and dtypes of columns and static features in ``data`` must match the ``train_data`` used to train
+            the predictor.
-            If ``train_data`` used to train the predictor contained past covariates or static features, then ``data``
-            must also include them (with same column names and dtypes).
-            If provided data is an instance of pandas DataFrame, AutoGluon will attempt to automatically convert it
-            to a ``TimeSeriesDataFrame``.
-        known_covariates : Union[TimeSeriesDataFrame, pd.DataFrame, str], optional
+            If provided data is a ``pandas.DataFrame``, AutoGluon will attempt to convert it to a ``TimeSeriesDataFrame``.
+            If a ``str`` or a ``Path`` is provided, AutoGluon will attempt to load this file.
+        known_covariates : TimeSeriesDataFrame | pd.DataFrame | Path | str, optional
             If ``known_covariates_names`` were specified when creating the predictor, it is necessary to provide the
-            values of the known covariates for each time series during the forecast horizon. That is:
+            values of the known covariates for each time series during the forecast horizon. Specifically:
+            - Must contain all columns listed in ``known_covariates_names``.
+            - Must include all ``item_id`` values present in the input ``data``.
+            - Must include ``timestamp`` values for the full forecast horizon (i.e., ``prediction_length`` time steps) following the end of each series in the input ``data``.
-            - The columns must include all columns listed in ``known_covariates_names``
-            - The ``item_id`` index must include all item ids present in ``data``
-            - The ``timestamp`` index must include the values for ``prediction_length`` many time steps into the future from the end of each time series in ``data``
+            You can use :meth:`autogluon.timeseries.TimeSeriesPredictor.make_future_data_frame` to generate a template
+            containing the required ``item_id`` and ``timestamp`` combinations for the ``known_covariates`` dataframe.
             See example below.
         model : str, optional
@@ -808,8 +851,10 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         B       2020-03-04    17.1
                 2020-03-05     8.3
         """
-        # Don't use data.item_ids in case data is not a TimeSeriesDataFrame
-        original_item_id_order = data.reset_index()[ITEMID].unique()
+        self._assert_is_fit("predict")
+        # Save original item_id order to return predictions in the same order as input data
+        data = self._to_data_frame(data)
+        original_item_id_order = data.item_ids
         data = self._check_and_prepare_data_frame(data)
         if known_covariates is not None:
             known_covariates = self._to_data_frame(known_covariates)
@@ -820,41 +865,250 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             use_cache=use_cache,
             random_seed=random_seed,
         )
-        return predictions.reindex(original_item_id_order, level=ITEMID)
+        return cast(TimeSeriesDataFrame, predictions.reindex(original_item_id_order, level=TimeSeriesDataFrame.ITEMID))
+    @overload
+    def backtest_predictions(
+        self,
+        data: TimeSeriesDataFrame | None = None,
+        *,
+        model: str | None = None,
+        num_val_windows: int | None = None,
+        val_step_size: int | None = None,
+        use_cache: bool = True,
+    ) -> list[TimeSeriesDataFrame]: ...
+    @overload
+    def backtest_predictions(
+        self,
+        data: TimeSeriesDataFrame | None = None,
+        *,
+        model: list[str],
+        num_val_windows: int | None = None,
+        val_step_size: int | None = None,
+        use_cache: bool = True,
+    ) -> dict[str, list[TimeSeriesDataFrame]]: ...
+    def backtest_predictions(
+        self,
+        data: TimeSeriesDataFrame | None = None,
+        *,
+        model: str | list[str] | None = None,
+        num_val_windows: int | None = None,
+        val_step_size: int | None = None,
+        use_cache: bool = True,
+    ) -> list[TimeSeriesDataFrame] | dict[str, list[TimeSeriesDataFrame]]:
+        """Return predictions for multiple validation windows.
+        When ``data=None``, returns the predictions that were saved during training. Otherwise, generates new
+        predictions by splitting ``data`` into multiple windows using an expanding window strategy.
+        The corresponding target values for each window can be obtained using
+        :meth:`~autogluon.timeseries.TimeSeriesPredictor.backtest_targets`.
+        Parameters
+        ----------
+        data : TimeSeriesDataFrame, optional
+            Time series data to generate predictions for. If ``None``, returns the predictions that were saved
+            during training on ``train_data``.
+            If provided, all time series in ``data`` must have length at least
+            ``prediction_length + (num_val_windows - 1) * val_step_size + 1``.
+            The names and dtypes of columns and static features in ``data`` must match the ``train_data`` used to train
+            the predictor.
+        model : str, list[str], or None, default = None
+            Name of the model(s) to generate predictions with. By default, the best model during training
+            (with highest validation score) will be used.
+            - If ``str``: Returns predictions for a single model as a list.
+            - If ``list[str]``: Returns predictions for multiple models as a dict mapping model names to lists.
+            - If ``None``: Uses the best model.
+        num_val_windows : int, optional
+            Number of validation windows to generate. If ``None``, uses the ``num_val_windows`` value from training
+            configuration when ``data=None``, otherwise defaults to 1.
+            For example, with ``prediction_length=2``, ``num_val_windows=3``, and ``val_step_size=1``, the validation
+            windows are::
+                |-------------------|
+                | x x x x x y y - - |
+                | x x x x x x y y - |
+                | x x x x x x x y y |
+            where ``x`` denotes training time steps and ``y`` denotes validation time steps for each window.
+        val_step_size : int, optional
+            Number of time steps between the start of consecutive validation windows. If ``None``, defaults to
+            ``prediction_length``.
+        use_cache : bool, default = True
+            If True, will attempt to use cached predictions. If False, cached predictions will be ignored.
+            This argument is ignored if ``cache_predictions`` was set to False when creating the ``TimeSeriesPredictor``.
+        Returns
+        -------
+        list[TimeSeriesDataFrame] or dict[str, list[TimeSeriesDataFrame]]
+            Predictions for each validation window.
+            - If ``model`` is a ``str`` or ``None``: Returns a list of length ``num_val_windows``, where each element
+              contains the predictions for one validation window.
+            - If ``model`` is a ``list[str]``: Returns a dict mapping each model name to a list of predictions for
+              each validation window.
+        Examples
+        --------
+        Make predictions on new data with the best model
+        >>> predictor.backtest_predictions(test_data, num_val_windows=2)
+        Load validation predictions for all models that were saved during training
+        >>> predictor.backtest_predictions(model=predictor.model_names())
+        See Also
+        --------
+        backtest_targets
+            Return target values aligned with predictions.
+        evaluate
+            Evaluate forecast accuracy on a hold-out set.
+        predict
+            Generate forecasts for future time steps.
+        """
+        self._assert_is_fit("backtest_predictions")
+        if data is not None:
+            data = self._check_and_prepare_data_frame(data)
+        if model is None:
+            model_names = [self.model_best]
+        elif isinstance(model, str):
+            model_names = [model]
+        else:
+            model_names = model
+        result = self._learner.backtest_predictions(
+            data=data,
+            model_names=model_names,
+            num_val_windows=num_val_windows,
+            val_step_size=val_step_size,
+            use_cache=use_cache,
+        )
+        if isinstance(model, list):
+            return result
+        else:
+            return result[model_names[0]]
+    def backtest_targets(
+        self,
+        data: TimeSeriesDataFrame | None = None,
+        *,
+        num_val_windows: int | None = None,
+        val_step_size: int | None = None,
+    ) -> list[TimeSeriesDataFrame]:
+        """Return target values for each validation window.
+        Returns the actual target values corresponding to each validation window used in
+        :meth:`~autogluon.timeseries.TimeSeriesPredictor.backtest_predictions`. The returned targets are aligned
+        with the predictions, making it easy to compute custom evaluation metrics or analyze forecast errors.
+        Parameters
+        ----------
+        data : TimeSeriesDataFrame, optional
+            Time series data to extract targets from. If ``None``, returns the targets from the validation windows
+            used during training.
+            If provided, all time series in ``data`` must have length at least
+            ``prediction_length + (num_val_windows - 1) * val_step_size + 1``.
+            The names and dtypes of columns and static features in ``data`` must match the ``train_data`` used to train
+            the predictor.
+        num_val_windows : int, optional
+            Number of validation windows to extract targets for. If ``None``, uses the ``num_val_windows`` value from
+            training configuration when ``data=None``, otherwise defaults to 1.
+            This should match the ``num_val_windows`` argument passed to
+            :meth:`~autogluon.timeseries.TimeSeriesPredictor.backtest_predictions`.
+        val_step_size : int, optional
+            Number of time steps between the start of consecutive validation windows. If ``None``, defaults to
+            ``prediction_length``.
+            This should match the ``val_step_size`` argument passed to
+            :meth:`~autogluon.timeseries.TimeSeriesPredictor.backtest_predictions`.
+        Returns
+        -------
+        list[TimeSeriesDataFrame]
+            Target values for each validation window. Returns a list of length ``num_val_windows``,
+            where each element contains the full time series data for one validation window.
+            Each dataframe includes both historical context and the last ``prediction_length`` time steps
+            that represent the target values to compare against predictions.
+            The returned targets are aligned with the output of
+            :meth:`~autogluon.timeseries.TimeSeriesPredictor.backtest_predictions`, so ``targets[i]`` corresponds
+            to ``predictions[i]`` for the i-th validation window.
+        See Also
+        --------
+        backtest_predictions
+            Return predictions for multiple validation windows.
+        evaluate
+            Evaluate forecast accuracy on a hold-out set.
+        """
+        self._assert_is_fit("backtest_targets")
+        if data is not None:
+            data = self._check_and_prepare_data_frame(data)
+        return self._learner.backtest_targets(
+            data=data,
+            num_val_windows=num_val_windows,
+            val_step_size=val_step_size,
+        )
     def evaluate(
         self,
-        data: Union[TimeSeriesDataFrame, pd.DataFrame, str],
-        model: Optional[str] = None,
-        metrics: Optional[Union[str, TimeSeriesScorer, List[Union[str, TimeSeriesScorer]]]] = None,
+        data: TimeSeriesDataFrame | pd.DataFrame | Path | str,
+        model: str | None = None,
+        metrics: str | TimeSeriesScorer | list[str | TimeSeriesScorer] | None = None,
+        cutoff: int | None = None,
         display: bool = False,
         use_cache: bool = True,
-    ) -> Dict[str, float]:
+    ) -> dict[str, float]:
         """Evaluate the forecast accuracy for given dataset.
         This method measures the forecast accuracy using the last ``self.prediction_length`` time steps of each time
         series in ``data`` as a hold-out set.
+        .. note::
+            Metrics are always reported in 'higher is better' format.
+            This means that metrics such as MASE or MAPE will be multiplied by -1, so their values will be negative.
+            This is necessary to avoid the user needing to know the metric to understand if higher is better when
+            looking at the evaluation results.
         Parameters
         ----------
-        data : Union[TimeSeriesDataFrame, pd.DataFrame, str]
-            The data to evaluate the best model on. The last ``prediction_length`` time steps of the data set, for each
-            item, will be held out for prediction and forecast accuracy will be calculated on these time steps.
+        data : TimeSeriesDataFrame | pd.DataFrame | Path | str
+            The data to evaluate the best model on. If a ``cutoff`` is not provided, the last ``prediction_length``
+            time steps of each time series in ``data`` will be held out for prediction and forecast accuracy will
+            be calculated on these time steps. When a ``cutoff`` is provided, the ``prediction_length`` time steps
+            starting at index ``cutoff`` (a negative offset from the end of each time series) are used for evaluation.
-            If ``known_covariates_names`` were specified when creating the predictor, ``data`` must include the columns
-            listed in ``known_covariates_names`` with the covariates values aligned with the target time series.
+            Must include both historical and future data (i.e., length of all time series in ``data`` must be at least
+            ``prediction_length + 1``, if ``cutoff`` is not provided, ``-cutoff + 1`` otherwise).
-            If ``train_data`` used to train the predictor contained past covariates or static features, then ``data``
-            must also include them (with same column names and dtypes).
+            The names and dtypes of columns and static features in ``data`` must match the ``train_data`` used to train
+            the predictor.
-            If provided data is an instance of pandas DataFrame, AutoGluon will attempt to automatically convert it
-            to a ``TimeSeriesDataFrame``.
+            If provided data is a ``pandas.DataFrame``, AutoGluon will attempt to convert it to a ``TimeSeriesDataFrame``.
+            If a ``str`` or a ``Path`` is provided, AutoGluon will attempt to load this file.
         model : str, optional
             Name of the model that you would like to evaluate. By default, the best model during training
             (with highest validation score) will be used.
-        metrics : str, TimeSeriesScorer or List[Union[str, TimeSeriesScorer]], optional
+        metrics : str, TimeSeriesScorer or list[str | TimeSeriesScorer], optional
             Metric or a list of metrics to compute scores with. Defaults to ``self.eval_metric``. Supports both
             metric names as strings and custom metrics based on TimeSeriesScorer.
+        cutoff : int, optional
+            A *negative* integer less than or equal to ``-1 * prediction_length`` denoting the time step in ``data``
+            where the forecast evaluation starts, i.e., each time series is evaluated on the ``prediction_length``
+            time steps beginning at index ``cutoff`` (counted from its end). Defaults to ``-1 * prediction_length``, using the last
+            ``prediction_length`` time steps of each time series for evaluation.
         display : bool, default = False
             If True, the scores will be printed.
         use_cache : bool, default = True
@@ -863,29 +1117,185 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         Returns
         -------
-        scores_dict : Dict[str, float]
+        scores_dict : dict[str, float]
             Dictionary where keys = metrics, values = performance along each metric. For consistency, error metrics
             will have their signs flipped to obey this convention. For example, negative MAPE values will be reported.
             To get the ``eval_metric`` score, do ``output[predictor.eval_metric.name]``.
         """
+        self._assert_is_fit("evaluate")
         data = self._check_and_prepare_data_frame(data)
-        self._check_data_for_evaluation(data)
+        data = self._check_and_prepare_data_frame_for_evaluation(data, cutoff=cutoff)
         scores_dict = self._learner.evaluate(data, model=model, metrics=metrics, use_cache=use_cache)
         if display:
             logger.info("Evaluations on test data:")
             logger.info(json.dumps(scores_dict, indent=4))
         return scores_dict
+    def feature_importance(
+        self,
+        data: TimeSeriesDataFrame | pd.DataFrame | Path | str | None = None,
+        model: str | None = None,
+        metric: str | TimeSeriesScorer | None = None,
+        features: list[str] | None = None,
+        time_limit: float | None = None,
+        method: Literal["naive", "permutation"] = "permutation",
+        subsample_size: int = 50,
+        num_iterations: int | None = None,
+        random_seed: int | None = 123,
+        relative_scores: bool = False,
+        include_confidence_band: bool = True,
+        confidence_level: float = 0.99,
+    ) -> pd.DataFrame:
+        """
+        Calculates feature importance scores for the given model via replacing each feature by a shuffled version of the same feature
+        (also known as permutation feature importance) or by assigning a constant value representing the median or mode of the feature,
+        and computing the relative decrease in the model's predictive performance.
+        A feature's importance score represents the performance drop that results when the model makes predictions on a perturbed copy
+        of the data where this feature's values have been randomly shuffled across rows. A feature score of 0.01 would indicate that the
+        predictive performance dropped by 0.01 when the feature was randomly shuffled or replaced. The higher the score a feature has,
+        the more important it is to the model's performance.
+        If a feature has a negative score, this means that the feature is likely harmful to the final model, and a model trained with
+        the feature removed would be expected to achieve a better predictive performance. Note that calculating feature importance can
+        be a computationally expensive process, particularly if the model uses many features. In many cases, this can take longer than
+        the original model training. Roughly, the runtime will be equal to the number of features in the data multiplied by ``num_iterations``
+        (or 1 when ``method="naive"``) and by the time taken when ``evaluate()`` is called on a dataset with ``subsample_size`` items.
+        Parameters
+        ----------
+        data : TimeSeriesDataFrame, pd.DataFrame, Path or str, optional
+            The data to evaluate feature importances on. The last ``prediction_length`` time steps of the data set, for each
+            item, will be held out for prediction and forecast accuracy will be calculated on these time steps.
+            More accurate feature importances will be obtained from new data that was held-out during ``fit()``.
+            The names and dtypes of columns and static features in ``data`` must match the ``train_data`` used to train
+            the predictor.
+            If provided data is a ``pandas.DataFrame``, AutoGluon will attempt to convert it to a ``TimeSeriesDataFrame``.
+            If a ``str`` or a ``Path`` is provided, AutoGluon will attempt to load this file.
+            If ``data`` is not provided, then validation (tuning) data provided during training (or the held out data used for
+            validation if ``tuning_data`` was not explicitly provided to ``fit()``) will be used.
+        model : str, optional
+            Name of the model that you would like to evaluate. By default, the best model during training
+            (with highest validation score) will be used.
+        metric : str or TimeSeriesScorer, optional
+            Metric to be used for computing feature importance. If None, the ``eval_metric`` specified during initialization of
+            the ``TimeSeriesPredictor`` will be used.
+        features : list[str], optional
+            List of feature names that feature importances are calculated for and returned. By default, all feature importances
+            will be returned.
+        method : {"permutation", "naive"}, default = "permutation"
+            Method to be used for computing feature importance.
+            * ``naive``: computes feature importance by replacing the values of each feature by a constant value and computing
+              feature importances as the relative improvement in the evaluation metric. The constant value is the median for
+              real-valued features and the mode for categorical features, for both covariates and static features, obtained from the
+              feature values in ``data`` provided.
+            * ``permutation``: computes feature importance by naively shuffling the values of the feature across different items
+              and time steps. Each feature is shuffled for ``num_iterations`` times and feature importances are computed as the
+              relative improvement in the evaluation metric. Refer to https://explained.ai/rf-importance/ for an explanation of
+              permutation importance.
+        subsample_size : int, default = 50
+            The number of items to sample from ``data`` when computing feature importance. Larger values increase the accuracy of
+            the feature importance scores. Runtime linearly scales with ``subsample_size``.
+        time_limit : float, optional
+            Time in seconds to limit the calculation of feature importance. If None, feature importance will be calculated without early stopping.
+            If ``method="permutation"``, a minimum of 1 full shuffle set will always be evaluated. If a shuffle set evaluation takes longer than
+            ``time_limit``, the method will take the length of a shuffle set evaluation to return regardless of the ``time_limit``.
+        num_iterations : int, optional
+            The number of different iterations of the data that are evaluated. If ``method="permutation"``, this will be interpreted
+            as the number of shuffle sets (equivalent to ``num_shuffle_sets`` in :meth:`TabularPredictor.feature_importance`). If ``method="naive"``, the
+            constant replacement approach is repeated for ``num_iterations`` times, and a different subsample of data (of size ``subsample_size``) will
+            be taken in each iteration.
+            Default is 1 for ``method="naive"`` and 5 for ``method="permutation"``. The value will be ignored if ``method="naive"`` and the subsample
+            size is greater than the number of items in ``data`` as additional iterations will be redundant.
+            Larger values will increase the quality of the importance evaluation.
+            It is generally recommended to increase ``subsample_size`` before increasing ``num_iterations``.
+            Runtime scales linearly with ``num_iterations``.
+        random_seed : int or None, default = 123
+            If provided, fixes the seed of the random number generator for all models. This guarantees reproducible
+            results for feature importance.
+        relative_scores : bool, default = False
+            By default, this method will return expected average *absolute* improvement in the eval metric due to the feature. If True, then
+            the statistics will be computed over the *relative* (percentage) improvements.
+        include_confidence_band: bool, default = True
+            If True, returned DataFrame will include two additional columns specifying confidence interval for the true underlying importance value of
+            each feature. Increasing ``subsample_size`` and ``num_iterations`` will tighten the confidence interval.
+        confidence_level: float, default = 0.99
+            This argument is only considered when ``include_confidence_band=True``, and can be used to specify the confidence level used
+            for constructing confidence intervals. For example, if ``confidence_level`` is set to 0.99, then the returned DataFrame will include
+            columns ``p99_high`` and ``p99_low`` which indicate that the true feature importance will be between ``p99_high`` and ``p99_low`` 99% of
+            the time (99% confidence interval). More generally, if ``confidence_level`` = 0.XX, then the columns containing the XX% confidence interval
+            will be named ``pXX_high`` and ``pXX_low``.
+        Returns
+        -------
+        :class:`pd.DataFrame` of feature importance scores with 2 columns:
+            index: The feature name.
+            'importance': The estimated feature importance score.
+            'stddev': The standard deviation of the feature importance score. If NaN, then not enough ``num_iterations`` were used.
+        """
+        self._assert_is_fit("feature_importance")
+        if data is not None:
+            data = self._check_and_prepare_data_frame(data)
+            data = self._check_and_prepare_data_frame_for_evaluation(data)
+        fi_df = self._learner.get_feature_importance(
+            data=data,
+            model=model,
+            metric=metric,
+            features=features,
+            time_limit=time_limit,
+            method=method,
+            subsample_size=subsample_size,
+            num_iterations=num_iterations,
+            random_seed=random_seed,
+            relative_scores=relative_scores,
+            include_confidence_band=include_confidence_band,
+            confidence_level=confidence_level,
+        )
+        return fi_df
     @classmethod
     def _load_version_file(cls, path: str) -> str:
+        """
+        Loads the version file that is part of the saved predictor artifact.
+        Parameters
+        ----------
+        path: str
+            The path that would be used to load the predictor via `predictor.load(path)`
+        Returns
+        -------
+        The version of AutoGluon used to fit the predictor, as a string.
+        """
         version_file_path = os.path.join(path, cls._predictor_version_file_name)
-        version = load_str.load(path=version_file_path)
+        try:
+            version = load_str.load(path=version_file_path)
+        except:
+            # Loads the old version file used in `autogluon.timeseries<=1.1.0`, named `__version__`.
+            # This file name was changed because Kaggle does not allow uploading files named `__version__`.
+            version_file_path = os.path.join(path, "__version__")
+            version = load_str.load(path=version_file_path)
         return version
     @classmethod
-    def load(cls, path: Union[str, Path], require_version_match: bool = True) -> "TimeSeriesPredictor":
+    def load(cls, path: str | Path, require_version_match: bool = True) -> "TimeSeriesPredictor":
         """Load an existing ``TimeSeriesPredictor`` from given ``path``.
+        .. warning::
+            :meth:`autogluon.timeseries.TimeSeriesPredictor.load` uses ``pickle`` module implicitly, which is known to
+            be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during
+            unpickling. Never load data that could have come from an untrusted source, or that could have been tampered
+            with. **Only load data you trust.**
         Parameters
         ----------
         path : str or pathlib.Path
@@ -907,14 +1317,18 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         """
         if not path:
             raise ValueError("`path` cannot be None or empty in load().")
-        path: str = setup_outputdir(path, warn_if_exist=False)
+        path = setup_outputdir(path, warn_if_exist=False)
+        predictor_path = Path(path) / cls.predictor_file_name
+        if not predictor_path.exists():
+            raise FileNotFoundError(f"No such file '{predictor_path}'")
         try:
             version_saved = cls._load_version_file(path=path)
         except:
             logger.warning(
                 f'WARNING: Could not find version file at "{os.path.join(path, cls._predictor_version_file_name)}".\n'
-                f"This means that the predictor was fit in a version `<=0.7.0`."
+                f"This means that the predictor was fit in an AutoGluon version `<=0.7.0`."
             )
             version_saved = "Unknown (Likely <=0.7.0)"
@@ -926,13 +1340,13 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         )
         logger.info(f"Loading predictor from path {path}")
-        learner = AbstractLearner.load(path)
-        predictor = load_pkl.load(path=os.path.join(learner.path, cls.predictor_file_name))
+        learner = cls._learner_type.load(path)
+        predictor = load_pkl.load(path=str(predictor_path))
         predictor._learner = learner
         predictor.path = learner.path
         return predictor
-    def _save_version_file(self):
+    def _save_version_file(self) -> None:
         version_file_contents = current_ag_version
         version_file_path = os.path.join(self.path, self._predictor_version_file_name)
         save_str.save(path=version_file_path, data=version_file_contents, verbose=False)
@@ -944,43 +1358,87 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         (we do not recommend modifying the Predictor object yourself as it tracks many trained models).
         """
         tmp_learner = self._learner
-        self._learner = None
+        self._learner = None  # type: ignore
         save_pkl.save(path=os.path.join(tmp_learner.path, self.predictor_file_name), object=self)
         self._learner = tmp_learner
         self._save_version_file()
-    def info(self) -> Dict[str, Any]:
+    def info(self) -> dict[str, Any]:
         """Returns a dictionary of objects each describing an attribute of the training process and trained models."""
         return self._learner.get_info(include_model_info=True)
     @property
     def model_best(self) -> str:
         """Returns the name of the best model from trainer."""
+        self._assert_is_fit("model_best")
         if self._trainer.model_best is not None:
             models = self._trainer.get_model_names()
             if self._trainer.model_best in models:
                 return self._trainer.model_best
         return self._trainer.get_model_best()
+    def persist(self, models: Literal["all", "best"] | list[str] = "best", with_ancestors: bool = True) -> list[str]:
+        """Persist models in memory for reduced inference latency. This is particularly important if the models are being used for online
+        inference where low latency is critical. If models are not persisted in memory, they are loaded from disk every time they are
+        asked to make predictions. This is especially cumbersome for large deep learning based models which have to be loaded into
+        accelerator (e.g., GPU) memory each time.
+        Parameters
+        ----------
+        models : list of str or str, default = 'best'
+            Model names of models to persist.
+            If 'best' then the model with the highest validation score is persisted (this is the model used for prediction by default).
+            If 'all' then all models are persisted. Valid models are listed in this ``predictor`` by calling ``predictor.model_names()``.
+        with_ancestors : bool, default = True
+            If True, all ancestor models of the provided models will also be persisted.
+            If False, ensemble models will not have the models they depend on persisted unless those models were specified in ``models``.
+            This will slow down inference as the ancestor models will still need to be loaded from disk for each predict call.
+            Only relevant for ensemble models.
+        Returns
+        -------
+        list_of_models : list[str]
+            List of persisted model names.
+        """
+        self._assert_is_fit("persist")
+        return self._learner.persist_trainer(models=models, with_ancestors=with_ancestors)
+    def unpersist(self) -> list[str]:
+        """Unpersist models in memory for reduced memory usage. If models are not persisted in memory, they are loaded from
+        disk every time they are asked to make predictions.
+        Note: Another way to reset the predictor and unpersist models is to reload the predictor from disk
+        via ``predictor = TimeSeriesPredictor.load(predictor.path)``.
+        Returns
+        -------
+        list_of_models : list[str]
+            List of unpersisted model names.
+        """
+        return self._learner.unpersist_trainer()
     def leaderboard(
         self,
-        data: Optional[Union[TimeSeriesDataFrame, pd.DataFrame, str]] = None,
+        data: TimeSeriesDataFrame | pd.DataFrame | Path | str | None = None,
+        cutoff: int | None = None,
+        extra_info: bool = False,
+        extra_metrics: list[str | TimeSeriesScorer] | None = None,
         display: bool = False,
         use_cache: bool = True,
         **kwargs,
     ) -> pd.DataFrame:
         """Return a leaderboard showing the performance of every trained model, the output is a
-        pandas data frame with columns:
+        pandas dataframe with columns:
         * ``model``: The name of the model.
         * ``score_test``: The test score of the model on ``data``, if provided. Computed according to ``eval_metric``.
         * ``score_val``: The validation score of the model using the internal validation data. Computed according to ``eval_metric``.
         .. note::
-            Metrics scores are always shown in 'higher is better' format.
+            Metrics are always reported in 'higher is better' format.
             This means that metrics such as MASE or MAPE will be multiplied by -1, so their values will be negative.
             This is necessary to avoid the user needing to know the metric to understand if higher is better when
-            looking at leaderboard.
+            looking at the leaderboard.
         * ``pred_time_val``: Time taken by the model to predict on the validation data set
         * ``fit_time_marginal``: The fit time required to train the model (ignoring base models for ensembles).
@@ -989,19 +1447,35 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         Parameters
         ----------
-        data : Union[TimeSeriesDataFrame, pd.DataFrame, str], optional
-            dataset used for additional evaluation. If not provided, the validation set used during training will be
-            used.
-            If ``known_covariates_names`` were specified when creating the predictor, ``data`` must include the columns
-            listed in ``known_covariates_names`` with the covariates values aligned with the target time series.
-            If ``train_data`` used to train the predictor contained past covariates or static features, then ``data``
-            must also include them (with same column names and dtypes).
-            If provided data is an instance of pandas DataFrame, AutoGluon will attempt to automatically convert it
-            to a ``TimeSeriesDataFrame``.
+        data : TimeSeriesDataFrame | pd.DataFrame | Path | str, optional
+            dataset used for additional evaluation. Must include both historical and future data (i.e., length of all
+            time series in ``data`` must be at least ``prediction_length + 1``, if ``cutoff`` is not provided,
+            ``-cutoff + 1`` otherwise).
+            The names and dtypes of columns and static features in ``data`` must match the ``train_data`` used to train
+            the predictor.
+            If provided data is a ``pandas.DataFrame``, AutoGluon will attempt to convert it to a ``TimeSeriesDataFrame``.
+            If a ``str`` or a ``Path`` is provided, AutoGluon will attempt to load this file.
+        cutoff : int, optional
+            A *negative* integer less than or equal to ``-1 * prediction_length`` denoting the time step in ``data``
+            where the forecast evaluation starts, i.e., time series are evaluated from the ``-cutoff``-th to the
+            ``-cutoff + prediction_length``-th time step. Defaults to ``-1 * prediction_length``, using the last
+            ``prediction_length`` time steps of each time series for evaluation.
+        extra_info : bool, default = False
+            If True, the leaderboard will contain an additional column ``hyperparameters`` with the hyperparameters used
+            by each model during training. An empty dictionary ``{}`` means that the model was trained with default
+            hyperparameters.
+        extra_metrics : list[str | TimeSeriesScorer], optional
+            A list of metrics to calculate scores for and include in the output DataFrame.
+            Only valid when ``data`` is specified. The scores refer to the scores on ``data`` (same data as used to
+            calculate the ``score_test`` column).
+            This list can contain any values which would also be valid for ``eval_metric`` when creating a :class:`~autogluon.timeseries.TimeSeriesPredictor`.
+            For each provided ``metric``, a column with name ``str(metric)`` will be added to the leaderboard, containing
+            the value of the metric computed on ``data``.
         display : bool, default = False
             If True, the leaderboard DataFrame will be printed.
         use_cache : bool, default = True
@@ -1014,6 +1488,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             The leaderboard containing information on all models and in order of best model to worst in terms of
             test performance.
         """
+        self._assert_is_fit("leaderboard")
         if "silent" in kwargs:
             # keep `silent` logic for backwards compatibility
             assert isinstance(kwargs["silent"], bool)
@@ -1021,17 +1496,62 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         if len(kwargs) > 0:
             for key in kwargs:
                 raise TypeError(f"TimeSeriesPredictor.leaderboard() got an unexpected keyword argument '{key}'")
+        if data is None and extra_metrics is not None:
+            raise ValueError("`extra_metrics` is only valid when `data` is specified.")
+        if data is None and cutoff is not None:
+            raise ValueError("`cutoff` is only valid when `data` is specified.")
         if data is not None:
             data = self._check_and_prepare_data_frame(data)
-            self._check_data_for_evaluation(data)
-        leaderboard = self._learner.leaderboard(data, use_cache=use_cache)
+            data = self._check_and_prepare_data_frame_for_evaluation(data, cutoff=cutoff)
+        leaderboard = self._learner.leaderboard(
+            data, extra_info=extra_info, extra_metrics=extra_metrics, use_cache=use_cache
+        )
         if display:
             with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 1000):
                 print(leaderboard)
         return leaderboard
-    def fit_summary(self, verbosity: int = 1) -> Dict[str, Any]:
+    def make_future_data_frame(self, data: TimeSeriesDataFrame | pd.DataFrame | Path | str) -> pd.DataFrame:
+        """Generate a dataframe with the ``item_id`` and ``timestamp`` values corresponding to the forecast horizon.
+        Parameters
+        ----------
+        data : TimeSeriesDataFrame | pd.DataFrame | Path | str
+            Historical time series data.
+        Returns
+        -------
+        forecast_horizon : pd.DataFrame
+            Data frame with columns ``item_id`` and ``timestamp`` corresponding to the forecast horizon. For each item ID
+            in ``data``, ``forecast_horizon`` will contain the timestamps for the next ``prediction_length`` time steps,
+            following the end of each series in the input data.
+        Examples
+        --------
+        >>> print(data)
+                            target
+        item_id timestamp
+        A       2024-01-01       0
+                2024-01-02       1
+                2024-01-03       2
+        B       2024-04-07       3
+                2024-04-08       4
+        >>> predictor = TimeSeriesPredictor(prediction_length=2, freq="D")
+        >>> print(predictor.make_future_data_frame(data))
+          item_id  timestamp
+        0       A 2024-01-04
+        0       A 2024-01-05
+        1       B 2024-04-09
+        1       B 2024-04-10
+        """
+        if self.freq is None:
+            raise ValueError("Please fit the predictor before calling `make_future_data_frame`")
+        data = self._check_and_prepare_data_frame(data)
+        return make_future_data_frame(data, prediction_length=self.prediction_length, freq=self.freq)
+    def fit_summary(self, verbosity: int = 1) -> dict[str, Any]:
         """Output summary of information about models produced during
         :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit`.
@@ -1042,10 +1562,11 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         Returns
         -------
-        summary_dict : Dict[str, Any]
+        summary_dict : dict[str, Any]
             Dict containing various detailed information. We do not recommend directly printing this dict as it may
             be very large.
         """
+        self._assert_is_fit("fit_summary")
         # TODO: HPO-specific information currently not reported in fit_summary
         # TODO: Revisit after ray tune integration
@@ -1066,7 +1587,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
         model_hyperparams = {}
         for model_name in self.model_names():
             model_obj = self._trainer.load_model(model_name)
-            model_hyperparams[model_name] = model_obj.params
+            model_hyperparams[model_name] = model_obj.get_hyperparameters()
         results["model_hyperparams"] = model_hyperparams
         results["leaderboard"] = self._learner.leaderboard()
@@ -1081,7 +1602,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             print("****************** End of fit() summary ******************")
         return results
-    def refit_full(self, model: str = "all", set_best_to_refit_full: bool = True) -> Dict[str, str]:
+    def refit_full(self, model: str = "all", set_best_to_refit_full: bool = True) -> dict[str, str]:
         """Retrain model on all of the data (training + validation).
         This method can only be used if no ``tuning_data`` was passed to :meth:`~autogluon.timeseries.TimeSeriesPredictor.fit`.
@@ -1106,6 +1627,7 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             ``predictor.predict(data)`` is called will be the refit_full version instead of the original version of the
             model. Has no effect if ``model`` is not the best model.
         """
+        self._assert_is_fit("refit_full")
         logger.warning(
             "\tWARNING: refit_full functionality for TimeSeriesPredictor is experimental "
             "and is not yet supported by all models."
@@ -1143,40 +1665,38 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
                 )
         return refit_full_dict
-    def __dir__(self) -> List[str]:
-        # This hides method from IPython autocomplete, but not VSCode autocomplete
-        deprecated = ["score", "get_model_best", "get_model_names"]
-        return [d for d in super().__dir__() if d not in deprecated]
     def _simulation_artifact(self, test_data: TimeSeriesDataFrame) -> dict:
         """[Advanced] Computes and returns the necessary information to perform offline ensemble simulation."""
         def select_target(ts_df: TimeSeriesDataFrame) -> TimeSeriesDataFrame:
             ts_df = ts_df.copy()
             ts_df.static_features = None
-            return ts_df[[self.target]]
+            return cast(TimeSeriesDataFrame, ts_df[[self.target]])
         test_data = self._check_and_prepare_data_frame(test_data)
-        self._check_data_for_evaluation(test_data, name="test_data")
+        test_data = self._check_and_prepare_data_frame_for_evaluation(test_data, name="test_data")
         test_data = self._learner.feature_generator.transform(test_data)
         trainer = self._trainer
         train_data = trainer.load_train_data()
         val_data = trainer.load_val_data()
-        base_models = trainer.get_model_names(level=0)
-        pred_proba_dict_val: Dict[str, List[TimeSeriesDataFrame]] = {
-            model: trainer._get_model_oof_predictions(model) for model in base_models
+        base_model_names = trainer.get_model_names(layer=0)
+        pred_proba_dict_val: dict[str, list[TimeSeriesDataFrame]] = {
+            model_name: trainer._get_model_oof_predictions(model_name)
+            for model_name in base_model_names
+            if "_FULL" not in model_name
         }
         past_data, known_covariates = test_data.get_model_inputs_for_scoring(
-            prediction_length=self.prediction_length, known_covariates_names=trainer.metadata.known_covariates_real
+            prediction_length=self.prediction_length,
+            known_covariates_names=trainer.covariate_metadata.known_covariates,
         )
-        pred_proba_dict_test: Dict[str, TimeSeriesDataFrame] = trainer.get_model_pred_dict(
-            base_models, data=past_data, known_covariates=known_covariates
+        pred_proba_dict_test, _ = trainer.get_model_pred_dict(
+            base_model_names, data=past_data, known_covariates=known_covariates
         )
-        y_val: List[TimeSeriesDataFrame] = [
-            select_target(df) for df in trainer._get_ensemble_oof_data(train_data=train_data, val_data=val_data)
+        y_val: list[TimeSeriesDataFrame] = [
+            select_target(df) for df in trainer._get_validation_windows(train_data=train_data, val_data=val_data)
         ]
         y_test: TimeSeriesDataFrame = select_target(test_data)
@@ -1188,34 +1708,35 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             target=self.target,
             prediction_length=self.prediction_length,
             eval_metric=self.eval_metric.name,
-            eval_metric_seasonal_period=self.eval_metric_seasonal_period,
+            eval_metric_seasonal_period=self.eval_metric.seasonal_period,
+            horizon_weight=self.eval_metric.horizon_weight,
             quantile_levels=self.quantile_levels,
         )
         return simulation_dict
     def plot(
         self,
-        data: Union[TimeSeriesDataFrame, pd.DataFrame, str],
-        predictions: Optional[Union[TimeSeriesDataFrame, pd.DataFrame, str]] = None,
-        quantile_levels: Optional[List[float]] = None,
-        item_ids: Optional[List[Union[str, int]]] = None,
+        data: TimeSeriesDataFrame | pd.DataFrame | Path | str,
+        predictions: TimeSeriesDataFrame | None = None,
+        quantile_levels: list[float] | None = None,
+        item_ids: list[str | int] | None = None,
         max_num_item_ids: int = 8,
-        max_history_length: Optional[int] = None,
-        point_forecast_column: Optional[str] = None,
-        matplotlib_rc_params: Optional[dict] = None,
+        max_history_length: int | None = None,
+        point_forecast_column: str | None = None,
+        matplotlib_rc_params: dict | None = None,
     ):
-        """Plot historic time series values and the forecasts.
+        """Plot historical time series values and the forecasts.
         Parameters
         ----------
-        data : Union[TimeSeriesDataFrame, pd.DataFrame, str]
+        data : TimeSeriesDataFrame | pd.DataFrame | Path | str
             Observed time series data.
         predictions : TimeSeriesDataFrame, optional
             Predictions generated by calling :meth:`~autogluon.timeseries.TimeSeriesPredictor.predict`.
-        quantile_levels : List[float], optional
+        quantile_levels : list[float], optional
             Quantile levels for which to plot the prediction intervals. Defaults to lowest & highest quantile levels
             available in ``predictions``.
-        item_ids : List[Union[str, int]], optional
+        item_ids : list[str | int], optional
             If provided, plots will only be generated for time series with these item IDs. By default (if set to
             ``None``), item IDs are selected randomly. In either case, plots are generated for at most
             ``max_num_item_ids`` time series.
@@ -1227,8 +1748,8 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
             Name of the column in ``predictions`` that will be plotted as the point forecast. Defaults to ``"0.5"``,
             if this column is present in ``predictions``, otherwise ``"mean"``.
         matplotlib_rc_params : dict, optional
-            Dictionary describing the plot style that will be passed to [`matplotlib.pyplot.rc_context`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.rc_context.html).
-            See [matplotlib documentation](https://matplotlib.org/stable/users/explain/customizing.html#the-default-matplotlibrc-file) for the list of available options.
+            Dictionary describing the plot style that will be passed to `matplotlib.pyplot.rc_context <https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.rc_context.html>`_.
+            See `matplotlib documentation <https://matplotlib.org/stable/users/explain/customizing.html#the-default-matplotlibrc-file>`_ for the list of available options.
         """
         import matplotlib.pyplot as plt
@@ -1291,14 +1812,14 @@ class TimeSeriesPredictor(TimeSeriesPredictorDeprecatedMixin):
                 ax.plot(ts, label="Observed", color="C0")
                 if predictions is not None:
-                    forecast = predictions.loc[item_id]
+                    forecast: pd.DataFrame = predictions.loc[item_id]  # type: ignore
                     point_forecast = forecast[point_forecast_column]
                     ax.plot(point_forecast, color="C1", label="Forecast")
                     if quantile_levels is not None:
                         for q in quantile_levels:
                             ax.fill_between(forecast.index, point_forecast, forecast[str(q)], color="C1", alpha=0.2)
             if len(axes) > len(item_ids):
-                axes[len(item_ids)].set_axis_off()
-            handles, labels = axes[0].get_legend_handles_labels()
+                axes[len(item_ids)].set_axis_off()  # type: ignore
+            handles, labels = axes[0].get_legend_handles_labels()  # type: ignore
             fig.legend(handles, labels, bbox_to_anchor=(0.5, 0.0), ncols=len(handles))
         return fig