autogluon.timeseries 1.4.1b20250906__py3-none-any.whl → 1.4.1b20251210__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of autogluon.timeseries may be problematic.
- autogluon/timeseries/configs/hyperparameter_presets.py +2 -2
- autogluon/timeseries/dataset/ts_dataframe.py +97 -86
- autogluon/timeseries/learner.py +68 -35
- autogluon/timeseries/metrics/__init__.py +4 -4
- autogluon/timeseries/metrics/abstract.py +8 -8
- autogluon/timeseries/metrics/point.py +9 -9
- autogluon/timeseries/metrics/quantile.py +5 -5
- autogluon/timeseries/metrics/utils.py +4 -4
- autogluon/timeseries/models/__init__.py +4 -1
- autogluon/timeseries/models/abstract/abstract_timeseries_model.py +52 -39
- autogluon/timeseries/models/abstract/model_trial.py +2 -1
- autogluon/timeseries/models/abstract/tunable.py +8 -8
- autogluon/timeseries/models/autogluon_tabular/mlforecast.py +58 -62
- autogluon/timeseries/models/autogluon_tabular/per_step.py +26 -15
- autogluon/timeseries/models/autogluon_tabular/transforms.py +11 -9
- autogluon/timeseries/models/chronos/__init__.py +2 -1
- autogluon/timeseries/models/chronos/chronos2.py +361 -0
- autogluon/timeseries/models/chronos/model.py +125 -87
- autogluon/timeseries/models/chronos/{pipeline/utils.py → utils.py} +68 -36
- autogluon/timeseries/models/ensemble/__init__.py +34 -2
- autogluon/timeseries/models/ensemble/abstract.py +5 -42
- autogluon/timeseries/models/ensemble/array_based/__init__.py +3 -0
- autogluon/timeseries/models/ensemble/array_based/abstract.py +236 -0
- autogluon/timeseries/models/ensemble/array_based/models.py +73 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/__init__.py +12 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/abstract.py +88 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/linear_stacker.py +167 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/per_quantile_tabular.py +94 -0
- autogluon/timeseries/models/ensemble/array_based/regressor/tabular.py +107 -0
- autogluon/timeseries/models/ensemble/{greedy.py → ensemble_selection.py} +41 -61
- autogluon/timeseries/models/ensemble/per_item_greedy.py +162 -0
- autogluon/timeseries/models/ensemble/weighted/__init__.py +8 -0
- autogluon/timeseries/models/ensemble/weighted/abstract.py +40 -0
- autogluon/timeseries/models/ensemble/{basic.py → weighted/basic.py} +6 -16
- autogluon/timeseries/models/ensemble/weighted/greedy.py +57 -0
- autogluon/timeseries/models/gluonts/abstract.py +25 -25
- autogluon/timeseries/models/gluonts/dataset.py +11 -11
- autogluon/timeseries/models/local/__init__.py +0 -7
- autogluon/timeseries/models/local/abstract_local_model.py +15 -18
- autogluon/timeseries/models/local/naive.py +2 -2
- autogluon/timeseries/models/local/npts.py +1 -1
- autogluon/timeseries/models/local/statsforecast.py +12 -12
- autogluon/timeseries/models/multi_window/multi_window_model.py +39 -24
- autogluon/timeseries/models/registry.py +3 -4
- autogluon/timeseries/models/toto/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/__init__.py +9 -0
- autogluon/timeseries/models/toto/_internal/backbone/__init__.py +3 -0
- autogluon/timeseries/models/toto/_internal/backbone/attention.py +196 -0
- autogluon/timeseries/models/toto/_internal/backbone/backbone.py +262 -0
- autogluon/timeseries/models/toto/_internal/backbone/distribution.py +70 -0
- autogluon/timeseries/models/toto/_internal/backbone/kvcache.py +136 -0
- autogluon/timeseries/models/toto/_internal/backbone/rope.py +89 -0
- autogluon/timeseries/models/toto/_internal/backbone/rotary_embedding_torch.py +342 -0
- autogluon/timeseries/models/toto/_internal/backbone/scaler.py +305 -0
- autogluon/timeseries/models/toto/_internal/backbone/transformer.py +333 -0
- autogluon/timeseries/models/toto/_internal/dataset.py +165 -0
- autogluon/timeseries/models/toto/_internal/forecaster.py +423 -0
- autogluon/timeseries/models/toto/dataloader.py +108 -0
- autogluon/timeseries/models/toto/hf_pretrained_model.py +118 -0
- autogluon/timeseries/models/toto/model.py +236 -0
- autogluon/timeseries/predictor.py +301 -103
- autogluon/timeseries/regressor.py +27 -30
- autogluon/timeseries/splitter.py +3 -27
- autogluon/timeseries/trainer/ensemble_composer.py +439 -0
- autogluon/timeseries/trainer/model_set_builder.py +9 -9
- autogluon/timeseries/trainer/prediction_cache.py +16 -16
- autogluon/timeseries/trainer/trainer.py +300 -275
- autogluon/timeseries/trainer/utils.py +17 -0
- autogluon/timeseries/transforms/covariate_scaler.py +8 -8
- autogluon/timeseries/transforms/target_scaler.py +15 -15
- autogluon/timeseries/utils/constants.py +10 -0
- autogluon/timeseries/utils/datetime/lags.py +1 -3
- autogluon/timeseries/utils/datetime/seasonality.py +1 -3
- autogluon/timeseries/utils/features.py +18 -14
- autogluon/timeseries/utils/forecast.py +6 -7
- autogluon/timeseries/utils/timer.py +173 -0
- autogluon/timeseries/version.py +1 -1
- autogluon.timeseries-1.4.1b20251210-py3.11-nspkg.pth +1 -0
- {autogluon.timeseries-1.4.1b20250906.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/METADATA +39 -22
- autogluon_timeseries-1.4.1b20251210.dist-info/RECORD +103 -0
- {autogluon.timeseries-1.4.1b20250906.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/WHEEL +1 -1
- autogluon/timeseries/evaluator.py +0 -6
- autogluon/timeseries/models/chronos/pipeline/__init__.py +0 -10
- autogluon/timeseries/models/chronos/pipeline/base.py +0 -160
- autogluon/timeseries/models/chronos/pipeline/chronos.py +0 -544
- autogluon/timeseries/models/chronos/pipeline/chronos_bolt.py +0 -580
- autogluon.timeseries-1.4.1b20250906-py3.9-nspkg.pth +0 -1
- autogluon.timeseries-1.4.1b20250906.dist-info/RECORD +0 -75
- {autogluon.timeseries-1.4.1b20250906.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/LICENSE +0 -0
- {autogluon.timeseries-1.4.1b20250906.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info/licenses}/NOTICE +0 -0
- {autogluon.timeseries-1.4.1b20250906.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/namespace_packages.txt +0 -0
- {autogluon.timeseries-1.4.1b20250906.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/top_level.txt +0 -0
- {autogluon.timeseries-1.4.1b20250906.dist-info → autogluon_timeseries-1.4.1b20251210.dist-info}/zip-safe +0 -0
--- a/autogluon/timeseries/models/multi_window/multi_window_model.py
+++ b/autogluon/timeseries/models/multi_window/multi_window_model.py
@@ -4,13 +4,13 @@ import logging
 import math
 import os
 import time
-from typing import Any,
+from typing import Any, Type
 
 import numpy as np
 from typing_extensions import Self
 
 import autogluon.core as ag
-from autogluon.timeseries.dataset
+from autogluon.timeseries.dataset import TimeSeriesDataFrame
 from autogluon.timeseries.models.abstract import AbstractTimeSeriesModel
 from autogluon.timeseries.models.local.abstract_local_model import AbstractLocalModel
 from autogluon.timeseries.splitter import AbstractWindowSplitter, ExpandingWindowSplitter
@@ -38,8 +38,8 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
 
     def __init__(
         self,
-        model_base:
-        model_base_kwargs:
+        model_base: AbstractTimeSeriesModel | Type[AbstractTimeSeriesModel],
+        model_base_kwargs: dict[str, Any] | None = None,
         **kwargs,
     ):
         if inspect.isclass(model_base) and issubclass(model_base, AbstractTimeSeriesModel):
@@ -58,8 +58,8 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         self.model_base_type = type(self.model_base)
         self.info_per_val_window = []
 
-        self.most_recent_model:
-        self.most_recent_model_folder:
+        self.most_recent_model: AbstractTimeSeriesModel | None = None
+        self.most_recent_model_folder: str | None = None
         super().__init__(**kwargs)
 
     @property
@@ -83,19 +83,19 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
     def _is_gpu_available(self) -> bool:
         return self._get_model_base()._is_gpu_available()
 
-    def get_minimum_resources(self, is_gpu_available: bool = False) -> dict[str,
+    def get_minimum_resources(self, is_gpu_available: bool = False) -> dict[str, int | float]:
         return self._get_model_base().get_minimum_resources(is_gpu_available)
 
     def _fit(
         self,
         train_data: TimeSeriesDataFrame,
-        val_data:
-        time_limit:
-        num_cpus:
-        num_gpus:
+        val_data: TimeSeriesDataFrame | None = None,
+        time_limit: float | None = None,
+        num_cpus: int | None = None,
+        num_gpus: int | None = None,
         verbosity: int = 2,
-        val_splitter:
-        refit_every_n_windows:
+        val_splitter: AbstractWindowSplitter | None = None,
+        refit_every_n_windows: int | None = 1,
         **kwargs,
     ):
         # TODO: use incremental training for GluonTS models?
@@ -109,9 +109,9 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         if refit_every_n_windows is None:
             refit_every_n_windows = val_splitter.num_val_windows + 1  # only fit model for the first window
 
-        oof_predictions_per_window = []
+        oof_predictions_per_window: list[TimeSeriesDataFrame] = []
         global_fit_start_time = time.time()
-        model:
+        model: AbstractTimeSeriesModel | None = None
 
         for window_index, (train_fold, val_fold) in enumerate(val_splitter.split(train_data)):
             logger.debug(f"\tWindow {window_index}")
@@ -142,6 +142,7 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
                     train_data=train_fold,
                     val_data=val_fold,
                     time_limit=time_left_for_window,
+                    verbosity=verbosity,
                     **kwargs,
                 )
                 model.fit_time = time.time() - model_fit_start_time
@@ -182,8 +183,9 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         self.most_recent_model_folder = most_recent_refit_window  # type: ignore
         self.predict_time = self.most_recent_model.predict_time
         self.fit_time = time.time() - global_fit_start_time - self.predict_time  # type: ignore
-        self.
-
+        self.cache_oof_predictions(oof_predictions_per_window)
+
+        self.val_score = float(np.mean([info["val_score"] for info in self.info_per_val_window]))
 
     def get_info(self) -> dict:
         info = super().get_info()
@@ -198,7 +200,7 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
     def _predict(
         self,
         data: TimeSeriesDataFrame,
-        known_covariates:
+        known_covariates: TimeSeriesDataFrame | None = None,
         **kwargs,
     ) -> TimeSeriesDataFrame:
         if self.most_recent_model is None:
@@ -212,12 +214,25 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         store_predict_time: bool = False,
         **predict_kwargs,
     ) -> None:
-
-
-
-
+        if self._oof_predictions is None or self.most_recent_model is None:
+            raise ValueError(f"{self.name} must be fit before calling score_and_cache_oof")
+
+        # Score on val_data using the most recent model
+        past_data, known_covariates = val_data.get_model_inputs_for_scoring(
+            prediction_length=self.prediction_length, known_covariates_names=self.covariate_metadata.known_covariates
+        )
+        predict_start_time = time.time()
+        val_predictions = self.most_recent_model.predict(
+            past_data, known_covariates=known_covariates, **predict_kwargs
+        )
+
+        self._oof_predictions.append(val_predictions)
+
         if store_predict_time:
-
+            self.predict_time = time.time() - predict_start_time
+
+        if store_val_score:
+            self.val_score = self._score_with_predictions(val_data, val_predictions)
 
     def _get_search_space(self):
         return self.model_base._get_search_space()
@@ -234,7 +249,7 @@ class MultiWindowBacktestingModel(AbstractTimeSeriesModel):
         train_fn_kwargs["init_params"]["model_base_kwargs"] = self.get_params()
         return train_fn_kwargs
 
-    def save(self, path:
+    def save(self, path: str | None = None, verbose: bool = True) -> str:
         most_recent_model = self.most_recent_model
         self.most_recent_model = None
         save_path = super().save(path, verbose)
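The constructor now accepts either a model instance or a model class plus keyword arguments, and the `_fit` signature is fully typed. As a hedged sketch of that contract (the base model choice and the keyword arguments below are illustrative assumptions, not taken from this diff):

```python
# Illustrative sketch only: NaiveModel and the constructor keywords are assumptions.
from autogluon.timeseries.models import NaiveModel
from autogluon.timeseries.models.multi_window.multi_window_model import MultiWindowBacktestingModel
from autogluon.timeseries.splitter import ExpandingWindowSplitter

# model_base may be a class; the wrapper instantiates it from model_base_kwargs
# and refits it across the validation windows produced by the splitter.
mw_model = MultiWindowBacktestingModel(
    model_base=NaiveModel,
    model_base_kwargs={"prediction_length": 24},
    prediction_length=24,
)
splitter = ExpandingWindowSplitter(prediction_length=24, num_val_windows=3)
# The trainer would hand this splitter to _fit as val_splitter=splitter.
```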
--- a/autogluon/timeseries/models/registry.py
+++ b/autogluon/timeseries/models/registry.py
@@ -1,7 +1,6 @@
 from abc import ABCMeta
 from dataclasses import dataclass
 from inspect import isabstract
-from typing import Union
 
 
 @dataclass
@@ -44,7 +43,7 @@ class ModelRegistry(ABCMeta):
             cls.REGISTRY[alias] = record
 
     @classmethod
-    def _get_model_record(cls, alias:
+    def _get_model_record(cls, alias: str | type) -> ModelRecord:
         if isinstance(alias, type):
             alias = alias.__name__
         alias = alias.removesuffix("Model")
@@ -53,11 +52,11 @@ class ModelRegistry(ABCMeta):
         return cls.REGISTRY[alias]
 
     @classmethod
-    def get_model_class(cls, alias:
+    def get_model_class(cls, alias: str | type) -> type:
         return cls._get_model_record(alias).model_class
 
     @classmethod
-    def get_model_priority(cls, alias:
+    def get_model_priority(cls, alias: str | type) -> int:
         return cls._get_model_record(alias).ag_priority
 
     @classmethod
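With the `Union` import gone, the registry methods take a `str | type` alias directly. A hedged sketch of the alias normalization performed by `_get_model_record` ("DeepAR" is assumed to be a registered alias and is used purely for illustration):

```python
# Sketch under assumptions: "DeepAR" is treated as a registered alias for illustration.
import autogluon.timeseries.models  # importing the models package registers the model classes

from autogluon.timeseries.models.registry import ModelRegistry

# A string alias and the class itself resolve to the same record: a class is reduced
# to its __name__, then the trailing "Model" suffix is stripped before the lookup.
model_cls = ModelRegistry.get_model_class("DeepAR")
assert ModelRegistry.get_model_class(model_cls) is model_cls
priority = ModelRegistry.get_model_priority("DeepAR")
```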
--- /dev/null
+++ b/autogluon/timeseries/models/toto/_internal/backbone/attention.py
@@ -0,0 +1,196 @@
+# Unless explicitly stated otherwise all files in this repository are licensed under the Apache-2.0 License.
+#
+# This product includes software developed at Datadog (https://www.datadoghq.com/)
+# Copyright 2025 Datadog, Inc.
+
+import logging
+from enum import Enum
+
+import torch
+from einops import rearrange
+from torch.nn.functional import scaled_dot_product_attention
+
+from .rope import TimeAwareRotaryEmbedding
+
+log = logging.getLogger(__name__)
+
+
+class AttentionAxis(Enum):
+    TIME = 1
+    SPACE = 2
+
+
+class BaseMultiheadAttention(torch.nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float,
+        rotary_emb: TimeAwareRotaryEmbedding | None,
+        use_memory_efficient_attention: bool,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads."
+        self.head_dim = embed_dim // num_heads
+        self.rotary_emb = rotary_emb
+
+        # We allocate a single tensor for the q, k, and v projection matrices,
+        # multiply them with the inputs, and then split the projected tensors into q, k, and v using unbind.
+        # This reduces overhead a bit vs. having multiple separate Linear layers,
+        # which need to be initialized, tracked by the optimizer, etc.
+        self.wQKV = torch.nn.Linear(embed_dim, embed_dim * 3)
+        self.dropout = dropout
+        self.use_memory_efficient_attention = use_memory_efficient_attention
+        self.wO = torch.nn.Linear(embed_dim, embed_dim)
+
+        assert not self.use_memory_efficient_attention, (
+            "xformers is not available, so use_memory_efficient_attention must be False"
+        )
+
+        if not hasattr(self, "attention_axis") or self.attention_axis not in (AttentionAxis.TIME, AttentionAxis.SPACE):
+            raise ValueError("Child class must define attention_axis as AttentionAxis.TIME or AttentionAxis.SPACE.")
+
+    def rearrange_inputs(self, inputs: torch.Tensor) -> torch.Tensor:
+        pattern = (
+            "batch variate seq_len embed_dim -> (batch variate) seq_len embed_dim"
+            if self.attention_axis == AttentionAxis.TIME
+            else "batch variate seq_len embed_dim -> (batch seq_len) variate embed_dim"
+        )
+
+        return rearrange(inputs, pattern)
+
+    def get_qkv(
+        self,
+        inputs: torch.Tensor,
+    ) -> tuple[torch.Tensor, ...]:
+        pattern: str = ""
+        if self.attention_axis == AttentionAxis.TIME and self.use_memory_efficient_attention:
+            pattern = "batch_X_variate seq_len (qkv head_dim n_heads) -> qkv batch_X_variate seq_len n_heads head_dim"
+        elif self.attention_axis == AttentionAxis.TIME and not self.use_memory_efficient_attention:
+            pattern = "batch_X_variate seq_len (qkv head_dim n_heads) -> qkv batch_X_variate n_heads seq_len head_dim"
+        elif self.attention_axis == AttentionAxis.SPACE and self.use_memory_efficient_attention:
+            pattern = "batch_X_seq_len variate (qkv head_dim n_heads) -> qkv batch_X_seq_len variate n_heads head_dim"
+        elif self.attention_axis == AttentionAxis.SPACE and not self.use_memory_efficient_attention:
+            pattern = "batch_X_seq_len variate (qkv head_dim n_heads) -> qkv batch_X_seq_len n_heads variate head_dim"
+
+        assert pattern
+        qkv = self.wQKV(inputs.contiguous())
+        return rearrange(qkv, pattern, qkv=3, head_dim=self.head_dim, n_heads=self.num_heads).unbind(dim=0)
+
+    def positional_embedding(self, q, k, v, kv_cache, layer_idx):
+        # Apply the rotary embeddings
+        seq_pos_offset = 0
+        if self.rotary_emb is not None and self.attention_axis == AttentionAxis.TIME:
+            if kv_cache is not None:
+                seq_pos_offset = kv_cache.seq_len(layer_idx)
+
+            # We need to permute because rotary embeddings expect the sequence dimension to be the second-to-last dimension
+            q, k = self.rotary_emb.rotate_queries_and_keys(q, k, seq_pos_offset=seq_pos_offset)
+
+        if kv_cache is not None and self.attention_axis == AttentionAxis.TIME:
+            # First, we append the current input key and value tensors to the cache.
+            # This concatenates the current key and value tensors to the existing key and value tensors
+            kv_cache.append(layer_idx, (k, v))
+            # Then, we retrieve the key and value tensors from the cache.
+            # This includes all the key and value tensors from previous time steps
+            # as well as the current time step.
+            k, v = kv_cache[layer_idx]
+
+        q = q.contiguous()
+        k = k.contiguous().to(q.dtype)  # Ensure k is the same dtype as q; this is necessary when using mixed precision
+        v = v.contiguous().to(q.dtype)  # Ensure v is the same dtype as q; this is necessary when using mixed precision
+
+        return q, k, v, seq_pos_offset
+
+    def rearrange_output(self, output: torch.Tensor, batch: int, variate: int, seq_len: int) -> torch.Tensor:
+        if self.attention_axis == AttentionAxis.TIME and self.use_memory_efficient_attention:
+            pattern = "(batch variate) seq_len n_heads head_dim -> batch variate seq_len (n_heads head_dim)"
+        elif self.attention_axis == AttentionAxis.TIME and not self.use_memory_efficient_attention:
+            pattern = "(batch variate) n_heads seq_len head_dim -> batch variate seq_len (n_heads head_dim)"
+        elif self.attention_axis == AttentionAxis.SPACE and self.use_memory_efficient_attention:
+            pattern = "(batch seq_len) variate n_heads head_dim -> batch variate seq_len (n_heads head_dim)"
+        elif self.attention_axis == AttentionAxis.SPACE and not self.use_memory_efficient_attention:
+            pattern = "(batch seq_len) n_heads variate head_dim -> batch variate seq_len (n_heads head_dim)"
+
+        return rearrange(output, pattern, batch=batch, variate=variate, seq_len=seq_len)  # type: ignore
+
+    def run_attention(self, attention_mask, q, k, v, seq_pos_offset, dropout, seq_len, variate):
+        # Determine dimension ranges for attention
+        # Ensure the last query vector index is used from the cache
+        q_dim_start, q_dim_end = seq_pos_offset, seq_pos_offset + seq_len
+        kv_dim_start, kv_dim_end = 0, v.shape[1] if self.use_memory_efficient_attention else v.shape[2]
+        if self.attention_axis == AttentionAxis.TIME:
+            attention_mask = (
+                attention_mask[..., q_dim_start:q_dim_end, kv_dim_start:kv_dim_end]
+                if torch.is_tensor(attention_mask)
+                else None
+            )
+            return scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=attention_mask,
+                dropout_p=dropout,
+                is_causal=(attention_mask is None and seq_pos_offset == 0),
+            )
+        elif self.attention_axis == AttentionAxis.SPACE:
+            # We don't use causal masking for space-wise attention
+            attention_mask = (
+                attention_mask[..., kv_dim_start:kv_dim_end, kv_dim_start:kv_dim_end]
+                if torch.is_tensor(attention_mask)
+                else None
+            )
+            return scaled_dot_product_attention(q, k, v, attn_mask=attention_mask, dropout_p=dropout, is_causal=False)
+        else:
+            raise ValueError("Invalid attention axis")
+
+    def forward(
+        self,
+        layer_idx: int,
+        inputs: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        kv_cache=None,
+    ) -> torch.Tensor:
+        batch_size, variate, seq_len, _ = inputs.shape
+        dropout = self.dropout if self.training else 0.0
+
+        rearranged_inputs = self.rearrange_inputs(inputs)
+        q, k, v = self.get_qkv(rearranged_inputs)
+
+        q, k, v, seq_pos_offset = self.positional_embedding(q, k, v, kv_cache, layer_idx)
+
+        output = self.run_attention(attention_mask, q, k, v, seq_pos_offset, dropout, seq_len, variate)
+
+        output = self.rearrange_output(output, batch_size, variate, seq_len)
+        return self.wO(output)
+
+
+class TimeWiseMultiheadAttention(BaseMultiheadAttention):
+    """
+    Computes standard multihead causal attention over the time axis.
+    It does this by flattening out the variates along the batch dimension.
+    It also applies rotary position embeddings to the query and key matrices
+    in order to incorporate relative positional information.
+    """
+
+    attention_axis = AttentionAxis.TIME
+
+
+class SpaceWiseMultiheadAttention(BaseMultiheadAttention):
+    """
+    Computes bidirectional multihead attention over the space axis (i.e. across variates within
+    a multi-variate time series). This is done by flattening out the time axis along the batch dimension.
+    This allows the model to attend to different variates at the same time point. By alternating
+    between time-wise and space-wise attention, the model can learn both temporal and cross-variate
+    dependencies in the data.
+
+    Unlike with time-wise attention, we don't apply rotary embeddings here
+    because we want cross-variate attention to be invariant to the order of the variates.
+    """
+
+    attention_axis = AttentionAxis.SPACE
+
+
+MultiHeadAttention = TimeWiseMultiheadAttention | SpaceWiseMultiheadAttention
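The attention classes above expect inputs of shape (batch, variate, seq_len, embed_dim); time-wise attention folds the variate axis into the batch and runs causal scaled-dot-product attention. A minimal sketch of that shape contract, with illustrative sizes and the xformers path disabled as the assertion in `__init__` requires:

```python
# Minimal sketch of the shape contract; tensor sizes are illustrative assumptions.
import torch

from autogluon.timeseries.models.toto._internal.backbone.attention import TimeWiseMultiheadAttention

attn = TimeWiseMultiheadAttention(
    embed_dim=64,
    num_heads=4,
    dropout=0.0,
    rotary_emb=None,                       # rotary embeddings are optional
    use_memory_efficient_attention=False,  # the xformers path is asserted off in this build
)
x = torch.randn(2, 3, 16, 64)      # (batch, variate, seq_len, embed_dim)
out = attn(layer_idx=0, inputs=x)  # no mask given, so causal masking is applied over time
assert out.shape == x.shape
```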
--- /dev/null
+++ b/autogluon/timeseries/models/toto/_internal/backbone/backbone.py
@@ -0,0 +1,262 @@
+# Unless explicitly stated otherwise all files in this repository are licensed under the Apache-2.0 License.
+#
+# This product includes software developed at Datadog (https://www.datadoghq.com/)
+# Copyright 2025 Datadog, Inc.
+
+import math
+from typing import NamedTuple
+
+import torch
+
+from .distribution import MixtureOfStudentTsOutput
+from .kvcache import KVCache
+from .scaler import CausalPatchStdMeanScaler
+from .transformer import Transformer
+
+
+class TotoOutput(NamedTuple):
+    """
+    Output of the Toto model. Contains the output distribution, the location parameters,
+    and the scale parameters.
+    """
+
+    distribution: torch.distributions.Distribution
+    loc: torch.Tensor
+    scale: torch.Tensor
+
+
+def patchify_id_mask(id_mask: torch.Tensor, patch_size: int) -> torch.Tensor:
+    patched_id_mask = id_mask.unfold(dimension=-1, size=patch_size, step=patch_size)
+    patched_id_mask_min = patched_id_mask.min(-1).values
+    patched_id_mask_max = patched_id_mask.max(-1).values
+    assert torch.eq(patched_id_mask_min, patched_id_mask_max).all(), "Patches cannot span multiple datasets"
+    return patched_id_mask_min
+
+
+class PatchEmbedding(torch.nn.Module):
+    """
+    Multivariate time series patch embedding.
+    Patchifies each variate separately.
+    """
+
+    def __init__(self, patch_size: int, stride: int, embed_dim: int):
+        super().__init__()
+        self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.stride = stride
+        self.projection = torch.nn.Linear(self.patch_size, self.embed_dim)
+
+    def _patchify(self, x: torch.Tensor) -> torch.Tensor:
+        return x.unfold(dimension=-1, size=self.patch_size, step=self.stride)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        id_mask: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        assert x.shape[-1] % self.patch_size == 0, (
+            f"Series length ({x.shape=}) must be divisible by ({self.patch_size=})"
+        )
+        x_patched: torch.Tensor = self._patchify(x)
+        id_mask_patched: torch.Tensor = self._patchify(id_mask)
+
+        assert torch.eq(id_mask_patched.min(-1).values, id_mask_patched.max(-1).values).all(), (
+            "Patches cannot span multiple datasets"
+        )
+
+        return (
+            self.projection(x_patched),
+            id_mask_patched.min(-1).values,
+        )
+
+
+class TotoBackbone(torch.nn.Module):
+    """
+    Toto (Timeseries-Optimized Transformer for Observability) is a transformer-based model for multivariate
+    time series forecasting. It applies a patch embedding to the input data, followed by a transformer
+    that alternates between time-wise and space-wise attention. The transformer is followed by a linear projection
+    that maps the transformer output to the output distribution.
+
+    The output distribution can be a single distribution (e.g. Gaussian) or a mixture of distributions.
+    If a mixture of distributions is used, the model will learn to predict the mixture weights
+    as well as the parameters of the individual distributions.
+
+    Parameters
+    ----------
+    patch_size
+        Size of the patch to use for the patch embedding.
+    stride
+        Stride to use for the patch embedding.
+    embed_dim
+        Dimension of the model's latent space.
+    num_layers
+        Number of transformer layers to use.
+    num_heads
+        Number of attention heads to use in each self-attention layer.
+    mlp_hidden_dim
+        Dimension of the hidden layer in the feedforward network.
+    dropout
+        Dropout rate to use in the model.
+    spacewise_every_n_layers
+        How many time-wise transformer layers to apply between each space-wise transformer layer.
+    spacewise_first
+        Whether to apply space-wise attention before time-wise attention.
+    scaler_cls
+        Class to use for scaling the input data.
+    output_distribution_classes
+        List of classes to use for the output distribution. If a single class is provided, the model
+        will output a single distribution. If multiple classes are provided, the model will output a
+        learned mixture of distributions.
+    output_distribution_kwargs
+        Keyword arguments to pass to the output distribution class. Note: this currently only works
+        with a single output distribution class.
+    use_memory_efficient_attention:
+        Whether to use memory-efficient attention. If True, the model will use the memory-efficient attention from xFormers.
+    stabilize_with_global:
+        Whether to use global statistics to stabilize causal statistics by clamping extreme values. Only applies to causal scalers.
+    scale_factor_exponent:
+        Exponent that controls the allowed range of deviation from global scale for causal scalers.
+    """
+
+    def __init__(
+        self,
+        patch_size: int,
+        stride: int,
+        embed_dim: int,
+        num_layers: int,
+        num_heads: int,
+        mlp_hidden_dim: int,
+        dropout: float,
+        spacewise_every_n_layers: int,
+        scaler_cls: str,
+        output_distribution_classes: list[str],
+        spacewise_first: bool = True,
+        output_distribution_kwargs: dict | None = None,
+        use_memory_efficient_attention: bool = True,
+        stabilize_with_global: bool = True,
+        scale_factor_exponent: float = 10.0,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        # strings are used when loading a safetensors checkpoint
+        # Initialize patch-based scalers with the correct patch_size
+
+        self.scaler = CausalPatchStdMeanScaler(
+            patch_size=patch_size,
+            stabilize_with_global=stabilize_with_global,
+            scale_factor_exponent=scale_factor_exponent,
+        )
+        self.patch_embed = PatchEmbedding(patch_size, stride, embed_dim)
+        self.dropout = dropout
+        self.num_layers = num_layers
+        self.use_memory_efficient_attention = use_memory_efficient_attention
+        self.transformer = Transformer(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            num_layers=self.num_layers,
+            mlp_hidden_dim=mlp_hidden_dim,
+            dropout=dropout,
+            spacewise_every_n_layers=spacewise_every_n_layers,
+            spacewise_first=spacewise_first,
+            use_memory_efficient_attention=self.use_memory_efficient_attention,
+        )
+        self.unembed = torch.nn.Linear(embed_dim, embed_dim * patch_size)
+
+        # TODO[BEN] this doesn't need to be a list
+        output_distribution_classes_ = [MixtureOfStudentTsOutput]
+        self.output_distribution = output_distribution_classes_[0](embed_dim, **(output_distribution_kwargs or {}))
+
+    def allocate_kv_cache(
+        self,
+        batch_size: int,
+        num_variates: int,
+        max_time_steps: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> KVCache:
+        return KVCache(
+            batch_size=batch_size,
+            num_variates=num_variates,
+            transformer_layers=list(self.transformer.layers),
+            num_layers=self.num_layers,
+            embed_dim=self.embed_dim,
+            num_heads=self.transformer.layers[0].num_heads,  # type: ignore
+            max_seq_len=math.ceil(max_time_steps / self.patch_embed.stride),
+            device=device,
+            dtype=dtype,
+            use_memory_efficient_attention=self.use_memory_efficient_attention,
+        )
+
+    def backbone(
+        self,
+        inputs: torch.Tensor,
+        input_padding_mask: torch.Tensor,
+        id_mask: torch.Tensor,
+        kv_cache: KVCache | None = None,
+        scaling_prefix_length: int | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        scaled_inputs: torch.Tensor
+        loc: torch.Tensor
+        scale: torch.Tensor
+
+        # Standard scaling operation, same API but without ID mask.
+        scaled_inputs, loc, scale = self.scaler(
+            inputs,
+            weights=torch.ones_like(inputs, device=inputs.device),
+            padding_mask=input_padding_mask,
+            prefix_length=scaling_prefix_length,
+        )
+
+        if kv_cache is not None:
+            prefix_len = self.patch_embed.stride * kv_cache.current_len(0)
+
+            # Truncate inputs so that the transformer only processes
+            # the last patch in the sequence. We'll use the KVCache
+            # for the earlier patches.
+            scaled_inputs = scaled_inputs[:, :, prefix_len:]
+
+            # As a simplification, when using kv cache we only allow decoding
+            # one step at a time after the initial forward pass.
+            assert (prefix_len == 0) or (scaled_inputs.shape[-1] == self.patch_embed.stride), (
+                "Must decode one step at a time."
+            )
+
+            input_padding_mask = input_padding_mask[:, :, prefix_len:]
+            id_mask = id_mask[:, :, prefix_len:]
+
+        embeddings: torch.Tensor
+        reduced_id_mask: torch.Tensor
+
+        embeddings, reduced_id_mask = self.patch_embed(scaled_inputs, id_mask)
+
+        # Apply the transformer on the embeddings
+        transformed: torch.Tensor = self.transformer(embeddings, reduced_id_mask, kv_cache)
+
+        # Unembed and flatten the sequence
+        unembedded = self.unembed(transformed)
+        batch_size, num_variates, seq_len = unembedded.shape[:3]
+        patch_size = unembedded.shape[-1] // self.embed_dim
+        flattened = unembedded.view(batch_size, num_variates, seq_len * patch_size, self.embed_dim)
+        return flattened, loc, scale
+
+    def forward(
+        self,
+        inputs: torch.Tensor,
+        input_padding_mask: torch.Tensor,
+        id_mask: torch.Tensor,
+        kv_cache: KVCache | None = None,
+        scaling_prefix_length: int | None = None,
+    ) -> TotoOutput:
+        flattened, loc, scale = self.backbone(
+            inputs,
+            input_padding_mask,
+            id_mask,
+            kv_cache,
+            scaling_prefix_length,
+        )
+
+        return TotoOutput(self.output_distribution(flattened), loc, scale)
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
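PatchEmbedding defines the shape contract the backbone relies on: each variate's series is cut into non-overlapping patches (the series length must be divisible by the patch size) and projected to the embedding dimension, while the dataset id mask is reduced to one value per patch. A hedged sketch with illustrative sizes:

```python
# Sketch of the patch-embedding shape contract; sizes are illustrative assumptions.
import torch

from autogluon.timeseries.models.toto._internal.backbone.backbone import PatchEmbedding

patch_embed = PatchEmbedding(patch_size=16, stride=16, embed_dim=64)
series = torch.randn(2, 3, 128)   # (batch, variate, time); time is a multiple of patch_size
id_mask = torch.zeros(2, 3, 128)  # all timesteps belong to the same dataset
embeddings, reduced_id_mask = patch_embed(series, id_mask)
assert embeddings.shape == (2, 3, 128 // 16, 64)   # one embed_dim vector per patch
assert reduced_id_mask.shape == (2, 3, 128 // 16)  # one dataset id per patch
```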