autogluon.timeseries 1.4.1b20250926__py3-none-any.whl → 1.4.1b20250930__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of autogluon.timeseries might be problematic.

Files changed (26)
  1. autogluon/timeseries/models/__init__.py +2 -0
  2. autogluon/timeseries/models/toto/__init__.py +3 -0
  3. autogluon/timeseries/models/toto/_internal/__init__.py +9 -0
  4. autogluon/timeseries/models/toto/_internal/backbone/__init__.py +3 -0
  5. autogluon/timeseries/models/toto/_internal/backbone/attention.py +197 -0
  6. autogluon/timeseries/models/toto/_internal/backbone/backbone.py +262 -0
  7. autogluon/timeseries/models/toto/_internal/backbone/distribution.py +70 -0
  8. autogluon/timeseries/models/toto/_internal/backbone/kvcache.py +136 -0
  9. autogluon/timeseries/models/toto/_internal/backbone/rope.py +94 -0
  10. autogluon/timeseries/models/toto/_internal/backbone/scaler.py +306 -0
  11. autogluon/timeseries/models/toto/_internal/backbone/transformer.py +333 -0
  12. autogluon/timeseries/models/toto/_internal/dataset.py +165 -0
  13. autogluon/timeseries/models/toto/_internal/forecaster.py +423 -0
  14. autogluon/timeseries/models/toto/dataloader.py +108 -0
  15. autogluon/timeseries/models/toto/hf_pretrained_model.py +119 -0
  16. autogluon/timeseries/models/toto/model.py +234 -0
  17. autogluon/timeseries/version.py +1 -1
  18. {autogluon.timeseries-1.4.1b20250926.dist-info → autogluon.timeseries-1.4.1b20250930.dist-info}/METADATA +10 -5
  19. {autogluon.timeseries-1.4.1b20250926.dist-info → autogluon.timeseries-1.4.1b20250930.dist-info}/RECORD +26 -11
  20. /autogluon.timeseries-1.4.1b20250926-py3.9-nspkg.pth → /autogluon.timeseries-1.4.1b20250930-py3.9-nspkg.pth +0 -0
  21. {autogluon.timeseries-1.4.1b20250926.dist-info → autogluon.timeseries-1.4.1b20250930.dist-info}/LICENSE +0 -0
  22. {autogluon.timeseries-1.4.1b20250926.dist-info → autogluon.timeseries-1.4.1b20250930.dist-info}/NOTICE +0 -0
  23. {autogluon.timeseries-1.4.1b20250926.dist-info → autogluon.timeseries-1.4.1b20250930.dist-info}/WHEEL +0 -0
  24. {autogluon.timeseries-1.4.1b20250926.dist-info → autogluon.timeseries-1.4.1b20250930.dist-info}/namespace_packages.txt +0 -0
  25. {autogluon.timeseries-1.4.1b20250926.dist-info → autogluon.timeseries-1.4.1b20250930.dist-info}/top_level.txt +0 -0
  26. {autogluon.timeseries-1.4.1b20250926.dist-info → autogluon.timeseries-1.4.1b20250930.dist-info}/zip-safe +0 -0
autogluon/timeseries/models/toto/_internal/backbone/transformer.py
@@ -0,0 +1,333 @@
+ # Unless explicitly stated otherwise all files in this repository are licensed under the Apache-2.0 License.
+ #
+ # This product includes software developed at Datadog (https://www.datadoghq.com/)
+ # Copyright 2025 Datadog, Inc.
+
+ from typing import Optional, Union, cast
+
+ import torch
+ import torch.nn.functional as F
+ from einops import rearrange
+ from rotary_embedding_torch import RotaryEmbedding
+
+ from .attention import (
+     AttentionAxis,
+     MultiHeadAttention,
+     SpaceWiseMultiheadAttention,
+     TimeWiseMultiheadAttention,
+ )
+ from .kvcache import KVCache
+ from .rope import TimeAwareRotaryEmbedding
+
+
+ class SwiGLU(torch.nn.Module):
+     """
+     https://arxiv.org/abs/2002.05202
+     NOTE: x should be 2x the size you want
+     """
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # Note this ordering is unusual, but is done so to match xFormers
+         gate, x = x.chunk(2, dim=-1)
+         return F.silu(gate) * x
+
+
+ class RMSNorm(torch.nn.Module):
+     def __init__(self, dim: int, include_weight: bool = True, eps: float = 1e-8):
+         super(RMSNorm, self).__init__()
+         self.eps = eps
+         if include_weight:
+             self.scale: Optional[torch.nn.Parameter] = torch.nn.Parameter(torch.ones(dim))
+         else:
+             self.scale = None
+
+     def forward(self, x: torch.Tensor):
+         x_normed = x / torch.sqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
+         return x_normed if self.scale is None else x_normed * self.scale
+
+     def increment_and_forward_(self, x: torch.Tensor, y: torch.Tensor):
+         """
+         If you need the fused addition with RMS norm, do the same check here.
+         """
+         return self.forward(x + y)
+
+
+ def make_batched_block_mask(t: torch.Tensor) -> torch.Tensor:
+     unsqueezed = rearrange(t, "... d -> ... 1 d")
+     return unsqueezed == unsqueezed.transpose(-1, -2)
+
+
+ class TransformerLayer(torch.nn.Module):
+     """
+     A transformer block that applies multihead attention followed by a feedforward network.
+
+     The transformer can be configured to apply time-wise attention (i.e. attention over the time axis)
+     or space-wise attention (i.e. attention over the variate axis).
+
+     The transformer block uses pre-norm, which is a variant of the transformer architecture where
+     LayerNorm is applied before each sublayer, rather than after. This is the approach taken in
+     LLaMA and other recent transformer-based models.
+
+     The transformer block also uses SwiGLU, a variant of the Gated Linear Unit (GLU) activation
+     that uses the Swish activation function. This activation function has been used extensively
+     in recent transformer-based models and has been shown to improve performance.
+     """
+
+     embed_dim: int
+     num_heads: int
+     mlp_hidden_dim: int
+     dropout: float
+     attention_axis: AttentionAxis
+
+     def __init__(
+         self,
+         embed_dim: int,
+         num_heads: int,
+         mlp_hidden_dim: int,
+         dropout: float,
+         rotary_emb: Optional[RotaryEmbedding] = None,
+         attention_axis: AttentionAxis = AttentionAxis.TIME,
+         RMS_norm: bool = True,
+         use_memory_efficient_attention: bool = True,
+     ):
+         super().__init__()
+         self.embed_dim = embed_dim
+         self.num_heads = num_heads
+         self.mlp_hidden_dim = mlp_hidden_dim
+         self.dropout = dropout
+         self.attention_axis = attention_axis
+
+         if RMS_norm:
+             self.norm1: Union[RMSNorm, torch.nn.LayerNorm] = RMSNorm(embed_dim)
+             self.norm2: Union[RMSNorm, torch.nn.LayerNorm] = RMSNorm(embed_dim)
+
+         else:
+             self.norm1 = torch.nn.LayerNorm(embed_dim)
+             self.norm2 = torch.nn.LayerNorm(embed_dim)
+
+         self.attention: MultiHeadAttention
+
+         if attention_axis == AttentionAxis.TIME:
+             self.attention = TimeWiseMultiheadAttention(
+                 embed_dim=embed_dim,
+                 num_heads=num_heads,
+                 dropout=dropout,
+                 rotary_emb=rotary_emb,  # type: ignore
+                 use_memory_efficient_attention=use_memory_efficient_attention,
+             )
+         elif attention_axis == AttentionAxis.SPACE:
+             self.attention = SpaceWiseMultiheadAttention(
+                 embed_dim=embed_dim,
+                 num_heads=num_heads,
+                 dropout=dropout,
+                 rotary_emb=None,
+                 use_memory_efficient_attention=use_memory_efficient_attention,
+             )
+         else:
+             raise ValueError("Invalid attention axis")
+
+         self.mlp = torch.nn.Sequential(
+             torch.nn.Linear(embed_dim, 2 * mlp_hidden_dim),
+             SwiGLU(),
+             torch.nn.Linear(mlp_hidden_dim, embed_dim),
+             torch.nn.Dropout(dropout),
+         )
+
+     def forward(
+         self,
+         layer_idx: int,
+         inputs: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         kv_cache: Optional[KVCache] = None,
+     ) -> torch.Tensor:
+         pre_norm_1 = self.norm1(inputs)
+         hidden_state = inputs + self.attention(layer_idx, pre_norm_1, attention_mask, kv_cache).contiguous()
+
+         pre_norm_2 = self.norm2(hidden_state)
+         return hidden_state + self.mlp(pre_norm_2)
+
+
+ class Transformer(torch.nn.Module):
+     """
+     A stack of transformer layers. The transformer alternates between time-wise and space-wise attention
+     to learn both temporal and cross-variate dependencies in the data.
+
+     Based on the intuition that time-wise attention is more important overall than space-wise attention
+     (because an individual variate is more likely to be correlated with itself across time than with other variates),
+     the transformer can be configured to apply space-wise attention less frequently than time-wise attention.
+     This is controlled by the `spacewise_every_n_layers` parameter, which specifies how many time-wise transformer
+     layers to apply between every space-wise transformer layer.
+
+     Parameters
+     ----------
+     num_layers
+         Number of transformer layers to use.
+     embed_dim
+         Embedding dimension of the model.
+     num_heads
+         Number of attention heads to use in each self-attention layer.
+     mlp_hidden_dim
+         Dimension of the hidden layer in the feedforward network.
+     dropout
+         Dropout rate to use in the model.
+     spacewise_every_n_layers
+         How many time-wise transformer layers to apply between each space-wise transformer layer.
+     spacewise_first
+         Whether to apply space-wise attention before time-wise attention.
+     use_memory_efficient_attention
+         Whether to use memory-efficient attention. If True, the model will use the memory-efficient attention from xFormers.
+     """
+
+     def __init__(
+         self,
+         num_layers: int,
+         embed_dim: int,
+         num_heads: int,
+         mlp_hidden_dim: int,
+         dropout: float,
+         spacewise_every_n_layers: int,
+         spacewise_first: bool,
+         use_memory_efficient_attention: bool = True,
+     ):
+         super().__init__()
+
+         assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads."
+
+         self.rotary_emb = TimeAwareRotaryEmbedding(
+             embed_dim // num_heads,
+             use_xpos=True,
+             cache_if_possible=True,
+             seq_before_head_dim=use_memory_efficient_attention,
+         )
+         attention_axes = self._get_layer_types(num_layers, spacewise_every_n_layers, spacewise_first)
+
+         self.use_memory_efficient_attention = use_memory_efficient_attention
+
+         self.layers = torch.nn.ModuleList(
+             [
+                 TransformerLayer(
+                     embed_dim=embed_dim,
+                     num_heads=num_heads,
+                     mlp_hidden_dim=mlp_hidden_dim,
+                     dropout=dropout,
+                     rotary_emb=self.rotary_emb,
+                     attention_axis=attention_axes[i],
+                     use_memory_efficient_attention=self.use_memory_efficient_attention,
+                 )
+                 for i in range(num_layers)
+             ]
+         )
+
+     def _get_mask(
+         self,
+         num_heads: int,
+         dtype: torch.dtype,
+         id_mask: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         """
+         Create and process the space-wise attention mask.
+
+         Args:
+             num_heads: Number of attention heads.
+             dtype: Desired dtype for the bias tensor.
+             id_mask: Mask indicating which variates belong to the same group.
+
+         Returns:
+             Processed attention mask tensor with the correct shape for space-wise attention.
+         """
+
+         if id_mask is None:
+             raise ValueError("id_mask must be provided for spacewise masks.")
+
+         # Create spacewise mask
+         mask = make_batched_block_mask(id_mask.transpose(-1, -2))
+
+         if self.use_memory_efficient_attention:
+             mask = self._pad_to_multiple(mask)
+         mask = mask.float().masked_fill(~mask, float("-inf")).masked_fill(mask, 0.0).to(dtype)
+
+         # Rearrange for space-wise attention
+         mask = rearrange(mask, "batch seq_len variate1 variate2 -> (batch seq_len) 1 variate1 variate2")
+         # Stack along num_heads dimension
+         return mask.expand(-1, num_heads, -1, -1).contiguous()
+
+     def _pad_to_multiple(
+         self,
+         tensor: torch.Tensor,
+         multiple: int = 8,
+         causal: bool = False,  # New flag to indicate causal mask extension
+     ) -> torch.Tensor:
+         """
+         Pads the last two dimensions of a tensor to be divisible by `multiple`.
+         For causal masks, the padded area is filled with the continued lower-triangular pattern,
+         rather than with zeros.
+         """
+         pad_amount = (multiple - tensor.shape[-1] % multiple) % multiple
+         if pad_amount > 0:
+             new_size = tensor.shape[-1] + pad_amount
+             if causal:
+                 # Create a full causal mask for the new size.
+                 full_mask = torch.tril(torch.ones((new_size, new_size), dtype=tensor.dtype, device=tensor.device))
+                 # Preserve any modifications from the original mask (e.g., condition tokens in top-left)
+                 full_mask[: tensor.shape[-1], : tensor.shape[-1]] = tensor
+                 tensor = full_mask
+             else:
+                 tensor = F.pad(tensor, (0, pad_amount, 0, pad_amount))
+         return tensor
+
+     def _get_layer_types(
+         self,
+         num_layers: int,
+         spacewise_every_n_layers: int,
+         spacewise_first: bool,
+     ) -> list[AttentionAxis]:
+         if spacewise_every_n_layers == -1:
+             return [AttentionAxis.TIME] * num_layers
+         assert num_layers % spacewise_every_n_layers == 0
+
+         block = [AttentionAxis.TIME] * (spacewise_every_n_layers - 1)
+
+         if spacewise_first:
+             block = [AttentionAxis.SPACE] + block
+         else:
+             block = block + [AttentionAxis.SPACE]
+
+         layer_types = block * (num_layers // spacewise_every_n_layers)
+
+         return layer_types
+
+     def forward(
+         self,
+         inputs: torch.Tensor,
+         id_mask: torch.Tensor,
+         kv_cache: Optional[KVCache] = None,
+     ) -> torch.Tensor:
+         batch, _, seq_len, _ = inputs.shape
+         # Get the sequence length by looking up a timewise layer in the kv cache.
+         # Regardless of whether spacewise is first in the stack, the layer
+         # at index 1 is always a timewise layer.
+         seq_len = (kv_cache.seq_len(1) if kv_cache else 0) + seq_len
+
+         num_heads: int = cast(int, self.layers[0].num_heads)
+
+         timewise_attention_mask = None
+
+         # We create the space-wise attention mask by building a block-diagonal mask from the ID mask
+         # in the space-wise direction. This ensures that the model can only attend to
+         # variates in the same group.
+         spacewise_attention_mask = self._get_mask(
+             num_heads=num_heads,
+             dtype=inputs.dtype,
+             id_mask=id_mask,
+         )
+
+         for layer_idx, layer in enumerate(self.layers):
+             inputs = layer(
+                 layer_idx,
+                 inputs,
+                 (timewise_attention_mask if layer.attention_axis == AttentionAxis.TIME else spacewise_attention_mask),
+                 kv_cache,
+             )
+         return inputs
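
The Transformer stack above interleaves time-wise and space-wise layers according to `spacewise_every_n_layers` and `spacewise_first` (see `_get_layer_types`). The following is a minimal standalone sketch of that scheduling rule, for illustration only; the `AttentionAxis` stand-in and the `layer_schedule` helper are hypothetical names, not part of the autogluon.timeseries API:

    from enum import Enum

    class AttentionAxis(Enum):  # stand-in for the package's AttentionAxis enum
        TIME = "time"
        SPACE = "space"

    def layer_schedule(num_layers: int, spacewise_every_n_layers: int, spacewise_first: bool):
        # -1 means "never use space-wise attention": every layer attends over time.
        if spacewise_every_n_layers == -1:
            return [AttentionAxis.TIME] * num_layers
        assert num_layers % spacewise_every_n_layers == 0
        # Each block holds (n - 1) time-wise layers plus one space-wise layer,
        # placed first or last depending on spacewise_first.
        block = [AttentionAxis.TIME] * (spacewise_every_n_layers - 1)
        block = [AttentionAxis.SPACE] + block if spacewise_first else block + [AttentionAxis.SPACE]
        return block * (num_layers // spacewise_every_n_layers)

    # num_layers=6, spacewise_every_n_layers=3, spacewise_first=False:
    # ['TIME', 'TIME', 'SPACE', 'TIME', 'TIME', 'SPACE']
    print([axis.name for axis in layer_schedule(6, 3, spacewise_first=False)])

Space-wise layers then receive the block-diagonal mask produced by `_get_mask`, so a variate can only attend to other variates that share its ID.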
autogluon/timeseries/models/toto/_internal/dataset.py
@@ -0,0 +1,165 @@
+ # Unless explicitly stated otherwise all files in this repository are licensed under the Apache-2.0 License.
+ #
+ # This product includes software developed at Datadog (https://www.datadoghq.com/)
+ # Copyright 2025 Datadog, Inc.
+
+ from functools import reduce
+ from typing import NamedTuple, Union
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ from einops import repeat
+
+
+ def pad_array(
+     values: Union[np.ndarray, torch.Tensor],
+     patch_stride: int,
+ ) -> torch.Tensor:
+     """
+     Makes sure that the series length is divisible by the patch_stride
+     by adding left-padding.
+     """
+     if isinstance(values, np.ndarray):
+         values = torch.from_numpy(values)
+     series_len = values.shape[-1]
+     # left-pad the time series to make sure we can divide it into patches.
+     padded_length = int(np.ceil(series_len / patch_stride) * patch_stride)
+     if values.ndim == 2:  # variates series_len
+         padded_values = torch.zeros((values.shape[0], padded_length), dtype=values.dtype, device=values.device)
+     elif values.ndim == 3:  # batch variates series_len
+         padded_values = torch.zeros(
+             (values.shape[0], values.shape[1], padded_length),
+             dtype=values.dtype,
+             device=values.device,
+         )
+     else:
+         raise ValueError(f"Unsupported number of dimensions: {values.ndim}")
+     padded_values[..., -series_len:] = values
+
+     return padded_values
+
+
+ def pad_id_mask(
+     id_mask: torch.Tensor,
+     patch_stride: int,
+ ) -> torch.Tensor:
+     """
+     Makes sure that the series length is divisible by the patch_stride
+     by adding left-padding to the id mask. It does this by repeating
+     the leftmost value of the id mask for each variate.
+     """
+     series_len = id_mask.shape[-1]
+     # left-pad the time series to make sure we can divide it into patches.
+     padded_length = int(np.ceil(series_len / patch_stride) * patch_stride)
+     padding_amount = padded_length - series_len
+     left_edge: torch.Tensor = id_mask[..., 0]
+     if id_mask.ndim == 2:  # variates series_len
+         # repeat the left edge of the id mask for padding_amount
+         padding = repeat(
+             left_edge,
+             "variates -> variates padding_amount",
+             padding_amount=padding_amount,
+         )
+         id_mask = torch.cat([padding, id_mask], dim=1)
+     elif id_mask.ndim == 3:  # batch variates series_len
+         # repeat the left edge of the id mask for padding_amount
+         padding = repeat(
+             left_edge,
+             "batch variates -> batch variates padding_amount",
+             padding_amount=padding_amount,
+         )
+         id_mask = torch.cat([padding, id_mask], dim=2)
+     else:
+         raise ValueError(f"Unsupported number of dimensions: {id_mask.ndim}")
+
+     return id_mask
+
+
+ class MaskedTimeseries(NamedTuple):
+     series: torch.Tensor
+     """
+     The time series data, of shape (batch_size, num_variates, sequence_length). The first
+     dimension is optional.
+     """
+
+     padding_mask: torch.Tensor
+     """
+     A mask that indicates which values are padding. If padding_mask[..., i] is True,
+     then series[..., i] is _NOT_ padding; i.e., it's a valid value in the time series.
+     Same shape as `series`.
+     """
+
+     id_mask: torch.Tensor
+     """
+     A mask that indicates the group ID of each variate. Any
+     variates with the same ID are considered to be part of the same multivariate
+     time series, and can attend to each other.
+
+     Note: the sequence_length dimension can be 1 if the IDs should
+     be broadcast across the time dimension.
+     """
+
+     timestamp_seconds: torch.Tensor
+     """
+     A POSIX timestamp in seconds for each time step in the series. Of same shape as
+     `series`.
+     """
+
+     time_interval_seconds: torch.Tensor
+     """
+     The time frequency of each variate in seconds. Of shape (batch_size, num_variates) with
+     the first dimension optional.
+     """
+
+     def to(self, device: torch.device) -> "MaskedTimeseries":
+         return MaskedTimeseries(
+             series=self.series.to(device),
+             padding_mask=self.padding_mask.to(device),
+             id_mask=self.id_mask.to(device),
+             timestamp_seconds=self.timestamp_seconds.to(device),
+             time_interval_seconds=self.time_interval_seconds.to(device),
+         )
+
+
+ def is_extreme_value(t: torch.Tensor) -> torch.Tensor:
+     if torch.is_floating_point(t):
+         max_value = torch.finfo(t.dtype).max
+     else:
+         max_value = torch.iinfo(t.dtype).max
+
+     return reduce(
+         torch.logical_or,
+         (
+             torch.isinf(t),
+             torch.isnan(t),
+             t.abs() >= max_value / 2,
+         ),
+     )
+
+
+ def replace_extreme_values(t: torch.Tensor, replacement: float = 0.0) -> torch.Tensor:
+     return torch.where(is_extreme_value(t), torch.tensor(replacement, dtype=t.dtype, device=t.device), t)
+
+
+ def freq_to_seconds(freq: Union[str, pd.offsets.BaseOffset]) -> float:
+     # Modified from: https://github.com/DataDog/toto/blob/846d599f4b8d377db3088d5cd1a736d050cef5ac/toto/inference/gluonts_predictor.py#L58
+     if isinstance(freq, str):
+         freq = pd.tseries.frequencies.to_offset(freq)
+     try:
+         # Use nanos for fixed frequencies
+         return freq.nanos / 1e9  # Convert nanoseconds to seconds
+     except ValueError:
+         # Handle non-fixed frequencies like Week
+         if isinstance(freq, pd.offsets.BusinessDay):
+             return freq.n * 24 * 60 * 60
+         elif isinstance(freq, pd.offsets.Week):
+             return freq.n * 7 * 24 * 60 * 60  # n weeks to seconds
+         elif isinstance(freq, pd.offsets.MonthBegin) or isinstance(freq, pd.offsets.MonthEnd):
+             return 30 * 24 * 60 * 60  # Approximate a month as 30 days
+         elif isinstance(freq, pd.offsets.QuarterEnd) or isinstance(freq, pd.offsets.QuarterBegin):
+             return 90 * 24 * 60 * 60  # Approximate a quarter as 90 days
+         elif isinstance(freq, pd.offsets.YearEnd) or isinstance(freq, pd.offsets.YearBegin):
+             return 365.25 * 24 * 60 * 60  # Approximate a year as 365.25 days
+         else:
+             raise ValueError(f"Cannot handle frequency of type {type(freq)}: {freq}")