dsipts-1.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dsipts might be problematic.

Files changed (81)
  1. dsipts/__init__.py +48 -0
  2. dsipts/data_management/__init__.py +0 -0
  3. dsipts/data_management/monash.py +338 -0
  4. dsipts/data_management/public_datasets.py +162 -0
  5. dsipts/data_structure/__init__.py +0 -0
  6. dsipts/data_structure/data_structure.py +1167 -0
  7. dsipts/data_structure/modifiers.py +213 -0
  8. dsipts/data_structure/utils.py +173 -0
  9. dsipts/models/Autoformer.py +199 -0
  10. dsipts/models/CrossFormer.py +152 -0
  11. dsipts/models/D3VAE.py +196 -0
  12. dsipts/models/Diffusion.py +818 -0
  13. dsipts/models/DilatedConv.py +342 -0
  14. dsipts/models/DilatedConvED.py +310 -0
  15. dsipts/models/Duet.py +197 -0
  16. dsipts/models/ITransformer.py +167 -0
  17. dsipts/models/Informer.py +180 -0
  18. dsipts/models/LinearTS.py +222 -0
  19. dsipts/models/PatchTST.py +181 -0
  20. dsipts/models/Persistent.py +44 -0
  21. dsipts/models/RNN.py +213 -0
  22. dsipts/models/Samformer.py +139 -0
  23. dsipts/models/TFT.py +269 -0
  24. dsipts/models/TIDE.py +296 -0
  25. dsipts/models/TTM.py +252 -0
  26. dsipts/models/TimeXER.py +184 -0
  27. dsipts/models/VQVAEA.py +299 -0
  28. dsipts/models/VVA.py +247 -0
  29. dsipts/models/__init__.py +0 -0
  30. dsipts/models/autoformer/__init__.py +0 -0
  31. dsipts/models/autoformer/layers.py +352 -0
  32. dsipts/models/base.py +439 -0
  33. dsipts/models/base_v2.py +444 -0
  34. dsipts/models/crossformer/__init__.py +0 -0
  35. dsipts/models/crossformer/attn.py +118 -0
  36. dsipts/models/crossformer/cross_decoder.py +77 -0
  37. dsipts/models/crossformer/cross_embed.py +18 -0
  38. dsipts/models/crossformer/cross_encoder.py +99 -0
  39. dsipts/models/d3vae/__init__.py +0 -0
  40. dsipts/models/d3vae/diffusion_process.py +169 -0
  41. dsipts/models/d3vae/embedding.py +108 -0
  42. dsipts/models/d3vae/encoder.py +326 -0
  43. dsipts/models/d3vae/model.py +211 -0
  44. dsipts/models/d3vae/neural_operations.py +314 -0
  45. dsipts/models/d3vae/resnet.py +153 -0
  46. dsipts/models/d3vae/utils.py +630 -0
  47. dsipts/models/duet/__init__.py +0 -0
  48. dsipts/models/duet/layers.py +438 -0
  49. dsipts/models/duet/masked.py +202 -0
  50. dsipts/models/informer/__init__.py +0 -0
  51. dsipts/models/informer/attn.py +185 -0
  52. dsipts/models/informer/decoder.py +50 -0
  53. dsipts/models/informer/embed.py +125 -0
  54. dsipts/models/informer/encoder.py +100 -0
  55. dsipts/models/itransformer/Embed.py +142 -0
  56. dsipts/models/itransformer/SelfAttention_Family.py +355 -0
  57. dsipts/models/itransformer/Transformer_EncDec.py +134 -0
  58. dsipts/models/itransformer/__init__.py +0 -0
  59. dsipts/models/patchtst/__init__.py +0 -0
  60. dsipts/models/patchtst/layers.py +569 -0
  61. dsipts/models/samformer/__init__.py +0 -0
  62. dsipts/models/samformer/utils.py +154 -0
  63. dsipts/models/tft/__init__.py +0 -0
  64. dsipts/models/tft/sub_nn.py +234 -0
  65. dsipts/models/timexer/Layers.py +127 -0
  66. dsipts/models/timexer/__init__.py +0 -0
  67. dsipts/models/ttm/__init__.py +0 -0
  68. dsipts/models/ttm/configuration_tinytimemixer.py +307 -0
  69. dsipts/models/ttm/consts.py +16 -0
  70. dsipts/models/ttm/modeling_tinytimemixer.py +2099 -0
  71. dsipts/models/ttm/utils.py +438 -0
  72. dsipts/models/utils.py +624 -0
  73. dsipts/models/vva/__init__.py +0 -0
  74. dsipts/models/vva/minigpt.py +83 -0
  75. dsipts/models/vva/vqvae.py +459 -0
  76. dsipts/models/xlstm/__init__.py +0 -0
  77. dsipts/models/xlstm/xLSTM.py +255 -0
  78. dsipts-1.1.5.dist-info/METADATA +31 -0
  79. dsipts-1.1.5.dist-info/RECORD +81 -0
  80. dsipts-1.1.5.dist-info/WHEEL +5 -0
  81. dsipts-1.1.5.dist-info/top_level.txt +1 -0
dsipts/models/Diffusion.py
@@ -0,0 +1,818 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from .tft import sub_nn
5
+
6
+ try:
7
+ import lightning.pytorch as pl
8
+ from .base_v2 import Base
9
+ OLD_PL = False
10
+ except:
11
+ import pytorch_lightning as pl
12
+ OLD_PL = True
13
+ from .base import Base
14
+ from typing import List, Union
15
+ from ..data_structure.utils import beauty_string
16
+ from .utils import get_scope
17
+
18
+ class Diffusion(Base):
19
+ handle_multivariate = False
20
+ handle_future_covariates = True
21
+ handle_categorical_variables = True
22
+ handle_quantile_loss = False
23
+ description = get_scope(handle_multivariate,handle_future_covariates,handle_categorical_variables,handle_quantile_loss)
24
+ def __init__(self,
25
+ d_model: int,
26
+ out_channels: int,
27
+ past_steps: int,
28
+ future_steps: int,
29
+ past_channels: int,
30
+ future_channels: int,
31
+ embs: List[int],
32
+
33
+ learn_var:bool,
34
+ cosine_alpha: bool,
35
+ diffusion_steps: int,
36
+ beta: float,
37
+ gamma:float,
38
+
39
+ #for subnet
40
+ n_layers_RNN: int,
41
+ d_head: int,
42
+ n_head: int,
43
+ dropout_rate: float,
44
+ activation: str,
45
+ subnet:int,
46
+ perc_subnet_learning_for_step:float,
47
+
48
+ persistence_weight:float=0.0,
49
+ loss_type: str='l1',
50
+ quantiles:List[float]=[],
51
+ optim:Union[str,None]=None,
52
+ optim_config:Union[dict,None]=None,
53
+ scheduler_config:Union[dict,None]=None,
54
+ **kwargs)->None:
55
+ """Denoising Diffusion Probabilistic Model
56
+
57
+ Args:
58
+ d_model (int): hidden dimension of the model
59
+ out_channels (int): number of target variables
60
+ past_steps (int): size of past window
61
+ future_steps (int): size of future window to be predicted
62
+ past_channels (int): number of variables available for the past context
63
+ future_channels (int): number of variables known in the future, available for forecasting
64
+ embs (list[int]): categorical variables dimensions for embeddings
65
+ learn_var (bool): if True the model learns the posterior variance; if False the closed-form posterior variance is used
66
+ cosine_alpha (bool): Flag for the generation of alphas and betas
67
+ diffusion_steps (int): number of noising steps for the initial sample
68
+ beta (float): starting variable to generate the diffusion perturbations. Ignored if cosine_alpha == True
69
+ gamma (float): trade-off weight balancing the noise-prediction loss against the negative log-likelihood / KL-divergence term.
70
+ n_layers_RNN (int): param for subnet
71
+ d_head (int): param for subnet
72
+ n_head (int): param for subnet
73
+ dropout_rate (float): param for subnet
74
+ activation (str): param for subnet
75
+ subnet (int): =1 for attention subnet, =2 for linear subnet, =3 for RNN subnet. Others can be added.
76
+ perc_subnet_learning_for_step (float): fraction of the subnets trained on each batch. Decrease this value if the loss blows up.
77
+ persistence_weight (float, optional): Defaults to 0.0.
78
+ loss_type (str, optional): Defaults to 'l1'.
79
+ quantiles (List[float], optional): Only [] accepted. Defaults to [].
80
+ optim (Union[str,None], optional): Defaults to None.
81
+ optim_config (Union[dict,None], optional): Defaults to None.
82
+ scheduler_config (Union[dict,None], optional): Defaults to None.
83
+ """
84
+
85
+ super().__init__(**kwargs)
86
+ self.save_hyperparameters(logger=False)
87
+
88
+ self.dropout = dropout_rate
89
+ self.persistence_weight = persistence_weight
90
+ self.loss_type = loss_type
91
+ self.optim = optim
92
+ self.optim_config = optim_config
93
+ self.scheduler_config = scheduler_config
94
+
95
+ #* HANDLING LOSSES
96
+ # With respect to other models, here quantiles are not used
97
+ # Here we define the loss used for noise predicted and actual noise_loss
98
+ # Losses for distribution are defined as functions below.
99
+
100
+ # trade off for noise loss and distribution loss
101
+ self.gamma = gamma
102
+ ## can not handle quantile
103
+ if len(quantiles)>0:
104
+ quantiles = []
105
+ assert len(quantiles) ==0
106
+ self.mul = 1
107
+ self.use_quantiles = False
108
+ if self.loss_type == 'mse':
109
+ self.loss = nn.MSELoss()
110
+ else:
111
+ self.loss = nn.L1Loss()
112
+
113
+ #* >>>>>>>>>>>>> canonical data parameters
114
+ # dimension of the model, number of variables and sequence length info
115
+ self.d_model = d_model
116
+ self.past_steps = past_steps
117
+ self.future_steps = future_steps
118
+ self.past_channels = past_channels
119
+ self.future_channels = future_channels
120
+ self.output_channels = out_channels
121
+
122
+ #* >>>>>>>>>>>>> specific model parameters
123
+ # if we want to learn also the variance, instead of using the standard posterior variance of Diffusion NN
124
+ self.learn_var = learn_var
125
+
126
+ # number of noising steps
127
+ self.T = diffusion_steps
128
+
129
+ # sampling weights over the diffusion steps, to avoid under-trained subnets
130
+ self.multinomial_step_weights = np.ones(diffusion_steps)
131
+
132
+ # % of all subnets trained every batch of every epoch
133
+ # this percentage is controlled by the parameter 'perc_subnet_learning_for_step':
134
+ # - decrease or increase according to the efficiency of your machine
135
+ self.simultaneous_steps = max(int(diffusion_steps*perc_subnet_learning_for_step), 1)
136
+
137
+
138
+ #* >>>>>>>>>>>>> specific diffusion setup
139
+ self.s = (100*self.T)**(-1) # offset variable to avoid problems with computations near 0
140
+ # value found by trial and error
141
+
142
+ # betas and cumulative products of alphas are the main values for the diffusion model, both in (0,1) at each step t:
143
+ # - betas_t -> variance added at t-th step
144
+ # - alphas_t = 1 - betas_t
145
+ # - alphas_cumprod_t -> accumulation of alphas up to step t.
146
+ # - - It can be considered as the remaining signal of the starting input at t-th step!!
147
+
148
+ # according to the flag below we can choose how to generate them!
149
+ if cosine_alpha:
150
+ # COSINE ALPHA Computation
151
+ # aux_perc = 0.05
152
+ # avoid_comp_err_norm = self.T*(1+aux_perc) # enlarging self.T to avoid errors in computations using cos^2
153
+ # the t-th cumulative product of alphas is the 'forgetting' schedule of the initial sample after t diffusion steps
154
+ # in this procedure we use the function below to produce all the cumulative products of alphas
155
+ f_cos_t = [(np.cos( (t/self.T +self.s)/(1+self.s) * np.pi/2 ))**2 for t in range(self.T)]
156
+
157
+ self.alphas_cumprod = np.append(1-self.s, f_cos_t[1:]/f_cos_t[0]) # computed as scaled cumulative product of alphas f_cos_t[1:]/f_cos_t[0]
158
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) # auxiliar vector to use the same index to access alpha_cumprod_t and alpha_cumprod_{t-1}
159
+ self.alphas = self.alphas_cumprod * (self.alphas_cumprod_prev)**(-1)
160
+ self.betas = 1 - self.alphas
161
+
162
+ else:
163
+ # STANDARD ALPHA Computation
164
+ # beta is considered constant in [0,1) for all time steps. Good values near 0.03
165
+ # Unlike before, here we generate all needed values starting from betas
166
+ self.betas = np.array([beta]*self.T)
167
+ self.alphas = 1 - self.betas
168
+ self.alphas_cumprod = np.cumprod(self.alphas)
169
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) # auxiliar vector to use the same index to access alpha_cumprod_t and alpha_cumprod_{t-1}
170
+
171
+ # values for the posterior distribution, i.e. the target distribution of each subnet
172
+ # All these values will be cast to tensors during computations using the function _extract_into_tensor
173
+ self.posterior_mean_coef1 = self.betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
174
+ self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(self.alphas) / (1.0 - self.alphas_cumprod)
175
+ self.posterior_variance = np.append(self.s, self.betas[1:] * (1.0 - self.alphas_cumprod_prev[1:]) / (1.0 - self.alphas_cumprod[1:]))
176
+ self.posterior_log_variance = np.log(self.posterior_variance)
177
+
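As a standalone illustration of the schedule bookkeeping above (not part of the package; NumPy only, with an arbitrary constant beta and a tiny T chosen for readability), the derived quantities can be reproduced as:

```python
import numpy as np

T, beta, s = 5, 0.03, 1 / (100 * 5)                 # tiny, hypothetical values
betas = np.array([beta] * T)
alphas = 1 - betas
alphas_cumprod = np.cumprod(alphas)                  # remaining signal of x_0 after t steps
alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])

# posterior q(x_{t-1} | x_t, x_0) coefficients, mirroring the constructor above
coef1 = betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
coef2 = (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)
post_var = np.append(s, betas[1:] * (1.0 - alphas_cumprod_prev[1:]) / (1.0 - alphas_cumprod[1:]))
print(alphas_cumprod, post_var)
```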
178
+ #* >>>>>>>>>>>>> LAYERS
179
+ # for other numerical variables in the past
180
+ self.aux_past_channels = past_channels - out_channels
181
+ self.linear_aux_past = nn.ModuleList([nn.Linear(1, d_model) for _ in range(self.aux_past_channels)])
182
+
183
+ # for numerical variables in the future
184
+ self.aux_fut_channels = future_channels
185
+ self.linear_aux_fut = nn.ModuleList([nn.Linear(1, d_model) for _ in range(self.aux_fut_channels)])
186
+
187
+ # embedding categorical for both past and future (ASSUMING BOTH AVAILABLE OR NEITHER)
188
+ self.seq_len = past_steps + future_steps
189
+ self.emb_cat_var = sub_nn.embedding_cat_variables(self.seq_len, future_steps, d_model, embs, self.device)
190
+
191
+ # diffusion sub nets, one subnet for each step
192
+ if subnet == 1:
193
+ self.sub_nets = nn.ModuleList([
194
+ SubNet1(self.aux_past_channels, self.aux_fut_channels, learn_var, out_channels, d_model, d_head, n_head, activation, dropout_rate) for _ in range(diffusion_steps)
195
+ ])
196
+ elif subnet == 2:
197
+ self.sub_nets = nn.ModuleList([
198
+ SubNet2(self.aux_past_channels, self.aux_fut_channels, learn_var, past_steps, future_steps, out_channels, d_model, activation, dropout_rate) for _ in range(diffusion_steps)
199
+ ])
200
+ elif subnet ==3 :
201
+ aux_num_available = self.aux_past_channels>0 or self.aux_fut_channels>0 # if we have numerical vars, use it
202
+ self.sub_nets = nn.ModuleList([
203
+ SubNet3(learn_var, aux_num_available, out_channels, d_model, future_steps, n_layers_RNN, d_head, n_head, dropout_rate) for _ in range(diffusion_steps)
204
+ ])
205
+ else:
206
+ raise ValueError("Wrong number for Subnet. Not yet implemented!")
207
+
208
+ def forward(self, batch:dict)-> torch.Tensor:
209
+ """training process of the diffusion network
210
+
211
+ Args:
212
+ batch (dict): variables loaded
213
+
214
+ Returns:
215
+ torch.Tensor: total loss over the noise predictions of the sampled subnets
216
+ """
217
+
218
+ # LOADING TARGET VARIABLES
219
+ y_to_be_pred = batch['y'].to(self.device)
220
+
221
+ # LOADING AUTOREGRESSIVE CONTEXT OF TARGET VARIABLES
222
+ num_past = batch['x_num_past'].to(self.device)
223
+ idx_target = batch['idx_target'][0]
224
+ y_past = num_past[:,:,idx_target]
225
+
226
+ # LOADING EMBEDDING CATEGORICAL VARIABLES
227
+ emb_cat_past, emb_cat_fut = self.cat_categorical_vars(batch)
228
+ emb_cat_past = torch.mean(emb_cat_past, dim = 2)
229
+ emb_cat_fut = torch.mean(emb_cat_fut, dim = 2)
230
+
231
+ ### LOADING PAST AND FUTURE NUMERICAL VARIABLES
232
+ # load the auxiliary numerical variables into the model
233
+
234
+ if self.aux_past_channels>0: # if we have more numerical variables about past
235
+ aux_num_past = self.remove_var(num_past, idx_target, 2) # remove the autoregressive variable
236
+ assert self.aux_past_channels == aux_num_past.size(2), beauty_string(f"{self.aux_past_channels} LAYERS FOR PAST VARS AND {aux_num_past.size(2)} VARS",'section',True) # to check if we are using the expected number of variables about past
237
+ # concat all embedded vars and mean of them
238
+ aux_emb_num_past = torch.Tensor().to(self.device)
239
+ for i, layer in enumerate(self.linear_aux_past):
240
+ aux_emb_past = layer(aux_num_past[:,:,[i]]).unsqueeze(2)
241
+ aux_emb_num_past = torch.cat((aux_emb_num_past, aux_emb_past), dim=2)
242
+ aux_emb_num_past = torch.mean(aux_emb_num_past, dim = 2)
243
+ else:
244
+ aux_emb_num_past = None # non available vars
245
+
246
+ if self.aux_fut_channels>0: # if we have more numerical variables about future
247
+ # AUX means AUXILIARY variables
248
+ aux_num_fut = batch['x_num_future'].to(self.device)
249
+ assert self.aux_fut_channels == aux_num_fut.size(2), beauty_string(f"{self.aux_fut_channels} LAYERS FOR FUTURE VARS AND {aux_num_fut.size(2)} VARS",'section',True) # to check if we are using the expected number of variables about the future
250
+ # concat all embedded vars and mean of them
251
+ aux_emb_num_fut = torch.Tensor().to(self.device)
252
+ for j, layer in enumerate(self.linear_aux_fut):
253
+ aux_emb_fut = layer(aux_num_fut[:,:,[j]]).unsqueeze(2)
254
+ aux_emb_num_fut = torch.cat((aux_emb_num_fut, aux_emb_fut), dim=2)
255
+ aux_emb_num_fut = torch.mean(aux_emb_num_fut, dim = 2)
256
+ else:
257
+ aux_emb_num_fut = None # non available vars
258
+
259
+ ### actual DIFFUSION process ----------------------------------------------
260
+
261
+ ##* CHOOSE THE t SUBNET
262
+ # We have T subnets: [0, 1, ..., T-1].
263
+ values = list(range(self.T))
264
+
265
+ ## Probabilistic way to choose the subnet properly
266
+ # avoid step_weights exploding as training proceeds
267
+ self.improving_weight_during_training()
268
+ # normalizing weights for a more stable subnet training
269
+ t_wei = self.multinomial_step_weights/np.sum(self.multinomial_step_weights)
270
+ # extract times t
271
+ drawn_t = np.random.choice(values, size=self.simultaneous_steps, replace=False, p=t_wei) # type: ignore
272
+
273
+ if 0 not in drawn_t: drawn_t = np.append(drawn_t, 0)
274
+ # update weights
275
+ non_draw_val = np.delete(values, drawn_t) # type: ignore
276
+ self.multinomial_step_weights[non_draw_val] += 1 # increase weights of non-extracted subnet
277
+
278
+ # init negative loss for the first step
279
+ tot_loss = -1
280
+ for t in drawn_t:
281
+ # LOADING THE SUBNET
282
+ sub_net = self.sub_nets[t]
283
+
284
+ # Get y and noise it
285
+ y_noised, true_mean, true_log_var, actual_noise = self.q_sample(y_to_be_pred, t)
286
+
287
+ # compute the output from that network using the sample with noises
288
+ # output composed of: noise predicted and, if learn_var=True, vector for variances
289
+ if self.learn_var:
290
+ #predict the noise!
291
+ eps_pred, var_aux_out = sub_net(y_noised, y_past, emb_cat_past, emb_cat_fut, aux_emb_num_past, aux_emb_num_fut)
292
+
293
+ # compute posterior variance of NN (using interpolation)
294
+ pre_var_t = self._extract_into_tensor(self.betas, t, eps_pred.shape)
295
+ post_var_t = self._extract_into_tensor(self.posterior_variance, t, eps_pred.shape)
296
+ post_sigma = torch.exp( var_aux_out*torch.log(pre_var_t) + (1-var_aux_out)*torch.log(post_var_t) ) # variance, not log_var
297
+ else:
298
+ eps_pred = sub_net(y_noised, y_past, emb_cat_past, emb_cat_fut, aux_emb_num_past, aux_emb_num_fut)
299
+ post_sigma = self._extract_into_tensor(self.posterior_variance, t, eps_pred.shape)
300
+
301
+ # posterior mean assuming the predicted noise is the actual one
302
+ out_mean = self._extract_into_tensor(np.sqrt(1/self.alphas), t, eps_pred.shape) * ( y_noised - self._extract_into_tensor(self.betas/np.sqrt(1-self.alphas_cumprod), t, eps_pred.shape) * eps_pred )
303
+
304
+ # # At the first timestep return the negative likelihood,
305
+ if t==0:
306
+ # post_var = self._extract_into_tensor(self.posterior_variance, t, y_to_be_pred.shape)
307
+ neg_likelihoods = -torch.log(self.gaussian_likelihood(y_to_be_pred, out_mean, post_sigma)) #! (values to be predicted, mean of values predicted, variance)
308
+ distribution_loss = torch.mean(neg_likelihoods)
309
+
310
+ # # otherwise return KL( q(x_{t-1}|x_t, x_0) || p(x_{t-1}|x_t) )
311
+ else:
312
+ # COMPUTE LOSS between TRUE eps and DRAWN eps_pred
313
+ kl_divergence = self.normal_kl(true_mean, true_log_var, out_mean, torch.log(post_sigma)) # (true mean, true log var, mean of values predicted, log var predicted)
314
+ distribution_loss = torch.mean(kl_divergence)
315
+
316
+ # always compute the loss about the straight prediction of the noise
317
+ noise_loss = self.loss(eps_pred, actual_noise)
318
+
319
+ # if tot_loss == -1:
320
+ # beauty_string(f'NOISE LOSS: {noise_loss.item()}','info',True)
321
+ # beauty_string(f'ACTUAL NOISE: {actual_noise[0].min()}, {actual_noise[0].max()}, {actual_noise[0].mean()}, {actual_noise[0].var()}','info',True)
322
+ # beauty_string(f'PREDICTED NOISE: {eps_pred[0].min()}, {eps_pred[0].max()}, {eps_pred[0].mean()}, {eps_pred[0].var()}','info',True)
323
+
324
+ noise_loss += self.gamma*distribution_loss # add, scaled according to gamma, the distribution_loss
325
+
326
+ # update the total loss
327
+ if tot_loss==-1:
328
+ tot_loss = noise_loss
329
+ else:
330
+ tot_loss += noise_loss
331
+ return tot_loss
332
+
333
+ def training_step(self, batch, batch_idx):
334
+ # the training loss is already computed in the forward method
335
+ loss_eps = self(batch)
336
+ return loss_eps
337
+
338
+ def inference(self, batch:dict) -> torch.Tensor:
339
+ """Inference process to forecast future y
340
+
341
+ Args:
342
+ batch (dict): Keys checked ['x_num_past, 'idx_target', 'x_num_future', 'x_cat_past', 'x_cat_future']
343
+
344
+ Returns:
345
+ torch.Tensor: generated sequence [batch_size, future_steps, num_var]
346
+ """
347
+ # LOADING AUTOREGRESSIVE CONTEXT OF TARGET VARIABLES
348
+ num_past = batch['x_num_past'].to(self.device)
349
+ batch_size = num_past.shape[0]
350
+ idx_target = batch['idx_target'][0]
351
+ y_past = num_past[:,:,idx_target]
352
+
353
+ # LOADING EMBEDDING CATEGORICAL VARIABLES
354
+ emb_cat_past, emb_cat_fut = self.cat_categorical_vars(batch)
355
+ emb_cat_past = torch.mean(emb_cat_past, dim = 2)
356
+ emb_cat_fut = torch.mean(emb_cat_fut, dim = 2)
357
+
358
+ ### LOADING PAST AND FUTURE NUMERICAL VARIABLES
359
+ # this check is done simultaneously
360
+ # because the model uses auxiliary numerical variables
361
+ # only if they are available both in the past and in the future
362
+
363
+ ### LOADING PAST AND FUTURE NUMERICAL VARIABLES
364
+ # load the auxiliary numerical variables into the model
365
+
366
+ if self.aux_past_channels>0: # if we have more numerical variables about past
367
+ aux_num_past = self.remove_var(num_past, idx_target, 2) # remove the autoregressive variable
368
+ assert self.aux_past_channels == aux_num_past.size(2), beauty_string(f"{self.aux_past_channels} LAYERS FOR PAST VARS AND {aux_num_past.size(2)} VARS",'section',True) # to check if we are using the expected number of variables about past
369
+ # concat all embedded vars and mean of them
370
+ aux_emb_num_past = torch.Tensor().to(self.device)
371
+ for i, layer in enumerate(self.linear_aux_past):
372
+ aux_emb_past = layer(aux_num_past[:,:,[i]]).unsqueeze(2)
373
+ aux_emb_num_past = torch.cat((aux_emb_num_past, aux_emb_past), dim=2)
374
+ aux_emb_num_past = torch.mean(aux_emb_num_past, dim = 2)
375
+ else:
376
+ aux_emb_num_past = None # non available vars
377
+
378
+ if self.aux_fut_channels>0: # if we have more numerical variables about future
379
+ # AUX means AUXILIARY variables
380
+ aux_num_fut = batch['x_num_future'].to(self.device)
381
+ assert self.aux_fut_channels == aux_num_fut.size(2), beauty_string(f"{self.aux_fut_channels} LAYERS FOR FUTURE VARS AND {aux_num_fut.size(2)} VARS",'section',True) # to check if we are using the expected number of variables about the future
382
+ # concat all embedded vars and mean of them
383
+ aux_emb_num_fut = torch.Tensor().to(self.device)
384
+ for j, layer in enumerate(self.linear_aux_fut):
385
+ aux_emb_fut = layer(aux_num_fut[:,:,[j]]).unsqueeze(2)
386
+ aux_emb_num_fut = torch.cat((aux_emb_num_fut, aux_emb_fut), dim=2)
387
+ aux_emb_num_fut = torch.mean(aux_emb_num_fut, dim = 2)
388
+ else:
389
+ aux_emb_num_fut = None # non available vars
390
+
391
+ # DIFFUSION INFERENCE
392
+ # import pdb; pdb.set_trace() # can use also torch.normal(0, 1, size=y_noised.shape)
393
+ y_noised = torch.randn((batch_size, self.future_steps, self.output_channels)).to(self.device)
394
+ # pass the white noise in sub nets
395
+ for t in range(self.T-1, -1, -1): # reverse loop over all subnets, from t = T-1 down to t = 0
396
+ sub_net = self.sub_nets[t] # load the subnet
397
+
398
+ ## CHECK THE NUMBER OF PARAMS
399
+ # model_parameters = filter(lambda p: p.requires_grad, model.parameters())
400
+ # params = sum([np.prod(p.size()) for p in model_parameters]) -> 13K
401
+ if self.learn_var:
402
+ eps_pred, var_aux_out = sub_net(y_noised, y_past, emb_cat_past, emb_cat_fut, aux_emb_num_past, aux_emb_num_fut)
403
+ # interpolation of the variance
404
+ pre_var_t = self._extract_into_tensor(self.betas, t, eps_pred.shape)
405
+ post_var_t = self._extract_into_tensor(self.posterior_variance, t, eps_pred.shape)
406
+ post_sigma = torch.exp(var_aux_out*torch.log(pre_var_t) + (1-var_aux_out)*torch.log(post_var_t))
407
+ else:
408
+ eps_pred = sub_net(y_noised, y_past, emb_cat_past, emb_cat_fut, aux_emb_num_past, aux_emb_num_fut)
409
+ post_sigma = self._extract_into_tensor(self.posterior_variance, t, eps_pred.shape)
410
+
411
+ # Sample x_{t-1} from the model at the given timestep.
412
+ # y_noised = self._extract_into_tensor(1/np.sqrt(self.alphas), t, y_noised.shape)*( y_noised - self._extract_into_tensor(np.sqrt(self.betas), t, eps_pred.shape)*eps_pred )
413
+ y_noised = 1/torch.sqrt(1-post_sigma)*(y_noised - torch.sqrt(post_sigma)*eps_pred)
414
+
415
+ # if t>0 :
416
+ # noise = torch.rand_like(y_noised).to(self.device)
417
+ # y_noised = y_noised + torch.sqrt(post_sigma)*noise
418
+
419
+ out = y_noised.view(-1, self.future_steps, self.output_channels, 1)
420
+ return out
421
+
422
+ # for validation extract the output from the self.inference method
423
+ def validation_step(self, batch, batch_idx):
424
+ out = self.inference(batch)
425
+ loss = self.compute_loss(batch,out)
426
+ return loss
427
+
428
+ # function to concat embedded categorical variables
429
+ def cat_categorical_vars(self, batch:dict):
430
+ """Extracting categorical context about past and future
431
+
432
+ Args:
433
+ batch (dict): Keys checked -> ['x_cat_past', 'x_cat_future']
434
+
435
+ Returns:
436
+ List[torch.Tensor, torch.Tensor]: cat_emb_past, cat_emb_fut
437
+ """
438
+ # GET AVAILABLE CATEGORICAL CONTEXT
439
+ if 'x_cat_past' in batch.keys() and 'x_cat_future' in batch.keys(): # if we have both
440
+ # HERE WE ASSUME SAME NUMBER AND KIND OF VARIABLES IN PAST AND FUTURE
441
+ cat_past = batch['x_cat_past'].to(self.device)
442
+ cat_fut = batch['x_cat_future'].to(self.device)
443
+ cat_full = torch.cat((cat_past, cat_fut), dim = 1)
444
+ # EMB CATEGORICAL VARIABLES AND THEN SPLIT IN PAST AND FUTURE
445
+ emb_cat_full = self.emb_cat_var(cat_full,self.device)
446
+ else:
447
+ emb_cat_full = self.emb_cat_var(batch['x_num_past'].shape[0],self.device)
448
+
449
+ # split past and future categorical embedded variables
454
+ cat_emb_past = emb_cat_full[:,:self.past_steps,:,:]
455
+ cat_emb_fut = emb_cat_full[:,-self.future_steps:,:,:]
456
+
457
+ return cat_emb_past, cat_emb_fut
458
+
459
+ #function to extract from batch['x_num_past'] all variables except the one autoregressive
460
+ def remove_var(self, tensor: torch.Tensor, indexes_to_exclude: list, dimension: int)-> torch.Tensor:
461
+ """Function to remove variables from tensors in chosen dimension and position
462
+
463
+ Args:
464
+ tensor (torch.Tensor): starting tensor
465
+ indexes_to_exclude (list): indices along the chosen dimension that we want to exclude
466
+ dimension (int): dimension of the tensor along which we want to work (a single dimension, not a list of dims)
467
+
468
+ Returns:
469
+ torch.Tensor: new tensor without the chosen variables
470
+ """
471
+
472
+ remaining_idx = torch.tensor([i for i in range(tensor.size(dimension)) if i not in indexes_to_exclude]).to(tensor.device)
473
+ # Select the desired sub-tensor
474
+ extracted_subtensors = torch.index_select(tensor, dim=dimension, index=remaining_idx)
475
+
476
+ return extracted_subtensors
477
+
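For reference, a minimal standalone example of the index-selection trick used in remove_var (hypothetical shapes, not part of the package):

```python
import torch

x = torch.randn(2, 5, 4)                                      # [batch, steps, channels]
exclude = [1]                                                  # e.g. the autoregressive target channel
keep = torch.tensor([i for i in range(x.size(2)) if i not in exclude])
print(torch.index_select(x, dim=2, index=keep).shape)          # torch.Size([2, 5, 3])
```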
478
+ def improving_weight_during_training(self):
479
+ """
480
+ Each time we sample from the multinomial we subtract the minimum weight, keeping the sampling balanced
481
+ and avoiding large training differences among subnets.
482
+
483
+ This leads to more stable inference even in early training, mainly for the shared context embeddings.
484
+
485
+ For the sampling to remain a valid probability distribution the weights have to stay > 0, so we subtract (min - 1)
486
+ """
487
+ self.multinomial_step_weights -= (self.multinomial_step_weights.min()-1)
488
+ return
489
+
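A small self-contained sketch of the step-sampling scheme used in forward together with this reweighting (NumPy only; T and the number of steps per batch are hypothetical):

```python
import numpy as np

T, k = 10, 3                                   # diffusion steps, subnets trained per batch
w = np.ones(T)
for _ in range(5):                             # a few simulated batches
    w -= w.min() - 1                           # keep all weights >= 1, as in the method above
    p = w / w.sum()
    drawn = np.random.choice(T, size=k, replace=False, p=p)
    w[np.delete(np.arange(T), drawn)] += 1     # boost the steps that were not drawn
print(w)
```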
490
+ ### >>>>>>>>>>>>> AUXILIARY MODEL FUNCS
491
+ def q_sample(self, x_start: torch.Tensor, t: int)-> List[torch.Tensor]:
492
+ """Diffuse x_start for t diffusion steps.
493
+
494
+ In other words, sample from q(x_t | x_0).
495
+
496
+ Also, compute the mean and variance of the diffusion posterior:
497
+
498
+ q(x_{t-1} | x_t, x_0)
499
+
500
+ Posterior mean and variance are the ones to be predicted
501
+
502
+ Args:
503
+ x_start (torch.Tensor): values to be predicted
504
+ t (int): diffusion step
505
+
506
+ Returns:
507
+ List[torch.Tensor]: q_sample, posterior mean, posterior log variance, and the actual noise
508
+ """
509
+ # noise from normal distribution
510
+ noise = torch.randn_like(x_start)
511
+
512
+ # direct diffusion at t-th step
513
+ q_sample = self._extract_into_tensor(np.sqrt(self.alphas_cumprod), t, x_start.shape) * x_start + self._extract_into_tensor(np.sqrt(1 - self.alphas_cumprod), t, x_start.shape) * noise
514
+
515
+ # compute mean and variance
516
+ q_mean = self._extract_into_tensor(self.posterior_mean_coef1, t, q_sample.shape) * x_start + self._extract_into_tensor(self.posterior_mean_coef2, t, q_sample.shape) * q_sample
517
+ q_log_var = self._extract_into_tensor( self.posterior_log_variance, t, q_sample.shape )
518
+
519
+ # return, the sample, its posterior mean and log_variance, the noise used
520
+ return [q_sample, q_mean, q_log_var, noise]
521
+
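A quick standalone check of the mixing rule implemented by q_sample, x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps (NumPy only, with an arbitrary alpha_bar_t):

```python
import numpy as np

abar_t = 0.6                                   # hypothetical cumulative alpha at step t
x0 = np.random.randn(100_000)                  # unit-variance "clean" samples
eps = np.random.randn(100_000)
xt = np.sqrt(abar_t) * x0 + np.sqrt(1 - abar_t) * eps
print(xt.var())                                # ~ abar_t * 1 + (1 - abar_t) = 1.0
```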
522
+ def normal_kl(self, mean1, logvar1, mean2, logvar2):
523
+ """
524
+ Compute the KL divergence between two gaussians. Also called relative entropy.
525
+ KL divergence of P from Q is the expected excess surprise from using Q as a model when the actual distribution is P.
526
+ KL(P||Q) = P*log(P/Q) or -P*log(Q/P)
527
+
528
+ # In the context of machine learning, KL(P||Q) is often called the 'information gain'
529
+ # achieved if P would be used instead of Q which is currently used.
530
+
531
+ Shapes are automatically broadcasted, so batches can be compared to
532
+ scalars, among other use cases.
533
+ """
534
+ # -1/2 + log(sigma2/sigma1) + sigma1^2/2sigma2^2 + (mu1-mu2)^2/2sigma2^2
535
+ return 0.5 * (
536
+ -1.0
537
+ + logvar2 - logvar1
538
+ + torch.exp(logvar1 - logvar2)
539
+ + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
540
+ )
541
+
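A standalone sanity check of the closed form above (not part of the package): the KL of a Gaussian with itself is 0, and for N(0,1) vs N(1,1) it is 0.5.

```python
import torch

def normal_kl_ref(mean1, logvar1, mean2, logvar2):
    # same expression as Diffusion.normal_kl above
    return 0.5 * (-1.0 + logvar2 - logvar1
                  + torch.exp(logvar1 - logvar2)
                  + (mean1 - mean2) ** 2 * torch.exp(-logvar2))

z = torch.tensor(0.0)
print(normal_kl_ref(z, z, z, z))                      # tensor(0.)
print(normal_kl_ref(z, z, torch.tensor(1.0), z))      # tensor(0.5000)
```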
542
+ def gaussian_likelihood(self, x, mean, var):
543
+ term1 = 1.0 / torch.sqrt(2 * np.pi * var)
544
+ term2 = torch.exp(-0.5 * ((x - mean)**2 / var))
545
+ likelihood = term1 * term2
546
+ return likelihood
547
+
548
+ def gaussian_log_likelihood(self, x, mean, var):
549
+ term1 = -0.5 * ((x - mean) / torch.sqrt(var))**2
550
+ term2 = -0.5 * torch.log(2 * torch.tensor(np.pi) * var)
551
+ log_likelihood = term1 + term2
552
+ return log_likelihood
553
+
554
+ def _extract_into_tensor(self, arr, timesteps, broadcast_shape):
555
+ """
556
+ Extract values from a 1-D numpy array for a batch of indices.
557
+
558
+ :param arr: the 1-D numpy array.
559
+ :param timesteps: a tensor of indices into the array to extract.
560
+ :param broadcast_shape: a larger shape of K dimensions with the batch
561
+ dimension equal to the length of timesteps.
562
+ :return: a tensor of shape 'broadcast_shape' where the shape has K dims.
563
+ """
564
+ ten = torch.tensor(arr[timesteps])
565
+ return ten.expand(broadcast_shape).to(self.device)
566
+
567
+ ### >>>>>>>>>>>>> SUB NET
568
+ class SubNet1(nn.Module):
569
+ def __init__(self, aux_past_ch, aux_fut_ch, learn_var:bool, output_channel:int, d_model:int, d_head:int, n_head:int, activation:str, dropout_rate:float) -> None:
570
+ """ -> SUBNET of the DIFFUSION MODEL (DDPM)
571
+
572
+ The noised target 'y_noised' and the past target 'y_past' are first embedded with linear layers. This is always possible!
573
+ The past embedding is concatenated with the past categorical embeddings (and, when available, the past auxiliary numerical ones) and passed through an MLP;
574
+
575
+ the future side does the same with the embedded 'y_noised', the future categorical and, when available, the future auxiliary numerical embeddings.
576
+ An ATTENTION Network then compares the future representation with the past one to refine the embedded prediction of the noise.
577
+
578
+ Auxiliary numerical variables, when present, simply enter the two concatenations above,
579
+ so the subnet keeps working when they are missing.
580
+
581
+ The goal is to always ensure a valid computation of 'eps', refining it when enough context data is available.
582
+ The attention uses { Q = fut_seq, K = past_seq, V = embedded y_past }, exploiting as much context as possible for better updates.
583
+
584
+ Args:
585
+ aux_past_ch (int): number of auxiliary numerical variables available in the past
586
+ aux_fut_ch (int): number of auxiliary numerical variables known in the future
587
+ learn_var (bool): set if the network has to learn the variance of each step
588
+ output_channel (int): number of variables to be predicted
589
+ d_model (int): hidden dimension of the model
590
+ d_head (int): hidden dimension of each attention head
591
+ n_head (int): number of attention heads
592
+ activation (str): name of the activation class, evaluated with eval (e.g. 'nn.GELU')
+ dropout_rate (float): dropout rate
593
+ """
594
+ super().__init__()
595
+ self.aux_past_channels = aux_past_ch
596
+ self.aux_fut_channels = aux_fut_ch
597
+
598
+ self.learn_var = learn_var
599
+ activation_fun = eval(activation)
600
+
601
+ self.y_noised_linear = nn.Linear(output_channel, d_model)
602
+ self.y_past_linear = nn.Linear(output_channel, d_model)
603
+
604
+ self.past_sequential = nn.Sequential(
605
+ nn.Linear(d_model*3, d_model*2) if self.aux_past_channels>0 else nn.Linear(d_model*2, d_model*2),
606
+ activation_fun(),
607
+ nn.Linear(d_model*2, d_model)
608
+ )
609
+
610
+ self.fut_sequential = nn.Sequential(
611
+ nn.Linear(d_model*3, d_model*2) if self.aux_fut_channels>0 else nn.Linear(d_model*2, d_model*2),
612
+ activation_fun(),
613
+ nn.Linear(d_model*2, d_model)
614
+ )
615
+
616
+ self.y_sequential = nn.Sequential(
617
+ nn.Linear(d_model*2, d_model),
618
+ activation_fun(),
619
+ nn.Linear(d_model, d_model)
620
+ )
621
+
622
+ self.attention = sub_nn.InterpretableMultiHead(d_model, d_head, n_head)
623
+
624
+ # if learn_var == True, we want to predict an additional variable for the variance
625
+ # just an intermediate dimension for linears
626
+ hidden_size = int(d_model/3)
627
+ self.eps_out_sequential = nn.Sequential(
628
+ nn.Linear(d_model, hidden_size),
629
+ activation_fun(),
630
+ nn.Linear(hidden_size, output_channel)
631
+ )
632
+
633
+ self.var_out_sequential = nn.Sequential(
634
+ nn.Linear(output_channel, hidden_size),
635
+ nn.Linear(hidden_size, d_model),
636
+ activation_fun(),
637
+ nn.Linear(d_model, d_model),
638
+ activation_fun(),
639
+ nn.Linear(d_model, hidden_size),
640
+ nn.Linear(hidden_size, output_channel)
641
+ )
642
+
643
+
644
+ def forward(self, y_noised:torch.Tensor, y_past:torch.Tensor,
645
+ cat_past:torch.Tensor, cat_fut:torch.Tensor,
646
+ num_past:Union[torch.Tensor,None] = None, num_fut:Union[torch.Tensor,None] = None):
647
+ """'DIFFUSION SUBNET
648
+ Args:
649
+ y_noised (torch.Tensor): [B, future_step, num_var]
650
+ y_past (torch.Tensor): [B, past_step, num_var]
651
+ cat_past (torch.Tensor, optional): [B, past_step, d_model]. Defaults to None.
652
+ cat_fut (torch.Tensor, optional): [B, future_step, d_model]. Defaults to None.
653
+ num_past (torch.Tensor, optional): [B, past_step, d_model]. Defaults to None.
654
+ num_fut (torch.Tensor, optional): [B, future_step, d_model]. Defaults to None.
655
+
656
+ Returns:
657
+ torch.Tensor: predicted noise [B, future_step, num_var]. According to 'learn_var' param in initialization, the subnet returns another tensor of same size about the variance
658
+ """
659
+ emb_y_noised = self.y_noised_linear(y_noised.float())
660
+ emb_y_past = self.y_past_linear(y_past)
661
+
662
+ # LIN FOR PAST
663
+ past = [emb_y_past, cat_past]
664
+ if self.aux_past_channels>0:
665
+ past.append(num_past)
666
+ past_seq_input = torch.cat(past, dim=2) # type: ignore
667
+ past_seq = self.past_sequential(past_seq_input) # -> [B, future_step, d_model]
668
+
669
+ # LIN FOR FUT
670
+ fut = [emb_y_noised, cat_fut]
671
+ if self.aux_fut_channels>0:
672
+ fut.append(num_fut)
673
+ fut_seq_input = torch.cat(fut, dim=2) # type: ignore
674
+ fut_seq = self.fut_sequential(fut_seq_input) # -> [B, future_step, d_model]
675
+ # ATTENTION
676
+ attention = self.attention(fut_seq, past_seq, emb_y_past)
677
+ # OUTPUT
678
+ eps_out = self.eps_out_sequential(attention)
679
+ # if LEARN_VAR
680
+ if self.learn_var:
681
+ var_out = eps_out.detach()
682
+ var_out = self.var_out_sequential(var_out)
683
+ return eps_out, var_out
684
+
685
+ return eps_out
686
+
687
+ class SubNet2(nn.Module):
688
+ def __init__(self, aux_past_ch, aux_fut_ch, learn_var:bool, past_steps, future_steps, output_channel:int, d_model:int, activation:str, dropout_rate:float):
689
+ super().__init__()
690
+ self.aux_past_channels = aux_past_ch
691
+ self.aux_fut_channels = aux_fut_ch
692
+ self.learn_var = learn_var
693
+ # in_size changes wrt numerical vars
694
+ in_size = ( past_steps*(2+bool(aux_past_ch)) + future_steps*(2 + bool(aux_fut_ch)) ) * d_model
695
+ out_size = output_channel * future_steps
696
+
697
+ activation_fun = eval(activation)
698
+
699
+ self.y_noised_linear = nn.Linear(output_channel, d_model)
700
+ self.y_past_linear = nn.Linear(output_channel, d_model)
701
+
702
+ hidden_size = int( (output_channel + d_model)/2 )
703
+ self.eps_out_sequential = nn.Sequential(
704
+ nn.Linear(in_size, hidden_size),
705
+ nn.Dropout(dropout_rate),
706
+ nn.Linear(hidden_size, d_model),
707
+ activation_fun(),
708
+ nn.Dropout(dropout_rate),
709
+ nn.Linear(d_model, hidden_size),
710
+ activation_fun(),
711
+ nn.Dropout(dropout_rate),
712
+ nn.Linear(hidden_size, out_size)
713
+ )
714
+
715
+ self.var_out_sequential = nn.Sequential(
716
+ nn.Linear(in_size, hidden_size),
717
+ nn.Dropout(dropout_rate),
718
+ nn.Linear(hidden_size, hidden_size),
719
+ activation_fun(),
720
+ nn.Dropout(dropout_rate),
721
+ nn.Linear(hidden_size, out_size)
722
+ )
723
+
724
+ def forward(self, y_noised:torch.Tensor, y_past:torch.Tensor,
725
+ cat_past:torch.Tensor, cat_fut:torch.Tensor,
726
+ num_past:Union[torch.Tensor,None] = None, num_fut:Union[torch.Tensor,None] = None):
727
+
728
+ B, fut_step, n_var = y_noised.shape
729
+ emb_y_noised = self.y_noised_linear(y_noised.float()).view(B, -1)
730
+ emb_y_past = self.y_past_linear(y_past).view(B, -1)
731
+
732
+ # concat autoregressive variables and categorical ones that are always available
733
+ full_concat = torch.cat((emb_y_noised, emb_y_past, cat_past.view(B, -1), cat_fut.view(B, -1)), dim=1)
734
+ # concat numerical vars when available
735
+ if num_past is not None:
736
+ assert self.aux_past_channels>0 # check with flag in subnet init
737
+ full_concat = torch.cat((full_concat, num_past.view(B, -1)), dim = 1)
738
+ if num_fut is not None:
739
+ assert self.aux_fut_channels>0 # check with flag in subnet init
740
+ full_concat = torch.cat((full_concat, num_fut.view(B, -1)), dim = 1)
741
+
742
+ eps_out = self.eps_out_sequential(full_concat).view(B, fut_step, n_var)
743
+ if self.learn_var:
744
+ var_out = self.var_out_sequential(full_concat.detach()).view(B, fut_step, n_var)
745
+ return eps_out, var_out
746
+ return eps_out
747
+
748
+ class SubNet3(nn.Module):
749
+ def __init__(self, learn_var, flag_aux_num, num_var, d_model, pred_step, num_layers, d_head, n_head, dropout):
750
+ super().__init__()
751
+ self.learn_var = learn_var
752
+ self.flag_aux_num = flag_aux_num
753
+
754
+ # Autoregressive with RNN (y NOT embedded as input)
755
+ self.y_d_model = nn.Linear(num_var, d_model)
756
+ self.rnn = sub_nn.LSTM_Model(num_var, d_model, pred_step, num_layers, dropout)
757
+ self.eps_pred_grn = sub_nn.GRN(d_model, dropout)
758
+
759
+ #categorical
760
+ self.cat_MHA = sub_nn.InterpretableMultiHead(d_model, d_head, n_head)
761
+ self.cat_grn = sub_nn.GRN(d_model, dropout)
762
+ self.cat_res_conn = sub_nn.ResidualConnection(d_model, dropout)
763
+
764
+ #numerical
765
+ if flag_aux_num:
766
+ self.num_MHA = sub_nn.InterpretableMultiHead(d_model, d_head, n_head)
767
+ self.num_grn = sub_nn.GRN(d_model, dropout)
768
+ self.num_res_conn = sub_nn.ResidualConnection(d_model, dropout)
769
+
770
+ # EPS PREDICTION
771
+ self.eps_final_grn = sub_nn.GRN(d_model, dropout)
772
+ self.eps_out_linear = nn.Linear(d_model, num_var)
773
+
774
+ if learn_var:
775
+ self.emb_eps_pred = nn.Linear(num_var, d_model)
776
+ self.var_att = sub_nn.InterpretableMultiHead(d_model, d_head, n_head)
777
+ self.var_grn = sub_nn.GRN(d_model, dropout)
778
+ self.var_out = nn.Linear(d_model, num_var)
779
+
780
+ def forward(self, y_noised:torch.Tensor, y_past:torch.Tensor,
781
+ cat_past:torch.Tensor, cat_fut:torch.Tensor,
782
+ num_past:Union[torch.Tensor,None] = None, num_fut:Union[torch.Tensor,None] = None):
783
+
784
+ # Autoregressive
785
+ emb_y_past = self.y_d_model(y_past)
786
+ pred_y_fut = self.rnn(emb_y_past)
787
+ #re-embedding future
788
+ emb_pred_y_fut = self.y_d_model(pred_y_fut)
789
+ emb_y_noised = self.y_d_model(y_noised.float())
790
+
791
+ eps_pred = self.eps_pred_grn(emb_pred_y_fut - emb_y_noised, using_norm=False)
792
+
793
+ # Categorical contribution
794
+ cat_att = self.cat_MHA(cat_fut, cat_past, emb_y_past)
795
+ cat_att = self.cat_grn(cat_att, using_norm=False)
796
+ eps_pred = self.cat_res_conn(cat_att, eps_pred, using_norm=False)
797
+
798
+ # Numerical contribution
799
+ if self.flag_aux_num:
800
+ if num_past is None:
801
+ num_past = torch.ones_like(cat_past)
802
+ if num_fut is None:
803
+ num_fut = torch.ones_like(cat_fut)
804
+ num_att = self.num_MHA(num_fut, num_past, emb_y_past)
805
+ num_att = self.num_grn(num_att, using_norm=False)
806
+ eps_pred = self.num_res_conn(num_att, eps_pred, using_norm=False)
807
+
808
+ eps_pred = self.eps_final_grn(eps_pred, using_norm=False)
809
+ eps_pred = self.eps_out_linear(eps_pred)
810
+
811
+ if self.learn_var:
812
+ emb_eps_pred = self.emb_eps_pred(eps_pred.detach())
813
+ emb_eps_pred = self.var_att(emb_y_noised.detach(), emb_pred_y_fut.detach(), emb_eps_pred)
814
+ emb_var_pred = self.var_grn(emb_eps_pred, using_norm=False)
815
+ var_pred = self.var_out(emb_var_pred)
816
+ return eps_pred, var_pred
817
+ return eps_pred
818
+
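A minimal usage sketch, assuming the constructor signature shown above. The hyperparameter values are hypothetical, the parent Base class may require additional keyword arguments, and the batch dict is expected to come from the dsipts data pipeline described in forward/inference.

```python
model = Diffusion(
    d_model=32, out_channels=1, past_steps=64, future_steps=16,
    past_channels=3, future_channels=2, embs=[7, 24],
    learn_var=False, cosine_alpha=True, diffusion_steps=50, beta=0.03, gamma=0.01,
    n_layers_RNN=2, d_head=8, n_head=4, dropout_rate=0.1, activation='nn.GELU',
    subnet=1, perc_subnet_learning_for_step=0.1,
)
# loss = model(batch)             # training: summed loss over the sampled subnets
# y_hat = model.inference(batch)  # [batch_size, future_steps, out_channels, 1]
```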