dsipts 1.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsipts/__init__.py +48 -0
- dsipts/data_management/__init__.py +0 -0
- dsipts/data_management/monash.py +338 -0
- dsipts/data_management/public_datasets.py +162 -0
- dsipts/data_structure/__init__.py +0 -0
- dsipts/data_structure/data_structure.py +1167 -0
- dsipts/data_structure/modifiers.py +213 -0
- dsipts/data_structure/utils.py +173 -0
- dsipts/models/Autoformer.py +199 -0
- dsipts/models/CrossFormer.py +152 -0
- dsipts/models/D3VAE.py +196 -0
- dsipts/models/Diffusion.py +818 -0
- dsipts/models/DilatedConv.py +342 -0
- dsipts/models/DilatedConvED.py +310 -0
- dsipts/models/Duet.py +197 -0
- dsipts/models/ITransformer.py +167 -0
- dsipts/models/Informer.py +180 -0
- dsipts/models/LinearTS.py +222 -0
- dsipts/models/PatchTST.py +181 -0
- dsipts/models/Persistent.py +44 -0
- dsipts/models/RNN.py +213 -0
- dsipts/models/Samformer.py +139 -0
- dsipts/models/TFT.py +269 -0
- dsipts/models/TIDE.py +296 -0
- dsipts/models/TTM.py +252 -0
- dsipts/models/TimeXER.py +184 -0
- dsipts/models/VQVAEA.py +299 -0
- dsipts/models/VVA.py +247 -0
- dsipts/models/__init__.py +0 -0
- dsipts/models/autoformer/__init__.py +0 -0
- dsipts/models/autoformer/layers.py +352 -0
- dsipts/models/base.py +439 -0
- dsipts/models/base_v2.py +444 -0
- dsipts/models/crossformer/__init__.py +0 -0
- dsipts/models/crossformer/attn.py +118 -0
- dsipts/models/crossformer/cross_decoder.py +77 -0
- dsipts/models/crossformer/cross_embed.py +18 -0
- dsipts/models/crossformer/cross_encoder.py +99 -0
- dsipts/models/d3vae/__init__.py +0 -0
- dsipts/models/d3vae/diffusion_process.py +169 -0
- dsipts/models/d3vae/embedding.py +108 -0
- dsipts/models/d3vae/encoder.py +326 -0
- dsipts/models/d3vae/model.py +211 -0
- dsipts/models/d3vae/neural_operations.py +314 -0
- dsipts/models/d3vae/resnet.py +153 -0
- dsipts/models/d3vae/utils.py +630 -0
- dsipts/models/duet/__init__.py +0 -0
- dsipts/models/duet/layers.py +438 -0
- dsipts/models/duet/masked.py +202 -0
- dsipts/models/informer/__init__.py +0 -0
- dsipts/models/informer/attn.py +185 -0
- dsipts/models/informer/decoder.py +50 -0
- dsipts/models/informer/embed.py +125 -0
- dsipts/models/informer/encoder.py +100 -0
- dsipts/models/itransformer/Embed.py +142 -0
- dsipts/models/itransformer/SelfAttention_Family.py +355 -0
- dsipts/models/itransformer/Transformer_EncDec.py +134 -0
- dsipts/models/itransformer/__init__.py +0 -0
- dsipts/models/patchtst/__init__.py +0 -0
- dsipts/models/patchtst/layers.py +569 -0
- dsipts/models/samformer/__init__.py +0 -0
- dsipts/models/samformer/utils.py +154 -0
- dsipts/models/tft/__init__.py +0 -0
- dsipts/models/tft/sub_nn.py +234 -0
- dsipts/models/timexer/Layers.py +127 -0
- dsipts/models/timexer/__init__.py +0 -0
- dsipts/models/ttm/__init__.py +0 -0
- dsipts/models/ttm/configuration_tinytimemixer.py +307 -0
- dsipts/models/ttm/consts.py +16 -0
- dsipts/models/ttm/modeling_tinytimemixer.py +2099 -0
- dsipts/models/ttm/utils.py +438 -0
- dsipts/models/utils.py +624 -0
- dsipts/models/vva/__init__.py +0 -0
- dsipts/models/vva/minigpt.py +83 -0
- dsipts/models/vva/vqvae.py +459 -0
- dsipts/models/xlstm/__init__.py +0 -0
- dsipts/models/xlstm/xLSTM.py +255 -0
- dsipts-1.1.5.dist-info/METADATA +31 -0
- dsipts-1.1.5.dist-info/RECORD +81 -0
- dsipts-1.1.5.dist-info/WHEEL +5 -0
- dsipts-1.1.5.dist-info/top_level.txt +1 -0
@@ -0,0 +1,818 @@
import torch
import torch.nn as nn
import numpy as np
from .tft import sub_nn

try:
    import lightning.pytorch as pl
    from .base_v2 import Base
    OLD_PL = False
except ImportError:  # fall back to the legacy package name
    import pytorch_lightning as pl
    OLD_PL = True
    from .base import Base
from typing import List, Union
from ..data_structure.utils import beauty_string
from .utils import get_scope
class Diffusion(Base):
    handle_multivariate = False
    handle_future_covariates = True
    handle_categorical_variables = True
    handle_quantile_loss = False
    description = get_scope(handle_multivariate, handle_future_covariates,
                            handle_categorical_variables, handle_quantile_loss)

    def __init__(self,
                 d_model: int,
                 out_channels: int,
                 past_steps: int,
                 future_steps: int,
                 past_channels: int,
                 future_channels: int,
                 embs: List[int],

                 learn_var: bool,
                 cosine_alpha: bool,
                 diffusion_steps: int,
                 beta: float,
                 gamma: float,

                 # for the subnets
                 n_layers_RNN: int,
                 d_head: int,
                 n_head: int,
                 dropout_rate: float,
                 activation: str,
                 subnet: int,
                 perc_subnet_learning_for_step: float,

                 persistence_weight: float = 0.0,
                 loss_type: str = 'l1',
                 quantiles: List[float] = [],
                 optim: Union[str, None] = None,
                 optim_config: Union[dict, None] = None,
                 scheduler_config: Union[dict, None] = None,
                 **kwargs) -> None:
        """Denoising Diffusion Probabilistic Model

        Args:
            d_model (int): hidden dimension of the model
            out_channels (int): number of target variables
            past_steps (int): size of the past window
            future_steps (int): size of the future window to be predicted
            past_channels (int): number of variables available for the past context
            future_channels (int): number of variables known in the future, available for forecasting
            embs (List[int]): dimensions of the categorical variables for the embeddings
            learn_var (bool): if True the model learns the posterior variance, otherwise it uses the variance of the posterior distribution
            cosine_alpha (bool): flag for the generation of the alphas and betas
            diffusion_steps (int): number of noising steps applied to the initial sample
            beta (float): starting value used to generate the diffusion perturbations. Ignored if cosine_alpha == True
            gamma (float): trade-off variable balancing the loss over the noise prediction and the negative likelihood / KL divergence
            n_layers_RNN (int): param for the subnet
            d_head (int): param for the subnet
            n_head (int): param for the subnet
            dropout_rate (float): param for the subnet
            activation (str): param for the subnet
            subnet (int): 1 for the attention subnet, 2 for the linear subnet, 3 for the RNN subnet. Others can be added (wait for Black Friday for discounts)
            perc_subnet_learning_for_step (float): fraction of the subnets trained on every batch. Decrease this value if the loss blows up.
            persistence_weight (float, optional): Defaults to 0.0.
            loss_type (str, optional): Defaults to 'l1'.
            quantiles (List[float], optional): only [] is accepted. Defaults to [].
            optim (Union[str, None], optional): Defaults to None.
            optim_config (Union[dict, None], optional): Defaults to None.
            scheduler_config (Union[dict, None], optional): Defaults to None.
        """
        super().__init__(**kwargs)
        self.save_hyperparameters(logger=False)

        self.dropout = dropout_rate
        self.persistence_weight = persistence_weight
        self.loss_type = loss_type
        self.optim = optim
        self.optim_config = optim_config
        self.scheduler_config = scheduler_config

        #* HANDLING LOSSES
        # Unlike the other models, quantiles are not used here.
        # Below we define the loss between the predicted noise and the actual noise;
        # the losses for the distributions are defined as functions further down.

        # trade-off between the noise loss and the distribution loss
        self.gamma = gamma
        ## cannot handle quantiles
        if len(quantiles) > 0:
            quantiles = []
        assert len(quantiles) == 0
        self.mul = 1
        self.use_quantiles = False
        if self.loss_type == 'mse':
            self.loss = nn.MSELoss()
        else:
            self.loss = nn.L1Loss()

        #* >>>>>>>>>>>>> canonical data parameters
        # dimension of the model, number of variables and sequence-length info
        self.d_model = d_model
        self.past_steps = past_steps
        self.future_steps = future_steps
        self.past_channels = past_channels
        self.future_channels = future_channels
        self.output_channels = out_channels

        #* >>>>>>>>>>>>> specific model parameters
        # whether we also learn the variance instead of using the standard posterior variance of the diffusion process
        self.learn_var = learn_var

        # number of noising steps
        self.T = diffusion_steps

        # sampling weights used to avoid under-trained subnets
        self.multinomial_step_weights = np.ones(diffusion_steps)

        # % of all the subnets trained on every batch of every epoch;
        # this percentage is controlled by the parameter 'perc_subnet_learning_for_step':
        # - decrease or increase it according to the efficiency of your machine
        self.simultaneous_steps = max(int(diffusion_steps*perc_subnet_learning_for_step), 1)


        #* >>>>>>>>>>>>> specific diffusion setup
        self.s = (100*self.T)**(-1)  # offset to avoid numerical problems in computations near 0
        # value found by trial and error
        # the betas and the cumulative products of the alphas are the main quantities of the diffusion model, both in (0,1) at each step t:
        # - betas_t -> variance added at the t-th step
        # - alphas_t = 1 - betas_t
        # - alphas_cumprod_t -> accumulation of the alphas up to step t;
        #   it can be read as the signal of the starting input remaining at the t-th step!!

        # the flag below chooses how to generate them!
        if cosine_alpha:
            # COSINE ALPHA computation
            # aux_perc = 0.05
            # avoid_comp_err_norm = self.T*(1+aux_perc) # enlarging self.T to avoid errors in computations using cos^2
            # the t-th cumulative product of the alphas is the 'forgetting' schedule of the initial sample after t diffusion steps;
            # in this procedure we use the function below to produce all the cumulative products of the alphas
            # (stored as an np.array so that the slice below divides element-wise)
            f_cos_t = np.array([(np.cos((t/self.T + self.s)/(1 + self.s) * np.pi/2))**2 for t in range(self.T)])

            self.alphas_cumprod = np.append(1 - self.s, f_cos_t[1:]/f_cos_t[0])  # computed as the scaled cumulative products of the alphas f_cos_t[1:]/f_cos_t[0]
            self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])  # auxiliary vector so the same index accesses alpha_cumprod_t and alpha_cumprod_{t-1}
            self.alphas = self.alphas_cumprod / self.alphas_cumprod_prev
            self.betas = 1 - self.alphas

        else:
            # STANDARD ALPHA computation
            # beta is kept constant in [0,1) for all time steps; good values are near 0.03.
            # Unlike above, here we generate all the needed values starting from the betas.
            self.betas = np.array([beta]*self.T)
            self.alphas = 1 - self.betas
            self.alphas_cumprod = np.cumprod(self.alphas)
            self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])  # auxiliary vector so the same index accesses alpha_cumprod_t and alpha_cumprod_{t-1}

        # values for the posterior distribution, i.e. the target distribution of each subnet;
        # all of them are cast to tensors during the computations using _extract_into_tensor
        self.posterior_mean_coef1 = self.betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(self.alphas) / (1.0 - self.alphas_cumprod)
        self.posterior_variance = np.append(self.s, self.betas[1:] * (1.0 - self.alphas_cumprod_prev[1:]) / (1.0 - self.alphas_cumprod[1:]))
        self.posterior_log_variance = np.log(self.posterior_variance)
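        # A minimal standalone sketch of the constant-beta branch above (assuming
        # T=4 and beta=0.03; run it outside the class to inspect the schedule):
        #
        #     import numpy as np
        #     T, beta = 4, 0.03
        #     betas = np.array([beta]*T)
        #     alphas_cumprod = np.cumprod(1 - betas)   # remaining signal at step t
        #     print(np.round(alphas_cumprod, 4))       # [0.97   0.9409 0.9127 0.8853]
        #
        # alphas_cumprod decreases monotonically towards 0: after enough steps the
        # noised sample is (almost) pure white noise.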
        #* >>>>>>>>>>>>> LAYERS
        # for the other numerical variables in the past
        self.aux_past_channels = past_channels - out_channels
        self.linear_aux_past = nn.ModuleList([nn.Linear(1, d_model) for _ in range(self.aux_past_channels)])

        # for the numerical variables in the future
        self.aux_fut_channels = future_channels
        self.linear_aux_fut = nn.ModuleList([nn.Linear(1, d_model) for _ in range(self.aux_fut_channels)])

        # embedding of the categorical variables for both past and future (ASSUMING EITHER BOTH ARE AVAILABLE OR NEITHER)
        self.seq_len = past_steps + future_steps
        self.emb_cat_var = sub_nn.embedding_cat_variables(self.seq_len, future_steps, d_model, embs, self.device)

        # diffusion subnets, one for each step
        if subnet == 1:
            self.sub_nets = nn.ModuleList([
                SubNet1(self.aux_past_channels, self.aux_fut_channels, learn_var, out_channels, d_model, d_head, n_head, activation, dropout_rate) for _ in range(diffusion_steps)
            ])
        elif subnet == 2:
            self.sub_nets = nn.ModuleList([
                SubNet2(self.aux_past_channels, self.aux_fut_channels, learn_var, past_steps, future_steps, out_channels, d_model, activation, dropout_rate) for _ in range(diffusion_steps)
            ])
        elif subnet == 3:
            aux_num_available = self.aux_past_channels > 0 or self.aux_fut_channels > 0  # if we have numerical vars, use them
            self.sub_nets = nn.ModuleList([
                SubNet3(learn_var, aux_num_available, out_channels, d_model, future_steps, n_layers_RNN, d_head, n_head, dropout_rate) for _ in range(diffusion_steps)
            ])
        else:
            raise ValueError("Wrong number for Subnet. Not yet implemented!")
    def forward(self, batch: dict) -> float:
        """Training process of the diffusion network

        Args:
            batch (dict): loaded variables

        Returns:
            float: total loss over the prediction of the noises across all the extracted subnets
        """

        # LOADING THE TARGET VARIABLES
        y_to_be_pred = batch['y'].to(self.device)

        # LOADING THE AUTOREGRESSIVE CONTEXT OF THE TARGET VARIABLES
        num_past = batch['x_num_past'].to(self.device)
        idx_target = batch['idx_target'][0]
        y_past = num_past[:,:,idx_target]

        # LOADING THE EMBEDDED CATEGORICAL VARIABLES
        emb_cat_past, emb_cat_fut = self.cat_categorical_vars(batch)
        emb_cat_past = torch.mean(emb_cat_past, dim=2)
        emb_cat_fut = torch.mean(emb_cat_fut, dim=2)

        ### LOADING PAST AND FUTURE NUMERICAL VARIABLES
        # load the auxiliary numerical variables into the model

        if self.aux_past_channels > 0:  # if we have extra numerical variables about the past
            aux_num_past = self.remove_var(num_past, idx_target, 2)  # remove the autoregressive variable
            assert self.aux_past_channels == aux_num_past.size(2), beauty_string(f"{self.aux_past_channels} LAYERS FOR PAST VARS AND {aux_num_past.size(2)} VARS",'section',True)  # check that we are using the expected number of past variables
            # concat all the embedded vars and take their mean
            aux_emb_num_past = torch.Tensor().to(self.device)
            for i, layer in enumerate(self.linear_aux_past):
                aux_emb_past = layer(aux_num_past[:,:,[i]]).unsqueeze(2)
                aux_emb_num_past = torch.cat((aux_emb_num_past, aux_emb_past), dim=2)
            aux_emb_num_past = torch.mean(aux_emb_num_past, dim=2)
        else:
            aux_emb_num_past = None  # no available vars

        if self.aux_fut_channels > 0:  # if we have extra numerical variables about the future
            # AUX means AUXILIARY variables
            aux_num_fut = batch['x_num_future'].to(self.device)
            assert self.aux_fut_channels == aux_num_fut.size(2), beauty_string(f"{self.aux_fut_channels} LAYERS FOR FUTURE VARS AND {aux_num_fut.size(2)} VARS",'section',True)  # check that we are using the expected number of future variables
            # concat all the embedded vars and take their mean
            aux_emb_num_fut = torch.Tensor().to(self.device)
            for j, layer in enumerate(self.linear_aux_fut):
                aux_emb_fut = layer(aux_num_fut[:,:,[j]]).unsqueeze(2)
                aux_emb_num_fut = torch.cat((aux_emb_num_fut, aux_emb_fut), dim=2)
            aux_emb_num_fut = torch.mean(aux_emb_num_fut, dim=2)
        else:
            aux_emb_num_fut = None  # no available vars

        ### the actual DIFFUSION process ----------------------------------------------

        ##* CHOOSE THE t-th SUBNETS
        # We have T subnets: [0, 1, ..., T-1].
        values = list(range(self.T))

        ## Probabilistic way to choose the subnets properly
        # avoid exploding step weights as the trainings go on
        self.improving_weight_during_training()
        # normalize the weights for a more stable subnet training
        t_wei = self.multinomial_step_weights/np.sum(self.multinomial_step_weights)
        # extract the timesteps t
        drawn_t = np.random.choice(values, size=self.simultaneous_steps, replace=False, p=t_wei)  # type: ignore

        # always include t=0, so the likelihood term below is also trained
        if 0 not in drawn_t: drawn_t = np.append(drawn_t, 0)
        # update the weights
        non_draw_val = np.delete(values, drawn_t)  # type: ignore
        self.multinomial_step_weights[non_draw_val] += 1  # increase the weights of the non-extracted subnets

        # init a negative loss for the first step
        tot_loss = -1
        for t in drawn_t:
            # LOADING THE SUBNET
            sub_net = self.sub_nets[t]

            # get y and noise it
            y_noised, true_mean, true_log_var, actual_noise = self.q_sample(y_to_be_pred, t)

            # compute the output of that network using the noised sample;
            # the output is the predicted noise and, if learn_var=True, a vector for the variances
            if self.learn_var:
                # predict the noise!
                eps_pred, var_aux_out = sub_net(y_noised, y_past, emb_cat_past, emb_cat_fut, aux_emb_num_past, aux_emb_num_fut)

                # compute the posterior variance of the NN (by interpolation)
                pre_var_t = self._extract_into_tensor(self.betas, t, eps_pred.shape)
                post_var_t = self._extract_into_tensor(self.posterior_variance, t, eps_pred.shape)
                post_sigma = torch.exp(var_aux_out*torch.log(pre_var_t) + (1-var_aux_out)*torch.log(post_var_t))  # variance, not log_var
            else:
                eps_pred = sub_net(y_noised, y_past, emb_cat_past, emb_cat_fut, aux_emb_num_past, aux_emb_num_fut)
                post_sigma = self._extract_into_tensor(self.posterior_variance, t, eps_pred.shape)

            # posterior mean, assuming the predicted noise is the actual one
            out_mean = self._extract_into_tensor(np.sqrt(1/self.alphas), t, eps_pred.shape) * ( y_noised - self._extract_into_tensor(self.betas/np.sqrt(1-self.alphas_cumprod), t, eps_pred.shape) * eps_pred )

            # at the first timestep use the negative likelihood,
            if t == 0:
                # post_var = self._extract_into_tensor(self.posterior_variance, t, y_to_be_pred.shape)
                neg_likelihoods = -torch.log(self.gaussian_likelihood(y_to_be_pred, out_mean, post_sigma))  #! (values to be predicted, mean of the predicted values, variance)
                distribution_loss = torch.mean(neg_likelihoods)

            # otherwise use KL( q(x_{t-1}|x_t, x_0) || p(x_{t-1}|x_t) )
            else:
                # COMPUTE THE LOSS between the TRUE posterior and the PREDICTED one
                kl_divergence = self.normal_kl(true_mean, true_log_var, out_mean, torch.log(post_sigma))  # (true mean, true log var, predicted mean, predicted log var)
                distribution_loss = torch.mean(kl_divergence)

            # always compute the loss on the direct prediction of the noise
            noise_loss = self.loss(eps_pred, actual_noise)

            # if tot_loss == -1:
            #     beauty_string(f'NOISE LOSS: {noise_loss.item()}','info',True)
            #     beauty_string(f'ACTUAL NOISE: {actual_noise[0].min()}, {actual_noise[0].max()}, {actual_noise[0].mean()}, {actual_noise[0].var()}','info',True)
            #     beauty_string(f'PREDICTED NOISE: {eps_pred[0].min()}, {eps_pred[0].max()}, {eps_pred[0].mean()}, {eps_pred[0].var()}','info',True)

            noise_loss += self.gamma*distribution_loss  # add the distribution loss, scaled according to gamma

            # update the total loss
            if tot_loss == -1:
                tot_loss = noise_loss
            else:
                tot_loss += noise_loss
        return tot_loss
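    # A tiny standalone sketch of the weighted draw used in forward (assuming T=5
    # subnets and 2 simultaneous steps): subnets that are not drawn gain weight,
    # so they become more likely to be trained on the next batch.
    #
    #     import numpy as np
    #     w = np.ones(5)
    #     drawn = np.random.choice(range(5), size=2, replace=False, p=w/w.sum())
    #     w[np.delete(np.arange(5), drawn)] += 1   # e.g. drawn=[3,0] -> w=[1,2,2,1,2]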
    def training_step(self, batch, batch_idx):
        # the training loss is already computed in the forward method
        loss_eps = self(batch)
        return loss_eps
    def inference(self, batch: dict) -> torch.Tensor:
        """Inference process to forecast the future y

        Args:
            batch (dict): keys checked -> ['x_num_past', 'idx_target', 'x_num_future', 'x_cat_past', 'x_cat_future']

        Returns:
            torch.Tensor: generated sequence [batch_size, future_steps, num_var, 1]
        """
        # LOADING THE AUTOREGRESSIVE CONTEXT OF THE TARGET VARIABLES
        num_past = batch['x_num_past'].to(self.device)
        batch_size = num_past.shape[0]
        idx_target = batch['idx_target'][0]
        y_past = num_past[:,:,idx_target]

        # LOADING THE EMBEDDED CATEGORICAL VARIABLES
        emb_cat_past, emb_cat_fut = self.cat_categorical_vars(batch)
        emb_cat_past = torch.mean(emb_cat_past, dim=2)
        emb_cat_fut = torch.mean(emb_cat_fut, dim=2)

        ### LOADING PAST AND FUTURE NUMERICAL VARIABLES
        # load the auxiliary numerical variables into the model

        if self.aux_past_channels > 0:  # if we have extra numerical variables about the past
            aux_num_past = self.remove_var(num_past, idx_target, 2)  # remove the autoregressive variable
            assert self.aux_past_channels == aux_num_past.size(2), beauty_string(f"{self.aux_past_channels} LAYERS FOR PAST VARS AND {aux_num_past.size(2)} VARS",'section',True)  # check that we are using the expected number of past variables
            # concat all the embedded vars and take their mean
            aux_emb_num_past = torch.Tensor().to(self.device)
            for i, layer in enumerate(self.linear_aux_past):
                aux_emb_past = layer(aux_num_past[:,:,[i]]).unsqueeze(2)
                aux_emb_num_past = torch.cat((aux_emb_num_past, aux_emb_past), dim=2)
            aux_emb_num_past = torch.mean(aux_emb_num_past, dim=2)
        else:
            aux_emb_num_past = None  # no available vars

        if self.aux_fut_channels > 0:  # if we have extra numerical variables about the future
            # AUX means AUXILIARY variables
            aux_num_fut = batch['x_num_future'].to(self.device)
            assert self.aux_fut_channels == aux_num_fut.size(2), beauty_string(f"{self.aux_fut_channels} LAYERS FOR FUTURE VARS AND {aux_num_fut.size(2)} VARS",'section',True)  # check that we are using the expected number of future variables
            # concat all the embedded vars and take their mean
            aux_emb_num_fut = torch.Tensor().to(self.device)
            for j, layer in enumerate(self.linear_aux_fut):
                aux_emb_fut = layer(aux_num_fut[:,:,[j]]).unsqueeze(2)
                aux_emb_num_fut = torch.cat((aux_emb_num_fut, aux_emb_fut), dim=2)
            aux_emb_num_fut = torch.mean(aux_emb_num_fut, dim=2)
        else:
            aux_emb_num_fut = None  # no available vars

        # DIFFUSION INFERENCE
        # start from white noise (one can also use torch.normal(0, 1, size=y_noised.shape))
        y_noised = torch.randn((batch_size, self.future_steps, self.output_channels)).to(self.device)
        # pass the white noise through the subnets
        for t in range(self.T-1, -1, -1):  # INVERSE cycle over all the subnets, from the last one down to the first
            sub_net = self.sub_nets[t]  # load the subnet

            ## CHECK THE NUMBER OF PARAMS
            # model_parameters = filter(lambda p: p.requires_grad, model.parameters())
            # params = sum([np.prod(p.size()) for p in model_parameters]) -> 13K
            if self.learn_var:
                eps_pred, var_aux_out = sub_net(y_noised, y_past, emb_cat_past, emb_cat_fut, aux_emb_num_past, aux_emb_num_fut)
                # interpolation of the variance
                pre_var_t = self._extract_into_tensor(self.betas, t, eps_pred.shape)
                post_var_t = self._extract_into_tensor(self.posterior_variance, t, eps_pred.shape)
                post_sigma = torch.exp(var_aux_out*torch.log(pre_var_t) + (1-var_aux_out)*torch.log(post_var_t))
            else:
                eps_pred = sub_net(y_noised, y_past, emb_cat_past, emb_cat_fut, aux_emb_num_past, aux_emb_num_fut)
                post_sigma = self._extract_into_tensor(self.posterior_variance, t, eps_pred.shape)

            # sample x_{t-1} from the model at the given timestep
            # y_noised = self._extract_into_tensor(1/np.sqrt(self.alphas), t, y_noised.shape)*( y_noised - self._extract_into_tensor(np.sqrt(self.betas), t, eps_pred.shape)*eps_pred )
            y_noised = 1/torch.sqrt(1-post_sigma)*(y_noised - torch.sqrt(post_sigma)*eps_pred)

            # if t > 0:
            #     noise = torch.rand_like(y_noised).to(self.device)
            #     y_noised = y_noised + torch.sqrt(post_sigma)*noise

        out = y_noised.view(-1, self.future_steps, self.output_channels, 1)
        return out
    # for validation, extract the output from the self.inference method
    def validation_step(self, batch, batch_idx):
        out = self.inference(batch)
        loss = self.compute_loss(batch, out)
        return loss
    # function to concat the embedded categorical variables
    def cat_categorical_vars(self, batch: dict):
        """Extracting the categorical context about past and future

        Args:
            batch (dict): keys checked -> ['x_cat_past', 'x_cat_future']

        Returns:
            List[torch.Tensor, torch.Tensor]: cat_emb_past, cat_emb_fut
        """
        # GET THE AVAILABLE CATEGORICAL CONTEXT
        if 'x_cat_past' in batch.keys() and 'x_cat_future' in batch.keys():  # if we have both
            # HERE WE ASSUME THE SAME NUMBER AND KIND OF VARIABLES IN PAST AND FUTURE
            cat_past = batch['x_cat_past'].to(self.device)
            cat_fut = batch['x_cat_future'].to(self.device)
            # CONCAT THEM, according to the usage of self.emb_cat_var, then do the actual embedding
            cat_full = torch.cat((cat_past, cat_fut), dim=1)
            emb_cat_full = self.emb_cat_var(cat_full, self.device)
        else:
            emb_cat_full = self.emb_cat_var(batch['x_num_past'].shape[0], self.device)

        # split the embedded categorical variables into past and future
        cat_emb_past = emb_cat_full[:,:self.past_steps,:,:]
        cat_emb_fut = emb_cat_full[:,-self.future_steps:,:,:]

        return cat_emb_past, cat_emb_fut
    # function to extract from batch['x_num_past'] all the variables except the autoregressive one
    def remove_var(self, tensor: torch.Tensor, indexes_to_exclude: list, dimension: int) -> torch.Tensor:
        """Remove variables from a tensor along the chosen dimension and positions

        Args:
            tensor (torch.Tensor): starting tensor
            indexes_to_exclude (list): indexes of the chosen dimension we want to exclude
            dimension (int): dimension of the tensor we want to work on (not a list of dims!!)

        Returns:
            torch.Tensor: new tensor without the chosen variables
        """

        remaining_idx = torch.tensor([i for i in range(tensor.size(dimension)) if i not in indexes_to_exclude]).to(tensor.device)
        # select the desired sub-tensor
        extracted_subtensors = torch.index_select(tensor, dim=dimension, index=remaining_idx)

        return extracted_subtensors
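    # E.g. for a tensor x of shape [B, L, 3], remove_var(x, [1], 2) keeps the
    # channels 0 and 2 and returns a tensor of shape [B, L, 2].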
    def improving_weight_during_training(self):
        """
        Each time we sample from the multinomial we subtract the minimum for more precise sampling,
        avoiding large learning differences among the subnets.

        This leads to more stable inference also in early training, mainly for the common context embedding.

        For probabilistic reasons the weights have to be > 0, so we subtract min-1.
        """
        self.multinomial_step_weights -= (self.multinomial_step_weights.min() - 1)
        return
    ### >>>>>>>>>>>>> AUXILIARY MODEL FUNCS
    def q_sample(self, x_start: torch.Tensor, t: int) -> List[torch.Tensor]:
        """Diffuse x_start for t diffusion steps.

        In other words, sample from q(x_t | x_0).

        Also compute the mean and variance of the diffusion posterior:

            q(x_{t-1} | x_t, x_0)

        The posterior mean and variance are the quantities to be predicted.

        Args:
            x_start (torch.Tensor): values to be predicted
            t (int): diffusion step

        Returns:
            List[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: q_sample, posterior mean, posterior log variance and the actual noise
        """
        # noise from a normal distribution
        noise = torch.randn_like(x_start)

        # direct diffusion at the t-th step
        q_sample = self._extract_into_tensor(np.sqrt(self.alphas_cumprod), t, x_start.shape) * x_start + self._extract_into_tensor(np.sqrt(1 - self.alphas_cumprod), t, x_start.shape) * noise

        # compute the mean and variance
        q_mean = self._extract_into_tensor(self.posterior_mean_coef1, t, q_sample.shape) * x_start + self._extract_into_tensor(self.posterior_mean_coef2, t, q_sample.shape) * q_sample
        q_log_var = self._extract_into_tensor(self.posterior_log_variance, t, q_sample.shape)

        # return the sample, its posterior mean and log variance, and the noise used
        return [q_sample, q_mean, q_log_var, noise]
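    # q_sample implements the closed form x_t = sqrt(alphas_cumprod_t)*x_0
    # + sqrt(1 - alphas_cumprod_t)*eps with eps ~ N(0, I), so a single call jumps
    # directly to step t. A hedged sketch (assuming a built model `m`):
    #
    #     x0 = torch.zeros(8, m.future_steps, m.output_channels)
    #     xt, mean, log_var, eps = m.q_sample(x0, t=m.T - 1)
    #     # for x0 = 0 the sample is pure scaled noise: xt = sqrt(1-acp_t)*eps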
    def normal_kl(self, mean1, logvar1, mean2, logvar2):
        """
        Compute the KL divergence between two Gaussians, also called relative entropy.
        The KL divergence of P from Q is the expected excess surprise from using Q as a model when the actual distribution is P:
        KL(P||Q) = E_P[log(P/Q)]

        In the context of machine learning, KL(P||Q) is often called the 'information gain'
        achieved if P were used instead of the currently used Q.

        Shapes are broadcast automatically, so batches can be compared to
        scalars, among other use cases.
        """
        # 1/2 * ( -1 + log(sigma2^2/sigma1^2) + sigma1^2/sigma2^2 + (mu1-mu2)^2/sigma2^2 )
        return 0.5 * (
            -1.0
            + logvar2 - logvar1
            + torch.exp(logvar1 - logvar2)
            + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
        )
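    # Sanity check (assuming any model instance `m`): the KL between identical
    # Gaussians is zero and it grows with the gap between the means.
    #
    #     z = torch.zeros(1)
    #     m.normal_kl(z, z, z, z)              # tensor([0.])
    #     m.normal_kl(torch.ones(1), z, z, z)  # tensor([0.5]) = (1-0)^2 / 2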
    def gaussian_likelihood(self, x, mean, var):
        # density of N(mean, var) evaluated at x
        term1 = 1.0 / torch.sqrt(2 * np.pi * var)
        term2 = torch.exp(-0.5 * ((x - mean)**2 / var))
        likelihood = term1 * term2
        return likelihood

    def gaussian_log_likelihood(self, x, mean, var):
        # log density of N(mean, var) evaluated at x
        term1 = -0.5 * ((x - mean) / torch.sqrt(var))**2
        term2 = -0.5 * torch.log(2 * torch.tensor(np.pi) * var)
        log_likelihood = term1 + term2
        return log_likelihood
    def _extract_into_tensor(self, arr, timesteps, broadcast_shape):
        """
        Extract values from a 1-D numpy array for a batch of indices.

        :param arr: the 1-D numpy array.
        :param timesteps: a tensor of indices into the array to extract.
        :param broadcast_shape: a larger shape of K dimensions with the batch
                                dimension equal to the length of timesteps.
        :return: a tensor of shape 'broadcast_shape' where the shape has K dims.
        """
        ten = torch.tensor(arr[timesteps])
        return ten.expand(broadcast_shape).to(self.device)
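    # E.g. _extract_into_tensor(self.betas, 3, (B, L, C)) reads the scalar
    # betas[3] from the numpy schedule and broadcasts it to a [B, L, C] tensor
    # on self.device, so the schedule can enter the tensor arithmetic at step t.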
### >>>>>>>>>>>>> SUB NET
class SubNet1(nn.Module):
    def __init__(self, aux_past_ch, aux_fut_ch, learn_var: bool, output_channel: int, d_model: int, d_head: int, n_head: int, activation: str, dropout_rate: float) -> None:
        """ -> SUBNET of the DIFFUSION MODEL (DDPM)

        'y_noised' and the past target 'y_past' are first embedded with linear layers.
        The past context (embedded y_past, categorical past and, when available, the auxiliary numerical past)
        and the future context (embedded y_noised, categorical future and, when available, the auxiliary numerical future)
        are each merged by a small sequential network.

        An ATTENTION network then compares the future context with the past one to predict the noise,
        using { Q = fut_context, K = past_context, V = emb_y_past }, i.e. exploiting as many context variables as possible for better updates.

        The goal is to always guarantee a meaningful computation of 'eps', refining it when enough data is available.

        Args:
            aux_past_ch (int): number of auxiliary numerical variables about the past
            aux_fut_ch (int): number of auxiliary numerical variables about the future
            learn_var (bool): set whether the network also has to learn the optimal variance of each step
            output_channel (int): number of variables to be predicted
            d_model (int): hidden dimension of the model
            d_head (int): hidden dimension of the heads of the attention network
            n_head (int): number of heads of the attention network
            activation (str): activation function used in the sequential blocks
            dropout_rate (float):
        """
        super().__init__()
        self.aux_past_channels = aux_past_ch
        self.aux_fut_channels = aux_fut_ch

        self.learn_var = learn_var
        activation_fun = eval(activation)  # e.g. the string 'nn.ReLU' evaluates to the activation class

        self.y_noised_linear = nn.Linear(output_channel, d_model)
        self.y_past_linear = nn.Linear(output_channel, d_model)

        self.past_sequential = nn.Sequential(
            nn.Linear(d_model*3, d_model*2) if self.aux_past_channels > 0 else nn.Linear(d_model*2, d_model*2),
            activation_fun(),
            nn.Linear(d_model*2, d_model)
        )

        self.fut_sequential = nn.Sequential(
            nn.Linear(d_model*3, d_model*2) if self.aux_fut_channels > 0 else nn.Linear(d_model*2, d_model*2),
            activation_fun(),
            nn.Linear(d_model*2, d_model)
        )

        self.y_sequential = nn.Sequential(
            nn.Linear(d_model*2, d_model),
            activation_fun(),
            nn.Linear(d_model, d_model)
        )

        self.attention = sub_nn.InterpretableMultiHead(d_model, d_head, n_head)

        # if learn_var == True, we also predict an additional variable for the variance;
        # hidden_size is just an intermediate dimension for the linear layers
        hidden_size = int(d_model/3)
        self.eps_out_sequential = nn.Sequential(
            nn.Linear(d_model, hidden_size),
            activation_fun(),
            nn.Linear(hidden_size, output_channel)
        )

        self.var_out_sequential = nn.Sequential(
            nn.Linear(output_channel, hidden_size),
            nn.Linear(hidden_size, d_model),
            activation_fun(),
            nn.Linear(d_model, d_model),
            activation_fun(),
            nn.Linear(d_model, hidden_size),
            nn.Linear(hidden_size, output_channel)
        )


    def forward(self, y_noised: torch.Tensor, y_past: torch.Tensor,
                cat_past: torch.Tensor, cat_fut: torch.Tensor,
                num_past: Union[torch.Tensor, None] = None, num_fut: Union[torch.Tensor, None] = None):
        """DIFFUSION SUBNET

        Args:
            y_noised (torch.Tensor): [B, future_step, num_var]
            y_past (torch.Tensor): [B, past_step, num_var]
            cat_past (torch.Tensor, optional): [B, past_step, d_model]. Defaults to None.
            cat_fut (torch.Tensor, optional): [B, future_step, d_model]. Defaults to None.
            num_past (torch.Tensor, optional): [B, past_step, d_model]. Defaults to None.
            num_fut (torch.Tensor, optional): [B, future_step, d_model]. Defaults to None.

        Returns:
            torch.Tensor: predicted noise [B, future_step, num_var]. Depending on the 'learn_var' param set at initialization, the subnet also returns another tensor of the same size for the variance
        """
        emb_y_noised = self.y_noised_linear(y_noised.float())
        emb_y_past = self.y_past_linear(y_past)

        # LINEARS FOR THE PAST
        past = [emb_y_past, cat_past]
        if self.aux_past_channels > 0:
            past.append(num_past)
        past_seq_input = torch.cat(past, dim=2)  # type: ignore
        past_seq = self.past_sequential(past_seq_input)  # -> [B, past_step, d_model]

        # LINEARS FOR THE FUTURE
        fut = [emb_y_noised, cat_fut]
        if self.aux_fut_channels > 0:
            fut.append(num_fut)
        fut_seq_input = torch.cat(fut, dim=2)  # type: ignore
        fut_seq = self.fut_sequential(fut_seq_input)  # -> [B, future_step, d_model]
        # ATTENTION
        attention = self.attention(fut_seq, past_seq, emb_y_past)
        # OUTPUT
        eps_out = self.eps_out_sequential(attention)
        # if LEARN_VAR
        if self.learn_var:
            var_out = eps_out.detach()
            var_out = self.var_out_sequential(var_out)
            return eps_out, var_out

        return eps_out
class SubNet2(nn.Module):
    def __init__(self, aux_past_ch, aux_fut_ch, learn_var: bool, past_steps, future_steps, output_channel: int, d_model: int, activation: str, dropout_rate: float):
        super().__init__()
        self.aux_past_channels = aux_past_ch
        self.aux_fut_channels = aux_fut_ch
        self.learn_var = learn_var
        # in_size changes with the available numerical vars
        in_size = ( past_steps*(2 + bool(aux_past_ch)) + future_steps*(2 + bool(aux_fut_ch)) ) * d_model
        out_size = output_channel * future_steps

        activation_fun = eval(activation)

        self.y_noised_linear = nn.Linear(output_channel, d_model)
        self.y_past_linear = nn.Linear(output_channel, d_model)

        hidden_size = int((output_channel + d_model)/2)
        self.eps_out_sequential = nn.Sequential(
            nn.Linear(in_size, hidden_size),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, d_model),
            activation_fun(),
            nn.Dropout(dropout_rate),
            nn.Linear(d_model, hidden_size),
            activation_fun(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, out_size)
        )

        self.var_out_sequential = nn.Sequential(
            nn.Linear(in_size, hidden_size),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, hidden_size),
            activation_fun(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, out_size)
        )

    def forward(self, y_noised: torch.Tensor, y_past: torch.Tensor,
                cat_past: torch.Tensor, cat_fut: torch.Tensor,
                num_past: Union[torch.Tensor, None] = None, num_fut: Union[torch.Tensor, None] = None):

        B, fut_step, n_var = y_noised.shape
        emb_y_noised = self.y_noised_linear(y_noised.float()).view(B, -1)
        emb_y_past = self.y_past_linear(y_past).view(B, -1)

        # concat the autoregressive variables and the categorical ones, which are always available
        full_concat = torch.cat((emb_y_noised, emb_y_past, cat_past.view(B, -1), cat_fut.view(B, -1)), dim=1)
        # concat the numerical vars when available
        if num_past is not None:
            assert self.aux_past_channels > 0  # consistency check with the flag set in the subnet init
            full_concat = torch.cat((full_concat, num_past.view(B, -1)), dim=1)
        if num_fut is not None:
            assert self.aux_fut_channels > 0  # consistency check with the flag set in the subnet init
            full_concat = torch.cat((full_concat, num_fut.view(B, -1)), dim=1)

        eps_out = self.eps_out_sequential(full_concat).view(B, fut_step, n_var)
        if self.learn_var:
            var_out = self.var_out_sequential(full_concat.detach()).view(B, fut_step, n_var)
            return eps_out, var_out
        return eps_out
class SubNet3(nn.Module):
    def __init__(self, learn_var, flag_aux_num, num_var, d_model, pred_step, num_layers, d_head, n_head, dropout):
        super().__init__()
        self.learn_var = learn_var
        self.flag_aux_num = flag_aux_num

        # autoregressive part with an RNN
        self.y_d_model = nn.Linear(num_var, d_model)
        self.rnn = sub_nn.LSTM_Model(num_var, d_model, pred_step, num_layers, dropout)
        self.eps_pred_grn = sub_nn.GRN(d_model, dropout)

        # categorical
        self.cat_MHA = sub_nn.InterpretableMultiHead(d_model, d_head, n_head)
        self.cat_grn = sub_nn.GRN(d_model, dropout)
        self.cat_res_conn = sub_nn.ResidualConnection(d_model, dropout)

        # numerical
        if flag_aux_num:
            self.num_MHA = sub_nn.InterpretableMultiHead(d_model, d_head, n_head)
            self.num_grn = sub_nn.GRN(d_model, dropout)
            self.num_res_conn = sub_nn.ResidualConnection(d_model, dropout)

        # EPS PREDICTION
        self.eps_final_grn = sub_nn.GRN(d_model, dropout)
        self.eps_out_linear = nn.Linear(d_model, num_var)

        if learn_var:
            self.emb_eps_pred = nn.Linear(num_var, d_model)
            self.var_att = sub_nn.InterpretableMultiHead(d_model, d_head, n_head)
            self.var_grn = sub_nn.GRN(d_model, dropout)
            self.var_out = nn.Linear(d_model, num_var)

    def forward(self, y_noised: torch.Tensor, y_past: torch.Tensor,
                cat_past: torch.Tensor, cat_fut: torch.Tensor,
                num_past: Union[torch.Tensor, None] = None, num_fut: Union[torch.Tensor, None] = None):

        # autoregressive part
        emb_y_past = self.y_d_model(y_past)
        pred_y_fut = self.rnn(emb_y_past)
        # re-embedding of the future
        emb_pred_y_fut = self.y_d_model(pred_y_fut)
        emb_y_noised = self.y_d_model(y_noised.float())

        eps_pred = self.eps_pred_grn(emb_pred_y_fut - emb_y_noised, using_norm=False)

        # categorical contribution
        cat_att = self.cat_MHA(cat_fut, cat_past, emb_y_past)
        cat_att = self.cat_grn(cat_att, using_norm=False)
        eps_pred = self.cat_res_conn(cat_att, eps_pred, using_norm=False)

        # numerical contribution: it attends like the categorical one, { Q = num_fut, K = num_past, V = emb_y_past }
        if self.flag_aux_num:
            if num_past is None:
                num_past = torch.ones_like(cat_past)
            if num_fut is None:
                num_fut = torch.ones_like(cat_fut)
            num_att = self.num_MHA(num_fut, num_past, emb_y_past)
            num_att = self.num_grn(num_att, using_norm=False)
            eps_pred = self.num_res_conn(num_att, eps_pred, using_norm=False)

        eps_pred = self.eps_final_grn(eps_pred, using_norm=False)
        eps_pred = self.eps_out_linear(eps_pred)

        if self.learn_var:
            emb_eps_pred = self.emb_eps_pred(eps_pred.detach())
            emb_eps_pred = self.var_att(emb_y_noised.detach(), emb_pred_y_fut.detach(), emb_eps_pred)
            emb_var_pred = self.var_grn(emb_eps_pred, using_norm=False)
            var_pred = self.var_out(emb_var_pred)
            return eps_pred, var_pred
        return eps_pred
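# A minimal usage sketch (hedged: the constructor values below are illustrative,
# not recommended defaults, and `batch` must follow the dsipts batch layout with
# keys such as 'y', 'x_num_past' and 'idx_target'):
#
#     model = Diffusion(d_model=32, out_channels=1, past_steps=64, future_steps=16,
#                       past_channels=3, future_channels=2, embs=[7, 24],
#                       learn_var=True, cosine_alpha=True, diffusion_steps=50,
#                       beta=0.03, gamma=0.1, n_layers_RNN=2, d_head=8, n_head=4,
#                       dropout_rate=0.1, activation='nn.ReLU', subnet=3,
#                       perc_subnet_learning_for_step=0.1)
#     loss = model(batch)             # training pass: summed subnet losses
#     y_hat = model.inference(batch)  # [batch, future_steps, out_channels, 1]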