dsipts-1.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dsipts might be problematic.
- dsipts/__init__.py +48 -0
- dsipts/data_management/__init__.py +0 -0
- dsipts/data_management/monash.py +338 -0
- dsipts/data_management/public_datasets.py +162 -0
- dsipts/data_structure/__init__.py +0 -0
- dsipts/data_structure/data_structure.py +1167 -0
- dsipts/data_structure/modifiers.py +213 -0
- dsipts/data_structure/utils.py +173 -0
- dsipts/models/Autoformer.py +199 -0
- dsipts/models/CrossFormer.py +152 -0
- dsipts/models/D3VAE.py +196 -0
- dsipts/models/Diffusion.py +818 -0
- dsipts/models/DilatedConv.py +342 -0
- dsipts/models/DilatedConvED.py +310 -0
- dsipts/models/Duet.py +197 -0
- dsipts/models/ITransformer.py +167 -0
- dsipts/models/Informer.py +180 -0
- dsipts/models/LinearTS.py +222 -0
- dsipts/models/PatchTST.py +181 -0
- dsipts/models/Persistent.py +44 -0
- dsipts/models/RNN.py +213 -0
- dsipts/models/Samformer.py +139 -0
- dsipts/models/TFT.py +269 -0
- dsipts/models/TIDE.py +296 -0
- dsipts/models/TTM.py +252 -0
- dsipts/models/TimeXER.py +184 -0
- dsipts/models/VQVAEA.py +299 -0
- dsipts/models/VVA.py +247 -0
- dsipts/models/__init__.py +0 -0
- dsipts/models/autoformer/__init__.py +0 -0
- dsipts/models/autoformer/layers.py +352 -0
- dsipts/models/base.py +439 -0
- dsipts/models/base_v2.py +444 -0
- dsipts/models/crossformer/__init__.py +0 -0
- dsipts/models/crossformer/attn.py +118 -0
- dsipts/models/crossformer/cross_decoder.py +77 -0
- dsipts/models/crossformer/cross_embed.py +18 -0
- dsipts/models/crossformer/cross_encoder.py +99 -0
- dsipts/models/d3vae/__init__.py +0 -0
- dsipts/models/d3vae/diffusion_process.py +169 -0
- dsipts/models/d3vae/embedding.py +108 -0
- dsipts/models/d3vae/encoder.py +326 -0
- dsipts/models/d3vae/model.py +211 -0
- dsipts/models/d3vae/neural_operations.py +314 -0
- dsipts/models/d3vae/resnet.py +153 -0
- dsipts/models/d3vae/utils.py +630 -0
- dsipts/models/duet/__init__.py +0 -0
- dsipts/models/duet/layers.py +438 -0
- dsipts/models/duet/masked.py +202 -0
- dsipts/models/informer/__init__.py +0 -0
- dsipts/models/informer/attn.py +185 -0
- dsipts/models/informer/decoder.py +50 -0
- dsipts/models/informer/embed.py +125 -0
- dsipts/models/informer/encoder.py +100 -0
- dsipts/models/itransformer/Embed.py +142 -0
- dsipts/models/itransformer/SelfAttention_Family.py +355 -0
- dsipts/models/itransformer/Transformer_EncDec.py +134 -0
- dsipts/models/itransformer/__init__.py +0 -0
- dsipts/models/patchtst/__init__.py +0 -0
- dsipts/models/patchtst/layers.py +569 -0
- dsipts/models/samformer/__init__.py +0 -0
- dsipts/models/samformer/utils.py +154 -0
- dsipts/models/tft/__init__.py +0 -0
- dsipts/models/tft/sub_nn.py +234 -0
- dsipts/models/timexer/Layers.py +127 -0
- dsipts/models/timexer/__init__.py +0 -0
- dsipts/models/ttm/__init__.py +0 -0
- dsipts/models/ttm/configuration_tinytimemixer.py +307 -0
- dsipts/models/ttm/consts.py +16 -0
- dsipts/models/ttm/modeling_tinytimemixer.py +2099 -0
- dsipts/models/ttm/utils.py +438 -0
- dsipts/models/utils.py +624 -0
- dsipts/models/vva/__init__.py +0 -0
- dsipts/models/vva/minigpt.py +83 -0
- dsipts/models/vva/vqvae.py +459 -0
- dsipts/models/xlstm/__init__.py +0 -0
- dsipts/models/xlstm/xLSTM.py +255 -0
- dsipts-1.1.5.dist-info/METADATA +31 -0
- dsipts-1.1.5.dist-info/RECORD +81 -0
- dsipts-1.1.5.dist-info/WHEEL +5 -0
- dsipts-1.1.5.dist-info/top_level.txt +1 -0
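For reference, a wheel with this layout is normally installed straight from the registry (assuming the package is published on PyPI under the name dsipts):

pip install dsipts==1.1.5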
dsipts/models/VVA.py
ADDED
@@ -0,0 +1,247 @@

from torch import nn
import torch

try:
    import lightning.pytorch as pl
    from .base_v2 import Base
    OLD_PL = False
except:
    import pytorch_lightning as pl
    OLD_PL = True
    from .base import Base
from typing import List, Union
from .vva.minigpt import Block
import math
from torch.nn import functional as F
from ..data_structure.utils import beauty_string
from .utils import get_scope

torch.autograd.set_detect_anomaly(True)



class VVA(Base):
    handle_multivariate = False
    handle_future_covariates = False
    handle_categorical_variables = False
    handle_quantile_loss = False
    description = get_scope(handle_multivariate,handle_future_covariates,handle_categorical_variables,handle_quantile_loss)


    def __init__(self,
                 past_steps:int,
                 future_steps:int,
                 past_channels:int,
                 future_channels:int,
                 embs:List[int],
                 d_model:int,
                 max_voc_size:int,
                 token_split: int,
                 num_layers:int,
                 dropout_rate:float,
                 n_heads:int,
                 out_channels:int,
                 persistence_weight:float=0.0,
                 loss_type: str='l1',
                 quantiles:List[int]=[],
                 optim:Union[str,None]=None,
                 optim_config:dict=None,
                 scheduler_config:dict=None,
                 **kwargs)->None:
        """ Custom encoder-decoder

        Args:
            past_steps (int): number of past datapoints used
            future_steps (int): number of future lag to predict
            past_channels (int): number of numeric past variables, must be >0
            future_channels (int): number of future numeric variables
            embs (List): list of the initial dimension of the categorical variables
            cat_emb_dim (int): final dimension of each categorical variable
            hidden_RNN (int): hidden size of the RNN block
            num_layers_RNN (int): number of RNN layers
            kind (str): one among GRU or LSTM
            kernel_size (int): kernel size in the encoder convolutional block
            sum_emb (bool): if true the contribution of each embedding will be summed-up otherwise stacked
            out_channels (int): number of output channels
            activation (str, optional): activation fuction function pytorch. Default torch.nn.ReLU
            remove_last (bool, optional): if True the model learns the difference respect to the last seen point
            persistence_weight (float): weight controlling the divergence from persistence model. Default 0
            loss_type (str, optional): this model uses custom losses or l1 or mse. Custom losses can be linear_penalization or exponential_penalization. Default l1,
            quantiles (List[int], optional): we can use quantile loss il len(quantiles) = 0 (usually 0.1,0.5, 0.9) or L1loss in case len(quantiles)==0. Defaults to [].
            dropout_rate (float, optional): dropout rate in Dropout layers
            use_bn (bool, optional): if true BN layers will be added and dropouts will be removed
            use_glu (bool,optional): use GLU for feature selection. Defaults to True.
            glu_percentage (float, optiona): percentage of features to use. Defaults to 1.0.
            n_classes (int): number of classes (0 in regression)
            optim (str, optional): if not None it expects a pytorch optim method. Defaults to None that is mapped to Adam.
            optim_config (dict, optional): configuration for Adam optimizer. Defaults to None.
            scheduler_config (dict, optional): configuration for stepLR scheduler. Defaults to None.

        """


        super().__init__(**kwargs)
        self.block_size = past_steps//token_split + future_steps//token_split -1
        self.save_hyperparameters(logger=False)
        self.sentence_length = future_steps//token_split

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(max_voc_size, d_model),
            wpe = nn.Embedding(self.block_size, d_model),
            drop = nn.Dropout(dropout_rate),
            h = nn.ModuleList([Block( d_model,dropout_rate,n_heads,dropout_rate,self.block_size) for _ in range(num_layers)]), ##care can be different dropouts
            ln_f = nn.LayerNorm(d_model),
        ))
        self.lm_head = nn.Linear(d_model, max_voc_size, bias=False)


        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layers))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        beauty_string("number of parameters: %.2fM" % (n_params/1e6,),'info',self.verbose)







        self.use_quantiles = True
        self.is_classification = True
        self.scheduler_config = scheduler_config
        self.optim_config = optim_config
        self.optim = self.scheduler_config = self.configure_optimizers()


    def configure_optimizers(self):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, beauty_string(f"parameters {inter_params} made it into both decay/no_decay sets!",'section' ,True)
        assert len(param_dict.keys() - union_params) == 0, beauty_string(f"parameters {param_dict.keys() - union_params} were not separated into either decay/no_decay set!",'section',True)

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": self.optim_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=self.optim_config.lr, betas=self.optim_config.betas)
        return optimizer

    def compute_loss(self,batch,y_hat):
        """
        custom loss calculation

        :meta private:
        """
        return F.cross_entropy(y_hat.view(-1, y_hat.size(-1)), batch['y_emb'].view(-1), ignore_index=-1)


    def forward(self, batch):
        b, t = batch['x_emb'].size()
        assert t <= self.block_size, beauty_string("Cannot forward sequence of length {t}, block size is only {self.block_size}",'section',True)
        pos = torch.arange(0, t, dtype=torch.long, device=self.device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(batch['x_emb']) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        return logits


    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None,num_samples=100):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        if do_sample:
            idx = idx.repeat(num_samples,1,1)
            for _ in range(max_new_tokens):
                tmp = []
                for i in range(num_samples):
                    idx_cond = idx[i,:,:] if idx.size(2) <= self.block_size else idx[i,:, -self.block_size:]
                    logits = self({'x_emb':idx_cond})
                    logits = logits[:, -1, :] / temperature
                    if top_k is not None:
                        v, _ = torch.topk(logits, top_k)
                        logits[logits < v[:, [-1]]] = -float('Inf')
                    probs = F.softmax(logits, dim=-1)
                    idx_next = torch.multinomial(probs, num_samples=1, replacement=True)
                    tmp.append(idx_next)
                tmp = torch.cat(tmp,dim=1).T.unsqueeze(2)
                idx = torch.cat((idx, tmp), dim=2)
            return idx
        else:
            for _ in range(max_new_tokens):

                # if the sequence context is growing too long we must crop it at block_size
                idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
                # forward the model to get the logits for the index in the sequence
                logits = self({'x_emb':idx_cond})
                # pluck the logits at the final step and scale by desired temperature
                logits = logits[:, -1, :] / temperature
                # optionally crop the logits to only the top k options
                if top_k is not None:
                    v, _ = torch.topk(logits, top_k)
                    logits[logits < v[:, [-1]]] = -float('Inf')
                # apply softmax to convert logits to (normalized) probabilities
                probs = F.softmax(logits, dim=-1)
                # either sample from the distribution or take the most likely element
                _, idx_next = torch.topk(probs, k=1, dim=-1)
                # append sampled index to the running sequence and continue
                idx = torch.cat((idx, idx_next), dim=1)

            return idx.unsqueeze(0)

    def inference(self, batch:dict)->torch.tensor:
        x = batch['x_emb'].to(self.device)

        # isolate the input pattern alone
        inp = x[:, :self.sentence_length]

        # let the model sample the rest of the sequence
        cat = self.generate(inp, self.sentence_length, do_sample=True,num_samples=3) # using greedy argmax, not samplingv ##todo here add sampling
        sol_candidate = cat[:,:, self.sentence_length:]


        return sol_candidate.permute(1,2,0)
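The configure_optimizers method above follows the familiar GPT-style pattern: parameters are split into a weight-decay bucket (Linear weights) and a no-decay bucket (biases, LayerNorm and Embedding weights) before building a single AdamW optimizer. The stand-alone sketch below reproduces that bucketing on a toy module so the grouping logic can be inspected outside the package; the module, learning rate, betas and weight-decay value are illustrative assumptions, not values taken from dsipts.

# Illustrative sketch (not part of dsipts): the same decay/no_decay bucketing
# that VVA.configure_optimizers applies, shown on a small stand-alone module.
import torch
import torch.nn as nn

model = nn.Sequential(
    nn.Embedding(10, 8),  # embedding weight -> no weight decay
    nn.Linear(8, 8),      # linear weight -> weight decay, bias -> no decay
    nn.LayerNorm(8),      # layernorm weight/bias -> no decay
)

decay, no_decay = set(), set()
whitelist = (nn.Linear,)
blacklist = (nn.LayerNorm, nn.Embedding)
for mn, m in model.named_modules():
    for pn, _ in m.named_parameters(recurse=False):
        fpn = f"{mn}.{pn}" if mn else pn
        if pn.endswith("bias") or isinstance(m, blacklist):
            no_decay.add(fpn)
        elif pn.endswith("weight") and isinstance(m, whitelist):
            decay.add(fpn)

params = dict(model.named_parameters())
optimizer = torch.optim.AdamW(
    [
        {"params": [params[p] for p in sorted(decay)], "weight_decay": 0.1},
        {"params": [params[p] for p in sorted(no_decay)], "weight_decay": 0.0},
    ],
    lr=1e-3,
    betas=(0.9, 0.95),  # arbitrary example hyperparameters
)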
dsipts/models/__init__.py
File without changes
dsipts/models/autoformer/__init__.py
File without changes
dsipts/models/autoformer/layers.py
ADDED
@@ -0,0 +1,352 @@

import torch
import torch.nn as nn
import math


class AutoCorrelation(nn.Module):
    """
    AutoCorrelation Mechanism with the following two phases:
    (1) period-based dependencies discovery
    (2) time delay aggregation
    This block can replace the self-attention family mechanism seamlessly.
    """
    def __init__(self, mask_flag=True, factor=1, scale=None, attention_dropout=0.1, output_attention=False):
        super(AutoCorrelation, self).__init__()
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def time_delay_agg_training(self, values, corr):
        """
        SpeedUp version of Autocorrelation (a batch-normalization style design)
        This is for the training phase.
        """
        head = values.shape[1]
        channel = values.shape[2]
        length = values.shape[3]
        # find top k
        top_k = int(self.factor * math.log(length))
        mean_value = torch.mean(torch.mean(corr, dim=1), dim=1)
        index = torch.topk(torch.mean(mean_value, dim=0), top_k, dim=-1)[1]
        weights = torch.stack([mean_value[:, index[i]] for i in range(top_k)], dim=-1)
        # update corr
        tmp_corr = torch.softmax(weights, dim=-1)
        # aggregation
        tmp_values = values
        delays_agg = torch.zeros_like(values).float()
        for i in range(top_k):
            pattern = torch.roll(tmp_values, -int(index[i]), -1)
            delays_agg = delays_agg + pattern * \
                (tmp_corr[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length))
        return delays_agg

    def time_delay_agg_inference(self, values, corr):
        """
        SpeedUp version of Autocorrelation (a batch-normalization style design)
        This is for the inference phase.
        """
        batch = values.shape[0]
        head = values.shape[1]
        channel = values.shape[2]
        length = values.shape[3]
        # index init
        init_index = torch.arange(length).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(batch, head, channel, 1).to(self.device)
        # find top k
        top_k = int(self.factor * math.log(length))
        mean_value = torch.mean(torch.mean(corr, dim=1), dim=1)
        weights = torch.topk(mean_value, top_k, dim=-1)[0]
        delay = torch.topk(mean_value, top_k, dim=-1)[1]
        # update corr
        tmp_corr = torch.softmax(weights, dim=-1)
        # aggregation
        tmp_values = values.repeat(1, 1, 1, 2)
        delays_agg = torch.zeros_like(values).float()
        for i in range(top_k):
            tmp_delay = init_index + delay[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length)
            pattern = torch.gather(tmp_values, dim=-1, index=tmp_delay)
            delays_agg = delays_agg + pattern * \
                (tmp_corr[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length))
        return delays_agg

    def time_delay_agg_full(self, values, corr):
        """
        Standard version of Autocorrelation
        """
        batch = values.shape[0]
        head = values.shape[1]
        channel = values.shape[2]
        length = values.shape[3]
        # index init
        init_index = torch.arange(length).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(batch, head, channel, 1).to(self.device)
        # find top k
        top_k = int(self.factor * math.log(length))
        weights = torch.topk(corr, top_k, dim=-1)[0]
        delay = torch.topk(corr, top_k, dim=-1)[1]
        # update corr
        tmp_corr = torch.softmax(weights, dim=-1)
        # aggregation
        tmp_values = values.repeat(1, 1, 1, 2)
        delays_agg = torch.zeros_like(values).float()
        for i in range(top_k):
            tmp_delay = init_index + delay[..., i].unsqueeze(-1)
            pattern = torch.gather(tmp_values, dim=-1, index=tmp_delay)
            delays_agg = delays_agg + pattern * (tmp_corr[..., i].unsqueeze(-1))
        return delays_agg

    def forward(self, queries, keys, values, attn_mask):
        B, L, H, E = queries.shape
        _, S, _, D = values.shape
        if L > S:
            zeros = torch.zeros_like(queries[:, :(L - S), :]).float()
            values = torch.cat([values, zeros], dim=1)
            keys = torch.cat([keys, zeros], dim=1)
        else:
            values = values[:, :L, :, :]
            keys = keys[:, :L, :, :]

        # period-based dependencies
        q_fft = torch.fft.rfft(queries.permute(0, 2, 3, 1).contiguous(), dim=-1)
        k_fft = torch.fft.rfft(keys.permute(0, 2, 3, 1).contiguous(), dim=-1)
        res = q_fft * torch.conj(k_fft)
        corr = torch.fft.irfft(res, dim=-1)

        # time delay agg
        if self.training:
            V = self.time_delay_agg_training(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2)
        else:
            V = self.time_delay_agg_inference(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2)

        if self.output_attention:
            return (V.contiguous(), corr.permute(0, 3, 1, 2))
        else:
            return (V.contiguous(), None)


class AutoCorrelationLayer(nn.Module):
    def __init__(self, correlation, d_model, n_heads, d_keys=None,
                 d_values=None):
        super(AutoCorrelationLayer, self).__init__()

        d_keys = d_keys or (d_model // n_heads)
        d_values = d_values or (d_model // n_heads)

        self.inner_correlation = correlation
        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
        self.value_projection = nn.Linear(d_model, d_values * n_heads)
        self.out_projection = nn.Linear(d_values * n_heads, d_model)
        self.n_heads = n_heads

    def forward(self, queries, keys, values, attn_mask):
        self.inner_correlation.device = queries.device
        B, L, _ = queries.shape
        _, S, _ = keys.shape
        H = self.n_heads

        queries = self.query_projection(queries).view(B, L, H, -1)
        keys = self.key_projection(keys).view(B, S, H, -1)
        values = self.value_projection(values).view(B, S, H, -1)

        out, attn = self.inner_correlation(
            queries,
            keys,
            values,
            attn_mask
        )
        out = out.view(B, L, -1)

        return self.out_projection(out), attn




class my_Layernorm(nn.Module):
    """
    Special designed layernorm for the seasonal part
    """
    def __init__(self, channels):
        super(my_Layernorm, self).__init__()
        self.layernorm = nn.LayerNorm(channels)

    def forward(self, x):
        x_hat = self.layernorm(x)
        bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1)
        return x_hat - bias


class moving_avg(nn.Module):
    """
    Moving average block to highlight the trend of time series
    """
    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        # padding on the both ends of time series
        front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        x = torch.cat([front, x, end], dim=1)
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x


class series_decomp(nn.Module):
    """
    Series decomposition block
    """
    def __init__(self, kernel_size):
        super(series_decomp, self).__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        moving_mean = self.moving_avg(x)
        res = x - moving_mean
        return res, moving_mean


class EncoderLayer(nn.Module):
    """
    Autoformer encoder layer with the progressive decomposition architecture
    """
    def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, activation="relu"):
        super(EncoderLayer, self).__init__()
        d_ff = d_ff or 4 * d_model
        self.attention = attention
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False)
        self.decomp1 = series_decomp(moving_avg)
        self.decomp2 = series_decomp(moving_avg)
        self.dropout = nn.Dropout(dropout)
        self.activation =activation()

    def forward(self, x, attn_mask=None):
        new_x, attn = self.attention(
            x, x, x,
            attn_mask=attn_mask
        )
        x = x + self.dropout(new_x)
        x, _ = self.decomp1(x)
        y = x
        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
        y = self.dropout(self.conv2(y).transpose(-1, 1))
        res, _ = self.decomp2(x + y)
        return res, attn


class Encoder(nn.Module):
    """
    Autoformer encoder
    """
    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
        super(Encoder, self).__init__()
        self.attn_layers = nn.ModuleList(attn_layers)
        self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None
        self.norm = norm_layer

    def forward(self, x, attn_mask=None):
        attns = []
        if self.conv_layers is not None:
            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
                x, attn = attn_layer(x, attn_mask=attn_mask)
                x = conv_layer(x)
                attns.append(attn)
            x, attn = self.attn_layers[-1](x)
            attns.append(attn)
        else:
            for attn_layer in self.attn_layers:
                x, attn = attn_layer(x, attn_mask=attn_mask)
                attns.append(attn)

        if self.norm is not None:
            x = self.norm(x)

        return x, attns


class DecoderLayer(nn.Module):
    """
    Autoformer decoder layer with the progressive decomposition architecture
    """
    def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None,
                 moving_avg=25, dropout=0.1, activation="relu"):
        super(DecoderLayer, self).__init__()
        d_ff = d_ff or 4 * d_model
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False)
        self.decomp1 = series_decomp(moving_avg)
        self.decomp2 = series_decomp(moving_avg)
        self.decomp3 = series_decomp(moving_avg)
        self.dropout = nn.Dropout(dropout)
        self.projection = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=3, stride=1, padding=1,
                                    padding_mode='circular', bias=False)
        self.activation = activation()

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        self.self_attention.device = x.device
        x = x + self.dropout(self.self_attention(
            x, x, x,
            attn_mask=x_mask
        )[0])
        x, trend1 = self.decomp1(x)
        x = x + self.dropout(self.cross_attention(
            x, cross, cross,
            attn_mask=cross_mask
        )[0])
        x, trend2 = self.decomp2(x)
        y = x
        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
        y = self.dropout(self.conv2(y).transpose(-1, 1))
        x, trend3 = self.decomp3(x + y)

        residual_trend = trend1 + trend2 + trend3
        residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose(1, 2)
        return x, residual_trend


class Decoder(nn.Module):
    """
    Autoformer encoder
    """
    def __init__(self, layers, norm_layer=None, projection=None):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList(layers)
        self.norm = norm_layer
        self.projection = projection

    def forward(self, x, cross, x_mask=None, cross_mask=None, trend=None):
        for layer in self.layers:
            x, residual_trend = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)
            trend = trend + residual_trend

        if self.norm is not None:
            x = self.norm(x)

        if self.projection is not None:
            x = self.projection(x)
        return x, trend

class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.require_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return self.pe[:, :x.size(1)]