secryst 0.1.0
- checksums.yaml +7 -0
- data/README.adoc +103 -0
- data/lib/secryst-trainer.rb +8 -0
- data/lib/secryst.rb +11 -0
- data/lib/secryst/clip_grad_norm.rb +25 -0
- data/lib/secryst/multi_head_attention_forward.rb +288 -0
- data/lib/secryst/multihead_attention.rb +156 -0
- data/lib/secryst/trainer.rb +235 -0
- data/lib/secryst/transformer.rb +382 -0
- data/lib/secryst/translator.rb +51 -0
- data/lib/secryst/version.rb +3 -0
- data/lib/secryst/vocab.rb +88 -0
- metadata +95 -0
data/lib/secryst/multihead_attention.rb
@@ -0,0 +1,156 @@
# ported from https://github.com/pytorch/pytorch/blob/4ae832e1060c72cb89de1d9693629783dbe0c9a6/torch/csrc/api/include/torch/nn/functional/activation.h

require_relative 'multi_head_attention_forward'
module Secryst
  class MultiheadAttention < Torch::NN::Module
    # Allows the model to jointly attend to information
    # from different representation subspaces.
    # See reference: Attention Is All You Need
    # .. math::
    #   \text{MultiHead}(Q, K, V) = \text{Concat}(head_1, \dots, head_h)W^O
    #   \text{where } head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
    # Args:
    #   embed_dim: total dimension of the model.
    #   num_heads: parallel attention heads.
    #   dropout: a Dropout layer on attn_output_weights. Default: 0.0.
    #   bias: add bias as module parameter. Default: true.
    #   add_bias_kv: add bias to the key and value sequences at dim=0.
    #   add_zero_attn: add a new batch of zeros to the key and
    #     value sequences at dim=1.
    #   kdim: total number of features in key. Default: nil.
    #   vdim: total number of features in value. Default: nil.
    #   Note: if kdim and vdim are nil, they will be set to embed_dim such that
    #   query, key, and value have the same number of features.
    # Examples::
    #   >>> multihead_attn = MultiheadAttention.new(embed_dim, num_heads)
    #   >>> attn_output, attn_output_weights = multihead_attn.call(query, key, value)
    # bias_k: Optional[Torch::Tensor]
    # bias_v: Optional[Torch::Tensor]

    def initialize(embed_dim, num_heads, dropout: 0.0, bias: true, add_bias_kv: false, add_zero_attn: false, kdim: nil, vdim: nil)
      super()
      @embed_dim = embed_dim
      @kdim = kdim || embed_dim
      @vdim = vdim || embed_dim
      @_qkv_same_embed_dim = @kdim == @embed_dim && @vdim == @embed_dim

      @num_heads = num_heads
      @dropout = dropout
      @head_dim = embed_dim / num_heads
      raise ArgumentError, "embed_dim must be divisible by num_heads" if @head_dim * num_heads != @embed_dim

      if !@_qkv_same_embed_dim
        @q_proj_weight = Torch::NN::Parameter.new(Torch::Tensor.new(embed_dim, embed_dim))
        @k_proj_weight = Torch::NN::Parameter.new(Torch::Tensor.new(embed_dim, @kdim))
        @v_proj_weight = Torch::NN::Parameter.new(Torch::Tensor.new(embed_dim, @vdim))
        register_parameter('in_proj_weight', nil)
      else
        @in_proj_weight = Torch::NN::Parameter.new(Torch.empty(3 * embed_dim, embed_dim))
        register_parameter('q_proj_weight', nil)
        register_parameter('k_proj_weight', nil)
        register_parameter('v_proj_weight', nil)
      end

      if bias
        @in_proj_bias = Torch::NN::Parameter.new(Torch.empty(3 * embed_dim))
      else
        register_parameter('in_proj_bias', nil)
      end
      @out_proj = Torch::NN::Linear.new(embed_dim, embed_dim)

      if add_bias_kv
        @bias_k = Torch::NN::Parameter.new(Torch.empty(1, 1, embed_dim))
        @bias_v = Torch::NN::Parameter.new(Torch.empty(1, 1, embed_dim))
      else
        @bias_k = @bias_v = nil
      end

      @add_zero_attn = add_zero_attn

      _reset_parameters
    end

    def _reset_parameters
      if @_qkv_same_embed_dim
        Torch::NN::Init.xavier_uniform!(@in_proj_weight)
      else
        Torch::NN::Init.xavier_uniform!(@q_proj_weight)
        Torch::NN::Init.xavier_uniform!(@k_proj_weight)
        Torch::NN::Init.xavier_uniform!(@v_proj_weight)
      end

      if @in_proj_bias
        Torch::NN::Init.constant!(@in_proj_bias, 0.0)
        Torch::NN::Init.constant!(@out_proj.bias, 0.0)
      end

      if @bias_k
        Torch::NN::Init.xavier_normal!(@bias_k)
      end

      if @bias_v
        Torch::NN::Init.xavier_normal!(@bias_v)
      end
    end

    # Args:
    #   query, key, value: map a query and a set of key-value pairs to an output.
    #     See "Attention Is All You Need" for more details.
    #   key_padding_mask: if provided, specified padding elements in the key will
    #     be ignored by the attention. When given a binary mask and a value is true,
    #     the corresponding value on the attention layer will be ignored. When given
    #     a byte mask and a value is non-zero, the corresponding value on the attention
    #     layer will be ignored.
    #   need_weights: output attn_output_weights.
    #   attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcast for all
    #     the batches, while a 3D mask allows a different mask to be specified for the entries of each batch.
    # Shape:
    #   - Inputs:
    #   - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
    #     the embedding dimension.
    #   - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
    #     the embedding dimension.
    #   - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
    #     the embedding dimension.
    #   - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
    #     If a ByteTensor is provided, the non-zero positions will be ignored while the positions
    #     with the value of zero will be unchanged. If a BoolTensor is provided, the positions with the
    #     value of ``true`` will be ignored while the positions with the value of ``false`` will be unchanged.
    #   - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
    #     3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
    #     S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
    #     positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
    #     while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``true``
    #     are not allowed to attend while ``false`` values will be unchanged. If a FloatTensor
    #     is provided, it will be added to the attention weight.
    #   - Outputs:
    #   - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
    #     E is the embedding dimension.
    #   - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
    #     L is the target sequence length, S is the source sequence length.
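    # As an illustration (not part of the original file), the two masks above can
    # be built the same way this gem's Trainer builds them; pad_index, src, l and s
    # below are assumed names:
    #   >>> key_padding_mask = src.t.eq(pad_index)  # (N, S) bool, true = ignore
    #   >>> attn_mask = Torch.triu(Torch.ones(l, s)).eq(0).transpose(0, 1)  # causal mask
    #   >>> attn_output, attn_output_weights = multihead_attn.call(query, key, value,
    #   >>>   key_padding_mask: key_padding_mask, attn_mask: attn_mask)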
    def forward(query, key, value, key_padding_mask: nil,
                need_weights: true, attn_mask: nil)
      if !@_qkv_same_embed_dim
        return Secryst::MultiHeadAttentionForward.multi_head_attention_forward(
          query, key, value, @embed_dim, @num_heads,
          @in_proj_weight, @in_proj_bias,
          @bias_k, @bias_v, @add_zero_attn,
          @dropout, @out_proj.weight, @out_proj.bias,
          training: @training,
          key_padding_mask: key_padding_mask, need_weights: need_weights,
          attn_mask: attn_mask, use_separate_proj_weight: true,
          q_proj_weight: @q_proj_weight, k_proj_weight: @k_proj_weight,
          v_proj_weight: @v_proj_weight)
      else
        return Secryst::MultiHeadAttentionForward.multi_head_attention_forward(
          query, key, value, @embed_dim, @num_heads,
          @in_proj_weight, @in_proj_bias,
          @bias_k, @bias_v, @add_zero_attn,
          @dropout, @out_proj.weight, @out_proj.bias,
          training: @training,
          key_padding_mask: key_padding_mask, need_weights: need_weights,
          attn_mask: attn_mask)
      end
    end
  end
end

data/lib/secryst/trainer.rb
@@ -0,0 +1,235 @@
module Secryst
  class Trainer

    def initialize(
      model:,
      batch_size:,
      lr:,
      data_input:,
      data_target:,
      hyperparameters:,
      max_epochs: nil,
      log_interval: 1,
      checkpoint_every:,
      checkpoint_dir:,
      scheduler_step_size:,
      gamma:
    )
      @data_input = File.readlines(data_input, chomp: true)
      @data_target = File.readlines(data_target, chomp: true)

      @device = "cpu"
      @lr = lr
      @scheduler_step_size = scheduler_step_size
      @gamma = gamma
      @batch_size = batch_size
      @model_name = model
      @max_epochs = max_epochs
      @log_interval = log_interval
      @checkpoint_every = checkpoint_every
      @checkpoint_dir = checkpoint_dir
      FileUtils.mkdir_p(@checkpoint_dir)
      generate_vocabs_and_data
      save_vocabs

      case model
      when 'transformer'
        @model = Secryst::Transformer.new(hyperparameters.merge({
          input_vocab_size: @input_vocab.length,
          target_vocab_size: @target_vocab.length,
        }))
      else
        raise ArgumentError, 'Only transformer model is currently supported'
      end
    end
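
    # A hypothetical invocation sketch (file paths and hyperparameter values here
    # are illustrative assumptions, not taken from this gem):
    #   >>> trainer = Secryst::Trainer.new(
    #   >>>   model: 'transformer',
    #   >>>   batch_size: 32, lr: 0.1,
    #   >>>   data_input: 'input.txt', data_target: 'target.txt',
    #   >>>   hyperparameters: { d_model: 64, nhead: 8 },
    #   >>>   max_epochs: 10, checkpoint_every: 5, checkpoint_dir: 'checkpoints',
    #   >>>   scheduler_step_size: 5, gamma: 0.9
    #   >>> )
    #   >>> trainer.train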

    def train
      best_model = nil
      best_val_loss = 1.0/0.0 # infinity

      return unless @model_name == 'transformer'

      criterion = Torch::NN::CrossEntropyLoss.new(ignore_index: index_of('<pad>')).to(@device)
      optimizer = Torch::Optim::SGD.new(@model.parameters, lr: @lr)
      scheduler = Torch::Optim::LRScheduler::StepLR.new(optimizer, step_size: @scheduler_step_size, gamma: @gamma)

      total_loss = 0.0
      start_time = Time.now
      ntokens = @target_vocab.length
      epoch = 0

      loop do
        epoch_start_time = Time.now
        @model.train
        @train_data.each.with_index do |batch, i|
          inputs, targets, decoder_inputs, src_mask, tgt_mask, memory_mask = batch
          inputs = Torch.tensor(inputs).t
          decoder_inputs = Torch.tensor(decoder_inputs).t
          targets = Torch.tensor(targets).t
          src_key_padding_mask = inputs.t.eq(1)
          tgt_key_padding_mask = decoder_inputs.t.eq(1)

          optimizer.zero_grad
          opts = {
            # src_mask: src_mask,
            tgt_mask: tgt_mask,
            # memory_mask: memory_mask,
            src_key_padding_mask: src_key_padding_mask,
            tgt_key_padding_mask: tgt_key_padding_mask,
            memory_key_padding_mask: src_key_padding_mask,
          }
          output = @model.call(inputs, decoder_inputs, **opts)
          loss = criterion.call(output.transpose(0,1).reshape(-1, ntokens), targets.t.view(-1))
          loss.backward
          ClipGradNorm.clip_grad_norm(@model.parameters, max_norm: 0.5)
          optimizer.step

          # puts "i[#{i}] loss: #{loss}"
          total_loss += loss.item()
          if ( (i + 1) % @log_interval == 0 )
            cur_loss = total_loss / @log_interval
            elapsed = Time.now - start_time
            puts "| epoch #{epoch} | #{i + 1}/#{@train_data.length} batches | "\
              "lr #{scheduler.get_lr()[0].round(4)} | ms/batch #{(1000*elapsed.to_f / @log_interval).round} | "\
              "loss #{cur_loss.round(5)} | ppl #{Math.exp(cur_loss).round(5)}"
            total_loss = 0
            start_time = Time.now
          end
        end

        if epoch > 0 && epoch % @checkpoint_every == 0
          puts ">> Saving checkpoint '#{@checkpoint_dir}/checkpoint-#{epoch}.pth'"
          Torch.save(@model.state_dict, "#{@checkpoint_dir}/checkpoint-#{epoch}.pth")
        end

        # Evaluate
        @model.eval()
        total_loss = 0.0
        Torch.no_grad do
          @eval_data.each.with_index do |batch, i|
            inputs, targets, decoder_inputs, src_mask, tgt_mask, memory_mask = batch
            inputs = Torch.tensor(inputs).t
            decoder_inputs = Torch.tensor(decoder_inputs).t
            targets = Torch.tensor(targets).t
            src_key_padding_mask = inputs.t.eq(1)
            tgt_key_padding_mask = decoder_inputs.t.eq(1)

            opts = {
              # src_mask: src_mask,
              tgt_mask: tgt_mask,
              # memory_mask: memory_mask,
              src_key_padding_mask: src_key_padding_mask,
              tgt_key_padding_mask: tgt_key_padding_mask,
              memory_key_padding_mask: src_key_padding_mask,
            }
            output = @model.call(inputs, decoder_inputs, **opts)
            output_flat = output.transpose(0,1).reshape(-1, ntokens)

            total_loss += criterion.call(output_flat, targets.t.view(-1)).item
          end
          total_loss = total_loss / @eval_data.length
          puts('-' * 89)
          puts "| end of epoch #{epoch} | time: #{(Time.now - epoch_start_time).round(3)}s | "\
            " valid loss #{total_loss.round(5)} | valid ppl #{Math.exp(total_loss).round(5)} "
          puts('-' * 89)
          if total_loss < best_val_loss
            best_model = @model
            best_val_loss = total_loss
          end
        end
        scheduler.step

        epoch += 1
        break if @max_epochs && @max_epochs < epoch
      end
    end

    private

    def generate_vocabs_and_data
      input_texts = []
      target_texts = []
      input_vocab_counter = Hash.new(0)
      target_vocab_counter = Hash.new(0)

      @data_input.each do |input_text|
        input_text.strip!
        input_texts.push(input_text)
        input_text.each_char do |char|
          input_vocab_counter[char] += 1
        end
      end

      @data_target.each do |target_text|
        target_text.strip!
        target_texts.push(target_text)
        target_text.each_char do |char|
          target_vocab_counter[char] += 1
        end
      end

      @input_vocab = Vocab.new(input_vocab_counter)
      @target_vocab = Vocab.new(target_vocab_counter)

      # Generate train, eval, and test batches
      seed = 1
      zipped_texts = input_texts.zip(target_texts)
      zipped_texts = zipped_texts.shuffle(random: Random.new(seed))

      # train - 90%, eval - 7%, test - 3%
      train_texts = zipped_texts[0..(zipped_texts.length*0.9).to_i]
      eval_texts = zipped_texts[(zipped_texts.length*0.9).to_i + 1..(zipped_texts.length*0.97).to_i]
      test_texts = zipped_texts[(zipped_texts.length*0.97).to_i+1..-1]

      # prepare batches
      @train_data = batchify(train_texts)
      @eval_data = batchify(eval_texts)
      @test_data = batchify(test_texts)

    end

    def pad(arr, length, no_eos: false, no_sos: false)
      if !no_eos
        arr = arr + ["<eos>"]
      end
      if !no_sos
        arr = ["<sos>"] + arr
      end
      arr.fill("<pad>", arr.length...length)
    end
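
    # Worked example (illustrative, not from the original source):
    #   pad(["a", "b"], 6)
    #   # => ["<sos>", "a", "b", "<eos>", "<pad>", "<pad>"]
    #   pad(["a", "b"], 5, no_sos: true)
    #   # => ["a", "b", "<eos>", "<pad>", "<pad>"]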

    def index_of(token)
      @target_vocab.stoi[token]
    end

    def batchify(data)
      batches = []

      (1 + data.length / @batch_size).times do |i|
        input_data = data[i*@batch_size, @batch_size].transpose[0]
        decoder_input_data = data[i*@batch_size, @batch_size].transpose[1]
        target_data = data[i*@batch_size, @batch_size].transpose[1]
        max_input_seq_length = input_data.max_by(&:length).length + 2
        max_target_seq_length = target_data.max_by(&:length).length + 1
        src_mask = Torch.triu(Torch.ones(max_input_seq_length, max_input_seq_length)).eq(0).transpose(0, 1)
        tgt_mask = Torch.triu(Torch.ones(max_target_seq_length, max_target_seq_length)).eq(0).transpose(0, 1)
        memory_mask = Torch.triu(Torch.ones(max_input_seq_length, max_target_seq_length)).eq(0).transpose(0, 1)
        batches << [
          input_data.map {|line| pad(line.chars, max_input_seq_length).map {|c| @input_vocab[c]} },
          target_data.map {|line| pad(line.chars, max_target_seq_length, no_sos: true).map {|c| @target_vocab[c]} },
          decoder_input_data.map {|line| pad(line.chars, max_target_seq_length, no_eos: true).map {|c| @target_vocab[c]} },
          src_mask,
          tgt_mask,
          memory_mask
        ]
      end

      batches
    end
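
    # For reference (illustrative, not from the original source), the boolean
    # causal mask built above for max_target_seq_length == 3 is:
    #   Torch.triu(Torch.ones(3, 3)).eq(0).transpose(0, 1)
    #   # => [[false, true,  true ],
    #   #     [false, false, true ],
    #   #     [false, false, false]]
    # i.e. true marks positions that may not be attended, so position i may only
    # attend to positions 0..i.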

    def save_vocabs
      File.write("#{@checkpoint_dir}/input_vocab.json", JSON.generate(@input_vocab.freqs))
      File.write("#{@checkpoint_dir}/target_vocab.json", JSON.generate(@target_vocab.freqs))
    end
  end
end

data/lib/secryst/transformer.rb
@@ -0,0 +1,382 @@
# ported from https://github.com/pytorch/pytorch/blob/626e410e1dedcdb9d5a410a8827cc7a8a9fbcce1/torch/nn/modules/transformer.py

module Secryst
  class Transformer < Torch::NN::Module
    # A transformer model. Users are able to modify the attributes as needed. The architecture
    # is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer,
    # Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
    # Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information
    # Processing Systems, pages 6000-6010. Users can build the BERT (https://arxiv.org/abs/1810.04805)
    # model with corresponding parameters.
    # Args:
    #   d_model: the number of expected features in the encoder/decoder inputs (default=512).
    #   nhead: the number of heads in the multiheadattention models (default=8).
    #   num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6).
    #   num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6).
    #   dim_feedforward: the dimension of the feedforward network model (default=2048).
    #   dropout: the dropout value (default=0.1).
    #   activation: the activation function of encoder/decoder intermediate layer, relu or gelu (default=relu).
    #   custom_encoder: custom encoder (default=nil).
    #   custom_decoder: custom decoder (default=nil).
    #   input_vocab_size: size of vocabulary for input sequence (number of different possible tokens).
    #   target_vocab_size: size of vocabulary for target sequence (number of different possible tokens).
    # Examples::
    #   >>> transformer_model = Transformer.new(nhead: 16, num_encoder_layers: 12,
    #   >>>   input_vocab_size: 72, target_vocab_size: 72)
    #   >>> src = Torch.randint(0, 72, [10, 32])  # (S, N) token indices
    #   >>> tgt = Torch.randint(0, 72, [20, 32])  # (T, N) token indices
    #   >>> out = transformer_model.call(src, tgt)
    def initialize(d_model: 512, nhead: 8, num_encoder_layers: 6, num_decoder_layers: 6,
                   dim_feedforward: 2048, dropout: 0.1, activation: 'relu', custom_encoder: nil, custom_decoder: nil, input_vocab_size:, target_vocab_size:)

      super()

      if custom_encoder
        @encoder = custom_encoder
      else
        encoder_layers = num_encoder_layers.times.map { TransformerEncoderLayer.new(d_model, nhead, dim_feedforward: dim_feedforward, dropout: dropout, activation: activation) }
        encoder_norm = Torch::NN::LayerNorm.new(d_model)
        @encoder = TransformerEncoder.new(encoder_layers, encoder_norm, d_model, input_vocab_size, dropout)
      end

      if custom_decoder
        @decoder = custom_decoder
      else
        decoder_layers = num_decoder_layers.times.map { TransformerDecoderLayer.new(d_model, nhead, dim_feedforward: dim_feedforward, dropout: dropout, activation: activation) }
        decoder_norm = Torch::NN::LayerNorm.new(d_model)
        @decoder = TransformerDecoder.new(decoder_layers, decoder_norm, d_model, target_vocab_size, dropout)
      end

      @linear = Torch::NN::Linear.new(d_model, target_vocab_size)
      @softmax = Torch::NN::LogSoftmax.new(dim: -1)
      _reset_parameters()

      @d_model = d_model
      @nhead = nhead

    end

    # Take in and process masked source/target sequences.
    # Args:
    #   src: the sequence to the encoder (required).
    #   tgt: the sequence to the decoder (required).
    #   src_mask: the additive mask for the src sequence (optional).
    #   tgt_mask: the additive mask for the tgt sequence (optional).
    #   memory_mask: the additive mask for the encoder output (optional).
    #   src_key_padding_mask: the ByteTensor mask for src keys per batch (optional).
    #   tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional).
    #   memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional).
    # Shape:
    #   - src: :math:`(S, N, E)`.
    #   - tgt: :math:`(T, N, E)`.
    #   - src_mask: :math:`(S, S)`.
    #   - tgt_mask: :math:`(T, T)`.
    #   - memory_mask: :math:`(T, S)`.
    #   - src_key_padding_mask: :math:`(N, S)`.
    #   - tgt_key_padding_mask: :math:`(N, T)`.
    #   - memory_key_padding_mask: :math:`(N, S)`.
    #   (In this port, src and tgt are passed as token indices of shape :math:`(S, N)` and
    #   :math:`(T, N)`; they are embedded to :math:`E` dimensions inside the encoder/decoder.)
    #   Note: [src/tgt/memory]_mask ensures that position i is allowed to attend the unmasked
    #   positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
    #   while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``true``
    #   are not allowed to attend while ``false`` values will be unchanged. If a FloatTensor
    #   is provided, it will be added to the attention weight.
    #   [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by
    #   the attention. If a ByteTensor is provided, the non-zero positions will be ignored while the zero
    #   positions will be unchanged. If a BoolTensor is provided, the positions with the
    #   value of ``true`` will be ignored while the positions with the value of ``false`` will be unchanged.
    #   - output: :math:`(T, N, E)`.
    #   Note: Due to the multi-head attention architecture in the transformer model,
    #   the output sequence length of a transformer is the same as the input sequence
    #   (i.e. target) length of the decoder.
    #   where S is the source sequence length, T is the target sequence length, N is the
    #   batch size, E is the feature number.
    # Examples:
    #   >>> output = transformer_model.call(src, tgt, src_mask: src_mask, tgt_mask: tgt_mask)
    def forward(src, tgt, src_mask: nil, tgt_mask: nil,
                memory_mask: nil, src_key_padding_mask: nil,
                tgt_key_padding_mask: nil, memory_key_padding_mask: nil)
      if src.size(1) != tgt.size(1)
        raise RuntimeError, "the batch number of src and tgt must be equal"
      end

      memory = @encoder.call(src, mask: src_mask, src_key_padding_mask: src_key_padding_mask)
      output = @decoder.call(tgt, memory, tgt_mask: tgt_mask, memory_mask: memory_mask,
                             tgt_key_padding_mask: tgt_key_padding_mask,
                             memory_key_padding_mask: memory_key_padding_mask)
      output = @linear.call(output)
      output = @softmax.call(output)

      return output
    end

    def _reset_parameters
      parameters.each do |p|
        Torch::NN::Init.xavier_uniform!(p) if p.dim > 1
      end
    end
  end


  class TransformerEncoder < Torch::NN::Module
    # TransformerEncoder is a stack of N encoder layers.
    # Args:
    #   encoder_layers: an array of instances of the TransformerEncoderLayer class (required).
    #   norm: the layer normalization component (optional).
    #   d_model: the number of expected features in the encoder/decoder inputs.
    #   vocab_size: size of vocabulary (number of different possible tokens).
    # Examples::
    #   >>> encoder_layers = 6.times.map {|i| TransformerEncoderLayer.new(512, 8) }
    #   >>> transformer_encoder = TransformerEncoder.new(encoder_layers, nil, 512, 72, 0.1)
    #   >>> src = Torch.randint(0, 72, [10, 32])  # (S, N) token indices
    #   >>> out = transformer_encoder.call(src)
    def initialize(encoder_layers, norm=nil, d_model, vocab_size, dropout)
      super()
      @d_model = d_model
      encoder_layers.each.with_index do |l, i|
        instance_variable_set("@layer#{i}", l)
      end
      @layers = encoder_layers.length.times.map {|i| instance_variable_get("@layer#{i}") }
      @num_layers = encoder_layers.length
      @embedding = Torch::NN::Embedding.new(vocab_size, d_model)
      @pos_encoder = PositionalEncoding.new(d_model, dropout: dropout)
      @norm = norm
    end

    # Pass the input through the encoder layers in turn.
    # Args:
    #   src: the sequence to the encoder (required).
    #   mask: the mask for the src sequence (optional).
    #   src_key_padding_mask: the mask for the src keys per batch (optional).
    # Shape:
    #   see the docs in Transformer class.
    def forward(src, mask: nil, src_key_padding_mask: nil)
      output = @embedding.call(src) * Math.sqrt(@d_model)
      output = @pos_encoder.call(output)

      @layers.each { |mod|
        output = mod.call(output, src_mask: mask, src_key_padding_mask: src_key_padding_mask)
      }

      if @norm
        output = @norm.call(output)
      end

      return output
    end
  end


  class TransformerEncoderLayer < Torch::NN::Module
    # TransformerEncoderLayer is made up of self-attn and feedforward network.
    # This standard encoder layer is based on the paper "Attention Is All You Need".
    # Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    # Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    # Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    # in a different way during application.
    # Args:
    #   d_model: the number of expected features in the input (required).
    #   nhead: the number of heads in the multiheadattention models (required).
    #   dim_feedforward: the dimension of the feedforward network model (default=2048).
    #   dropout: the dropout value (default=0.1).
    #   activation: the activation function of intermediate layer, relu or gelu (default=relu).
    # Examples::
    #   >>> encoder_layer = TransformerEncoderLayer.new(512, 8)
    #   >>> src = Torch.rand(10, 32, 512)
    #   >>> out = encoder_layer.call(src)
    def initialize(d_model, nhead, dim_feedforward: 2048, dropout: 0.1, activation: "relu")
      super()
      @self_attn = MultiheadAttention.new(d_model, nhead, dropout: dropout)
      # Implementation of Feedforward model
      @linear1 = Torch::NN::Linear.new(d_model, dim_feedforward)
      @dropout = Torch::NN::Dropout.new(p: dropout)
      @linear2 = Torch::NN::Linear.new(dim_feedforward, d_model)

      @norm1 = Torch::NN::LayerNorm.new(d_model)
      @norm2 = Torch::NN::LayerNorm.new(d_model)
      @dropout1 = Torch::NN::Dropout.new(p: dropout)
      @dropout2 = Torch::NN::Dropout.new(p: dropout)

      @activation = _get_activation_fn(activation)
    end

    # Pass the input through the encoder layer.
    # Args:
    #   src: the sequence to the encoder layer (required).
    #   src_mask: the mask for the src sequence (optional).
    #   src_key_padding_mask: the mask for the src keys per batch (optional).
    # Shape:
    #   see the docs in Transformer class.
    def forward(src, src_mask: nil, src_key_padding_mask: nil)
      src2 = @self_attn.call(src, src, src, attn_mask: src_mask,
                             key_padding_mask: src_key_padding_mask)[0]
      src = src + @dropout1.call(src2)
      src = @norm1.call(src)
      src2 = @linear2.call(@dropout.call(@activation.call(@linear1.call(src))))
      src = src + @dropout2.call(src2)
      src = @norm2.call(src)
      return src
    end
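
    # In equation form (added for reference, not in the original source), each
    # sublayer above follows the post-norm residual pattern
    #   src = LayerNorm(src + Dropout(Sublayer(src)))
    # where Sublayer is self-attention for @norm1 and the feed-forward block for @norm2.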
  end


  class TransformerDecoder < Torch::NN::Module
    # TransformerDecoder is a stack of N decoder layers.
    # Args:
    #   decoder_layers: an array of instances of the TransformerDecoderLayer class (required).
    #   norm: the layer normalization component (optional).
    #   d_model: the number of expected features in the encoder/decoder inputs.
    #   vocab_size: size of vocabulary (number of different possible tokens).
    # Examples::
    #   >>> decoder_layers = 6.times.map {|i| TransformerDecoderLayer.new(512, 8) }
    #   >>> transformer_decoder = TransformerDecoder.new(decoder_layers, nil, 512, 72, 0.1)
    #   >>> memory = Torch.rand(10, 32, 512)
    #   >>> tgt = Torch.randint(0, 72, [20, 32])  # (T, N) token indices
    #   >>> out = transformer_decoder.call(tgt, memory)
    def initialize(decoder_layers, norm=nil, d_model, vocab_size, dropout)
      super()
      @d_model = d_model
      decoder_layers.each.with_index do |l, i|
        instance_variable_set("@layer#{i}", l)
      end
      @layers = decoder_layers.length.times.map {|i| instance_variable_get("@layer#{i}") }
      @num_layers = decoder_layers.length
      @embedding = Torch::NN::Embedding.new(vocab_size, d_model)
      @pos_encoder = PositionalEncoding.new(d_model, dropout: dropout)
      @norm = norm
    end

    # Pass the inputs (and mask) through the decoder layers in turn.
    # Args:
    #   tgt: the sequence to the decoder (required).
    #   memory: the sequence from the last layer of the encoder (required).
    #   tgt_mask: the mask for the tgt sequence (optional).
    #   memory_mask: the mask for the memory sequence (optional).
    #   tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
    #   memory_key_padding_mask: the mask for the memory keys per batch (optional).
    # Shape:
    #   see the docs in Transformer class.
    def forward(tgt, memory, tgt_mask: nil,
                memory_mask: nil, tgt_key_padding_mask: nil,
                memory_key_padding_mask: nil)

      output = @embedding.call(tgt) * Math.sqrt(@d_model)
      output = @pos_encoder.call(output)

      @layers.each { |mod|
        output = mod.call(output, memory, tgt_mask: tgt_mask,
                          memory_mask: memory_mask,
                          tgt_key_padding_mask: tgt_key_padding_mask,
                          memory_key_padding_mask: memory_key_padding_mask)
      }

      if @norm
        output = @norm.call(output)
      end

      return output
    end
  end

  class TransformerDecoderLayer < Torch::NN::Module
    # TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
    # This standard decoder layer is based on the paper "Attention Is All You Need".
    # Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    # Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    # Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    # in a different way during application.
    # Args:
    #   d_model: the number of expected features in the input (required).
    #   nhead: the number of heads in the multiheadattention models (required).
    #   dim_feedforward: the dimension of the feedforward network model (default=2048).
    #   dropout: the dropout value (default=0.1).
    #   activation: the activation function of intermediate layer, relu or gelu (default=relu).
    # Examples::
    #   >>> decoder_layer = TransformerDecoderLayer.new(512, 8)
    #   >>> memory = Torch.rand(10, 32, 512)
    #   >>> tgt = Torch.rand(20, 32, 512)
    #   >>> out = decoder_layer.call(tgt, memory)

    def initialize(d_model, nhead, dim_feedforward: 2048, dropout: 0.1, activation: "relu")
      super()
      @self_attn = MultiheadAttention.new(d_model, nhead, dropout: dropout)
      @multihead_attn = MultiheadAttention.new(d_model, nhead, dropout: dropout)
      # Implementation of Feedforward model
      @linear1 = Torch::NN::Linear.new(d_model, dim_feedforward)
      @dropout = Torch::NN::Dropout.new(p: dropout)
      @linear2 = Torch::NN::Linear.new(dim_feedforward, d_model)

      @norm1 = Torch::NN::LayerNorm.new(d_model)
      @norm2 = Torch::NN::LayerNorm.new(d_model)
      @norm3 = Torch::NN::LayerNorm.new(d_model)
      @dropout1 = Torch::NN::Dropout.new(p: dropout)
      @dropout2 = Torch::NN::Dropout.new(p: dropout)
      @dropout3 = Torch::NN::Dropout.new(p: dropout)

      @activation = _get_activation_fn(activation)
    end

    # Pass the inputs (and mask) through the decoder layer.
    # Args:
    #   tgt: the sequence to the decoder layer (required).
    #   memory: the sequence from the last layer of the encoder (required).
    #   tgt_mask: the mask for the tgt sequence (optional).
    #   memory_mask: the mask for the memory sequence (optional).
    #   tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
    #   memory_key_padding_mask: the mask for the memory keys per batch (optional).
    # Shape:
    #   see the docs in Transformer class.
    def forward(tgt, memory, tgt_mask: nil, memory_mask: nil,
                tgt_key_padding_mask: nil, memory_key_padding_mask: nil)

      tgt2 = @self_attn.call(tgt, tgt, tgt, attn_mask: tgt_mask,
                             key_padding_mask: tgt_key_padding_mask)[0]
      tgt = tgt + @dropout1.call(tgt2)
      tgt = @norm1.call(tgt)
      tgt2 = @multihead_attn.call(tgt, memory, memory, attn_mask: memory_mask,
                                  key_padding_mask: memory_key_padding_mask)[0]
      tgt = tgt + @dropout2.call(tgt2)
      tgt = @norm2.call(tgt)
      tgt2 = @linear2.call(@dropout.call(@activation.call(@linear1.call(tgt))))
      tgt = tgt + @dropout3.call(tgt2)
      tgt = @norm3.call(tgt)
      return tgt
    end
  end

  class PositionalEncoding < Torch::NN::Module
    # PositionalEncoding module injects some information about the relative or
    # absolute position of the tokens in the sequence. The positional encodings
    # have the same dimension as the embeddings so that the two can be summed.
    # Here, we use sine and cosine functions of different frequencies.
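    # For reference (standard formulation; this comment is not in the original source):
    #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))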
    def initialize(d_model, dropout: 0.1, max_len: 5000)
      super()
      @dropout = Torch::NN::Dropout.new(p: dropout)

      pe = Torch.zeros(max_len, d_model)
      position = Torch.arange(0, max_len, dtype: :float).unsqueeze(1)
      div_term = Torch.exp(Torch.arange(0, d_model, 2).float() * (-Math.log(10000.0) / d_model))
      sin = Torch.sin(position * div_term).t
      cos = Torch.cos(position * div_term).t
      pe.t!
      pe.each.with_index do |row, i|
        pe[i] = sin[i / 2] if i % 2 == 0
        pe[i] = cos[(i-1)/2] if i % 2 != 0
      end
      pe.t!
      pe = pe.unsqueeze(0).transpose(0, 1)
      register_buffer('pe', pe)
    end

    def forward(x)
      x = x + pe.narrow(0, 0, x.size(0))
      return x
    end
  end
end


def _get_activation_fn(activation)
  if activation == "relu"
    return Torch::NN::F.method(:relu)
  elsif activation == "gelu"
    return Torch::NN::F.method(:gelu)
  end

  raise RuntimeError, "activation should be relu/gelu, not %s" % activation
end