torch-rb 0.7.0 → 0.8.3

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,17 @@
+ #include <torch/torch.h>
+
+ #include <rice/rice.hpp>
+
+ #include "utils.h"
+
+ void init_backends(Rice::Module& m) {
+   auto rb_mBackends = Rice::define_module_under(m, "Backends");
+
+   Rice::define_module_under(rb_mBackends, "OpenMP")
+     .add_handler<torch::Error>(handle_error)
+     .define_singleton_function("available?", &torch::hasOpenMP);
+
+   Rice::define_module_under(rb_mBackends, "MKL")
+     .add_handler<torch::Error>(handle_error)
+     .define_singleton_function("available?", &torch::hasMKL);
+ }
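For reference, the new module can be exercised directly from Ruby; a minimal sketch, assuming the gem is loaded with require "torch":

    require "torch"

    # Each call returns true or false depending on how the bundled libtorch was built.
    Torch::Backends::OpenMP.available?
    Torch::Backends::MKL.available?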
data/ext/torch/ext.cpp CHANGED
@@ -2,10 +2,14 @@
 
  #include <rice/rice.hpp>
 
+ void init_fft(Rice::Module& m);
+ void init_linalg(Rice::Module& m);
  void init_nn(Rice::Module& m);
+ void init_special(Rice::Module& m);
  void init_tensor(Rice::Module& m, Rice::Class& c, Rice::Class& rb_cTensorOptions);
  void init_torch(Rice::Module& m);
 
+ void init_backends(Rice::Module& m);
  void init_cuda(Rice::Module& m);
  void init_device(Rice::Module& m);
  void init_ivalue(Rice::Module& m, Rice::Class& rb_cIValue);
@@ -27,7 +31,11 @@ void Init_ext()
    init_torch(m);
    init_tensor(m, rb_cTensor, rb_cTensorOptions);
    init_nn(m);
+   init_fft(m);
+   init_linalg(m);
+   init_special(m);
 
+   init_backends(m);
    init_cuda(m);
    init_device(m);
    init_ivalue(m, rb_cIValue);
data/ext/torch/fft.cpp ADDED
@@ -0,0 +1,13 @@
+ #include <torch/torch.h>
+
+ #include <rice/rice.hpp>
+
+ #include "fft_functions.h"
+ #include "templates.h"
+ #include "utils.h"
+
+ void init_fft(Rice::Module& m) {
+   auto rb_mFFT = Rice::define_module_under(m, "FFT");
+   rb_mFFT.add_handler<torch::Error>(handle_error);
+   add_fft_functions(rb_mFFT);
+ }
@@ -0,0 +1,6 @@
+ // generated by rake generate:functions
+ // do not edit by hand
+
+ #pragma once
+
+ void add_fft_functions(Rice::Module& m);
@@ -0,0 +1,13 @@
+ #include <torch/torch.h>
+
+ #include <rice/rice.hpp>
+
+ #include "linalg_functions.h"
+ #include "templates.h"
+ #include "utils.h"
+
+ void init_linalg(Rice::Module& m) {
+   auto rb_mLinalg = Rice::define_module_under(m, "Linalg");
+   rb_mLinalg.add_handler<torch::Error>(handle_error);
+   add_linalg_functions(rb_mLinalg);
+ }
@@ -0,0 +1,6 @@
+ // generated by rake generate:functions
+ // do not edit by hand
+
+ #pragma once
+
+ void add_linalg_functions(Rice::Module& m);
@@ -78,6 +78,7 @@ struct RubyArgs {
    inline OptionalTensor optionalTensor(int i);
    inline at::Scalar scalar(int i);
    // inline at::Scalar scalarWithDefault(int i, at::Scalar default_scalar);
+   inline std::vector<at::Scalar> scalarlist(int i);
    inline std::vector<at::Tensor> tensorlist(int i);
    template<int N>
    inline std::array<at::Tensor, N> tensorlist_n(int i);
@@ -134,6 +135,11 @@ inline at::Scalar RubyArgs::scalar(int i) {
    return Rice::detail::From_Ruby<torch::Scalar>().convert(args[i]);
  }
 
+ inline std::vector<at::Scalar> RubyArgs::scalarlist(int i) {
+   if (NIL_P(args[i])) return std::vector<at::Scalar>();
+   return Rice::detail::From_Ruby<std::vector<at::Scalar>>().convert(args[i]);
+ }
+
  inline std::vector<at::Tensor> RubyArgs::tensorlist(int i) {
    if (NIL_P(args[i])) return std::vector<at::Tensor>();
    return Rice::detail::From_Ruby<std::vector<Tensor>>().convert(args[i]);
@@ -312,7 +318,7 @@ inline std::string RubyArgs::string(int i) {
  }
 
  inline c10::optional<std::string> RubyArgs::stringOptional(int i) {
-   if (!args[i]) return c10::nullopt;
+   if (NIL_P(args[i])) return c10::nullopt;
    return Rice::detail::From_Ruby<std::string>().convert(args[i]);
  }
 
@@ -0,0 +1,13 @@
+ #include <torch/torch.h>
+
+ #include <rice/rice.hpp>
+
+ #include "special_functions.h"
+ #include "templates.h"
+ #include "utils.h"
+
+ void init_special(Rice::Module& m) {
+   auto rb_mSpecial = Rice::define_module_under(m, "Special");
+   rb_mSpecial.add_handler<torch::Error>(handle_error);
+   add_special_functions(rb_mSpecial);
+ }
@@ -0,0 +1,6 @@
+ // generated by rake generate:functions
+ // do not edit by hand
+
+ #pragma once
+
+ void add_special_functions(Rice::Module& m);
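The three init functions above register generated bindings under new Ruby namespaces. A hedged usage sketch; the individual method names (fft, norm, erf) are assumptions based on the corresponding libtorch namespaces and are not shown in this diff:

    x = Torch.randn(8)

    Torch::FFT.fft(x)      # assumed generated FFT binding
    Torch::Linalg.norm(x)  # assumed generated linalg binding
    Torch::Special.erf(x)  # assumed generated special-functions binding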
@@ -21,6 +21,7 @@ using torch::IntArrayRef;
  using torch::ArrayRef;
  using torch::TensorList;
  using torch::Storage;
+ using ScalarList = ArrayRef<Scalar>;
 
  using torch::nn::init::FanModeType;
  using torch::nn::init::NonlinearityType;
@@ -1,6 +1,8 @@
  module Torch
    module NN
      class ConvNd < Module
+       attr_reader :in_channels, :out_channels, :kernel_size, :stride, :padding, :dilation, :transposed, :output_padding, :groups, :padding_mode
+
        def initialize(in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode)
          super()
          raise ArgumentError, "in_channels must be divisible by groups" if in_channels % groups != 0
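A quick sketch of what the new readers expose, using Conv2d (a ConvNd subclass) for illustration; the stored values shown assume Conv2d's usual normalization of scalar arguments to pairs:

    conv = Torch::NN::Conv2d.new(3, 16, 3, stride: 2, padding: 1)
    conv.in_channels  # => 3
    conv.out_channels # => 16
    conv.kernel_size  # => [3, 3]
    conv.stride       # => [2, 2]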
@@ -0,0 +1,241 @@
+ module Torch
+   module NN
+     class Functional
+       class << self
+         def in_projection_packed(q, k, v, w, b: nil)
+           e = q.size(-1)
+
+           if k.eql? v
+             if q.eql? k
+               # self-attention
+               return linear(q, w, b).chunk(3, dim: -1)
+             else
+               # encoder-decoder attention
+               w_q, w_kv = w.split_with_sizes([e, e * 2])
+               if b.nil?
+                 b_q = b_kv = nil
+               else
+                 b_q, b_kv = b.split_with_sizes([e, e * 2])
+               end
+
+               return [linear(q, w_q, b_q), *linear(k, w_kv, b_kv).chunk(2, dim: -1)]
+             end
+           else
+             w_q, w_k, w_v = w.chunk(3)
+             if b.nil?
+               b_q = b_k = b_v = nil
+             else
+               b_q, b_k, b_v = b.chunk(3)
+             end
+
+             return [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+           end
+         end
+
+         def in_projection(
+           q, k, v,
+           w_q, w_k, w_v,
+           b_q: nil, b_k: nil, b_v: nil
+         )
+
+           e_q, e_k, e_v = q.size(-1), k.size(-1), v.size(-1)
+
+           raise ArgumentError, "Expecting query weights shape of #{[e_q, e_q]}, but got #{w_q.shape}" unless w_q.shape == [e_q, e_q]
+           raise ArgumentError, "Expecting key weights shape of #{[e_k, e_k]}, but got #{w_k.shape}" unless w_k.shape == [e_k, e_k]
+           raise ArgumentError, "Expecting value weights shape of #{[e_v, e_v]}, but got #{w_v.shape}" unless w_v.shape == [e_v, e_v]
+
+           raise ArgumentError, "Expecting query bias shape of #{[e_q]}, but got #{b_q.shape}" if b_q && b_q.shape != [e_q]
+           raise ArgumentError, "Expecting key bias shape of #{[e_k]}, but got #{b_k.shape}" if b_k && b_k.shape != [e_k]
+           raise ArgumentError, "Expecting value bias shape of #{[e_v]}, but got #{b_v.shape}" if b_v && b_v.shape != [e_v]
+
+           [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+         end
+
+         def scaled_dot_product_attention(
+           q, k, v,
+           attn_mask: nil, dropout_p: 0.0
+         )
+
+           b, nt, e = q.shape
+
+           q = q / Math.sqrt(e)
+
+           attn = Torch.bmm(q, k.transpose(-2, -1))
+           attn += attn_mask if attn_mask
+           attn = softmax(attn, dim: -1)
+           attn = dropout(attn, p: dropout_p) if dropout_p > 0
+
+           output = Torch.bmm(attn, v)
+
+           [output, attn]
+         end
+
+         def multi_head_attention_forward(
+           query, key, value,
+           embed_dim_to_check, num_heads,
+           in_proj_weight, in_proj_bias,
+           bias_k, bias_v,
+           add_zero_attn,
+           dropout_p,
+           out_proj_weight, out_proj_bias,
+           training: true,
+           key_padding_mask: nil,
+           need_weights: true,
+           attn_mask: nil,
+           use_separate_proj_weight: false,
+           q_proj_weight: nil, k_proj_weight: nil, v_proj_weight: nil,
+           static_k: nil, static_v: nil
+         )
+
+           tgt_len, bsz, embed_dim = query.shape
+           src_len = key.shape.first
+
+           raise ArgumentError, "Was expecting embedding dimension of #{embed_dim_to_check}, but got #{embed_dim}" unless embed_dim == embed_dim_to_check
+
+           head_dim = if embed_dim.is_a?(Torch::Tensor)
+             embed_dim.div(num_heads, rounding_mode: 'trunc')
+           else
+             embed_dim.div(num_heads)
+           end
+
+           if use_separate_proj_weight
+             raise ArgumentError, "Key's sequence and batch dims #{key.shape[0...2]} do not match value's #{value.shape[0...2]}" unless key.shape[0...2] == value.shape[0...2]
+           else
+             raise ArgumentError, "Key shape #{key.shape} does not match value shape #{value.shape}" unless key.shape == value.shape
+           end
+
+           # compute in-projection
+           q, k, v =
+             if use_separate_proj_weight
+               raise ArgumentError, "use_separate_proj_weight is true but q_proj_weight is nil" unless q_proj_weight
+               raise ArgumentError, "use_separate_proj_weight is true but k_proj_weight is nil" unless k_proj_weight
+               raise ArgumentError, "use_separate_proj_weight is true but v_proj_weight is nil" unless v_proj_weight
+
+               if in_proj_bias
+                 b_q, b_k, b_v = in_proj_bias.chunk(3)
+               else
+                 b_q = b_k = b_v = nil
+               end
+
+               in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q: b_q, b_k: b_k, b_v: b_v)
+             else
+               in_projection_packed(query, key, value, in_proj_weight, b: in_proj_bias)
+             end
+
+           # prep attention mask
+           if attn_mask
+             if attn_mask.dtype == :uint8
+               puts "[WARN] Byte tensor for attn_mask in Multihead Attention is deprecated. Use bool tensor instead."
+               attn_mask = attn_mask.bool
+             else
+               raise ArgumentError, "Only float, byte, and bool types are supported for attn_mask, not #{attn_mask.dtype}" unless attn_mask.floating_point? || attn_mask.dtype == :bool
+             end
+
+             if attn_mask.dim == 2
+               correct_2d_size = [tgt_len, src_len]
+               raise ArgumentError, "The shape of the 2D attn_mask is #{attn_mask.shape}, but should be #{correct_2d_size}." unless attn_mask.shape == correct_2d_size
+
+               attn_mask = attn_mask.unsqueeze(0)
+             elsif attn_mask.dim == 3
+               correct_3d_size = [bsz * num_heads, tgt_len, src_len]
+               raise ArgumentError, "The shape of the 3D attn_mask is #{attn_mask.shape}, but should be #{correct_3d_size}." unless attn_mask.shape == correct_3d_size
+             else
+               raise ArgumentError, "attn_mask's dimension #{attn_mask.dim} is not supported"
+             end
+           end
+
+           # prep key padding mask
+           if key_padding_mask && key_padding_mask.dtype == :uint8
+             puts "[WARN] Byte tensor for key_padding_mask in Multihead Attention is deprecated. Use bool tensor instead."
+             key_padding_mask = key_padding_mask.bool
+           end
+
+           # add bias along batch dimension (currently second)
+           if bias_k && bias_v
+             raise ArgumentError, "bias cannot be added to static key." if static_k
+             raise ArgumentError, "bias cannot be added to static value." if static_v
+
+             k = Torch.cat([k, bias_k.repeat(1, bsz, 1)])
+             v = Torch.cat([v, bias_v.repeat(1, bsz, 1)])
+
+             attn_mask = pad(attn_mask, [0, 1]) if attn_mask
+             key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask
+           else
+             raise ArgumentError unless bias_k.nil?
+             raise ArgumentError unless bias_v.nil?
+           end
+
+           # reshape q, k, v for multihead attention and make em batch first
+           q = q.contiguous.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+
+           if static_k.nil?
+             k = k.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+           else
+             raise ArgumentError, "Expecting static_k.size(0) of #{bsz * num_heads}, but got #{static_k.size(0)}" unless static_k.size(0) == bsz * num_heads
+             raise ArgumentError, "Expecting static_k.size(2) of #{head_dim}, but got #{static_k.size(2)}" unless static_k.size(2) == head_dim
+
+             k = static_k
+           end
+
+           if static_v.nil?
+             v = v.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+           else
+             raise ArgumentError, "Expecting static_v.size(0) of #{bsz * num_heads}, but got #{static_v.size(0)}" unless static_v.size(0) == bsz * num_heads
+             raise ArgumentError, "Expecting static_v.size(2) of #{head_dim}, but got #{static_v.size(2)}" unless static_v.size(2) == head_dim
+
+             v = static_v
+           end
+
+           # add zero attention along batch dimension (now first)
+           if add_zero_attn
+             zero_attn_shape = [bsz * num_heads, 1, head_dim]
+             k = Torch.cat([k, Torch.zeros(zero_attn_shape, dtype: k.dtype, device: k.device)], dim: 1)
+             v = Torch.cat([v, Torch.zeros(zero_attn_shape, dtype: v.dtype, device: v.device)], dim: 1)
+
+             attn_mask = pad(attn_mask, [0, 1]) if attn_mask
+             key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask
+           end
+
+           # update source sequence length after adjustments
+           src_len = k.size(1)
+
+           # merge key padding and attention masks
+           if key_padding_mask
+             raise ArgumentError, "Expecting key_padding_mask shape of #{[bsz, src_len]}, but got #{key_padding_mask.shape}" unless key_padding_mask.shape == [bsz, src_len]
+
+             key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
+
+             attn_mask = if attn_mask.nil?
+               key_padding_mask
+             elsif attn_mask.dtype == :bool
+               attn_mask.logical_or(key_padding_mask)
+             else
+               attn_mask.masked_fill(key_padding_mask, -Float::INFINITY)
+             end
+           end
+
+           # convert mask to float
+           if attn_mask && attn_mask.dtype == :bool
+             new_attn_mask = Torch.zeros_like(attn_mask, dtype: :float32)
+             attn_mask = new_attn_mask.masked_fill(attn_mask, -Float::INFINITY)
+           end
+
+           dropout_p = 0.0 unless training
+
+           # (deep breath) calculate attention and out projection
+           attn_output, attn_output_weights = scaled_dot_product_attention(q, k, v, attn_mask: attn_mask, dropout_p: dropout_p)
+           attn_output = attn_output.transpose(0, 1).contiguous.view(tgt_len, bsz, embed_dim)
+           attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+
+           if need_weights
+             # average attention weights over heads
+             attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+             [attn_output, attn_output_weights.sum(dim: 1) / num_heads]
+           else
+             [attn_output, nil]
+           end
+         end
+       end
+     end
+   end
+ end
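As a standalone example of the scaled_dot_product_attention helper above, using randomly initialized tensors (shapes follow the (batch, target_len, embed) and (batch, source_len, embed) layout it expects):

    q = Torch.randn(2, 5, 8)
    k = Torch.randn(2, 7, 8)
    v = Torch.randn(2, 7, 8)

    out, attn = Torch::NN::Functional.scaled_dot_product_attention(q, k, v)
    out.shape   # => [2, 5, 8]
    attn.shape  # => [2, 5, 7]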
@@ -3,6 +3,8 @@ module Torch
      class Module
        include Utils
 
+       attr_reader :training
+
        def initialize
          @training = true
          @parameters = {}
@@ -278,6 +280,11 @@ module Torch
          end
        end
 
+       def deep_dup
+         memo = {}
+         dup_value(self, memo)
+       end
+
        def method_missing(method, *args, &block)
          name = method.to_s
          if named_parameters.key?(name)
@@ -386,6 +393,29 @@ module Torch
            destination[prefix + k] = v
          end
        end
+
+       # keep memo hash like Python deepcopy
+       # https://docs.python.org/3/library/copy.html
+       def dup_value(v, memo)
+         memo[v.object_id] ||= begin
+           case v
+           when Method, UnboundMethod
+             v
+           when Hash
+             v.to_h { |k, v2| [dup_value(k, memo), dup_value(v2, memo)] }
+           when Array
+             v.map { |v2| dup_value(v2, memo) }
+           when Torch::NN::Module
+             copy = v.dup
+             v.instance_variables.each do |var|
+               copy.instance_variable_set(var, dup_value(v.instance_variable_get(var), memo))
+             end
+             copy
+           else
+             v.dup
+           end
+         end
+       end
      end
    end
  end
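deep_dup recursively copies a module and everything it holds, memoizing on object_id so objects shared inside the original stay shared inside the copy. A short sketch:

    model = Torch::NN::Linear.new(4, 2)
    copy = model.deep_dup

    # The copy owns its own parameter tensors, so in-place edits don't leak back to model.
    Torch.no_grad { copy.weight.zero! }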
@@ -0,0 +1,49 @@
+ module Torch
+   module NN
+     class ModuleList < Module
+       include Enumerable
+
+       def initialize(mods = nil)
+         super()
+
+         self.concat(mods) if mods
+       end
+
+       def length
+         @modules.length
+       end
+       alias_method :count, :length
+       alias_method :size, :length
+
+       def concat(mods)
+         raise ArgumentError, "Modules should respond to #each" unless mods.respond_to?(:each)
+
+         mods.each { |m| append m }
+
+         self
+       end
+
+       def each(&block)
+         if block_given?
+           @modules.values.each(&block)
+         else
+           to_enum(:each)
+         end
+       end
+
+       def append(mod)
+         raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module)
+         add_module(length.to_s, mod)
+         self
+       end
+
+       def [](idx)
+         if idx.is_a?(Range)
+           self.class.new(@modules.values[idx])
+         else
+           @modules[idx.to_s]
+         end
+       end
+     end
+   end
+ end
1
+ module Torch
2
+ module NN
3
+ class MultiheadAttention < Module
4
+ def initialize(
5
+ embed_dim, num_heads,
6
+ dropout: 0.0, bias: true, add_bias_kv: false, add_zero_attn: false,
7
+ kdim: nil, vdim: nil, batch_first: false, device: nil, dtype: nil
8
+ )
9
+
10
+ super()
11
+
12
+ @embed_dim = embed_dim
13
+ @kdim = kdim || @embed_dim
14
+ @vdim = vdim || @embed_dim
15
+
16
+ @qkv_same_embed_dim = @kdim == @embed_dim && @vdim == @embed_dim
17
+
18
+ @num_heads = num_heads
19
+ @dropout = dropout
20
+ @batch_first = batch_first
21
+
22
+ @head_dim = @embed_dim.div @num_heads
23
+
24
+ raise ArgumentError, "embed_dim must be divisible by num_heads" unless @head_dim * @num_heads == @embed_dim
25
+
26
+ if @qkv_same_embed_dim
27
+ @in_proj_weight = Parameter.new(Torch.empty([3 * @embed_dim, @embed_dim]))
28
+ %w(q k v).each { |x| register_parameter("#{x}_proj_weight", nil) }
29
+ else
30
+ @q_proj_weight = Parameter.new(Torch.empty([@embed_dim, @embed_dim]))
31
+ @k_proj_weight = Parameter.new(Torch.empty([@embed_dim, @kdim]))
32
+ @v_proj_weight = Parameter.new(Torch.empty([@embed_dim, @vdim]))
33
+
34
+ register_parameter('in_proj_weight', nil)
35
+ end
36
+
37
+ if bias
38
+ @in_proj_bias = Parameter.new(Torch.empty(3 * @embed_dim))
39
+ else
40
+ register_parameter('in_proj_bias', nil)
41
+ end
42
+
43
+ @out_proj = Linear.new(@embed_dim, @embed_dim, bias: bias)
44
+
45
+ if add_bias_kv
46
+ @bias_k = Parameter.new(Torch.empty([1, 1, @embed_dim]))
47
+ @bias_v = Parameter.new(Torch.empty([1, 1, @embed_dim]))
48
+ else
49
+ @bias_k = @bias_v = nil
50
+ end
51
+
52
+ @add_zero_attn = add_zero_attn
53
+
54
+ reset_parameters
55
+ end
56
+
57
+ def batch_first?
58
+ !!@batch_first
59
+ end
60
+
61
+ def reset_parameters
62
+ if @qkv_same_embed_dim
63
+ Init.xavier_uniform!(@in_proj_weight)
64
+ else
65
+ Init.xavier_uniform!(@q_proj_weight)
66
+ Init.xavier_uniform!(@k_proj_weight)
67
+ Init.xavier_uniform!(@v_proj_weight)
68
+ end
69
+
70
+ if @in_proj_bias
71
+ Init.constant!(@in_proj_bias, 0.0)
72
+ Init.constant!(@out_proj.bias, 0.0)
73
+ end
74
+
75
+ Init.xavier_uniform!(@bias_k) if @bias_k
76
+ Init.xavier_uniform!(@bias_v) if @bias_v
77
+ end
78
+
79
+ def forward(
80
+ query, key, value,
81
+ key_padding_mask: nil, need_weights: true, attn_mask: nil
82
+ )
83
+
84
+ if batch_first?
85
+ query, key, value = [query, key, value].map { |t| t.transpose(1, 0) }
86
+ end
87
+
88
+ attn_output, attn_output_weights =
89
+ if @qkv_same_embed_dim
90
+ F.multi_head_attention_forward(
91
+ query, key, value,
92
+ @embed_dim, @num_heads,
93
+ @in_proj_weight, @in_proj_bias,
94
+ @bias_k, @bias_v, @add_zero_attn,
95
+ @dropout, @out_proj.weight, @out_proj.bias,
96
+ training: @training,
97
+ key_padding_mask: key_padding_mask,
98
+ need_weights: need_weights,
99
+ attn_mask: attn_mask
100
+ )
101
+ else
102
+ F.multi_head_attention_forward(
103
+ query, key, value,
104
+ @embed_dim, @num_heads,
105
+ @in_proj_weight, @in_proj_bias,
106
+ @bias_k, @bias_v, @add_zero_attn,
107
+ @dropout, @out_proj.weight, @out_proj.bias,
108
+ training: @training,
109
+ key_padding_mask: key_padding_mask,
110
+ need_weights: need_weights,
111
+ attn_mask: attn_mask,
112
+ use_separate_proj_weight: true,
113
+ q_proj_weight: @q_proj_weight, k_proj_weight: @k_proj_weight, v_proj_weight: @v_proj_weight
114
+ )
115
+ end
116
+
117
+ attn_output = attn_output.transpose(1, 0) if batch_first?
118
+
119
+ [attn_output, attn_output_weights]
120
+ end
121
+ end
122
+ end
123
+ end
@@ -9,6 +9,12 @@ module Torch
9
9
  def inspect
10
10
  "Parameter containing:\n#{super}"
11
11
  end
12
+
13
+ def dup
14
+ Torch.no_grad do
15
+ Parameter.new(clone, requires_grad: requires_grad)
16
+ end
17
+ end
12
18
  end
13
19
  end
14
20
  end
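Parameter#dup is what Module#dup_value ends up calling for parameter tensors; a small sketch of the behaviour it is meant to guarantee:

    p = Torch::NN::Parameter.new(Torch.randn(3))
    q = p.dup

    q.requires_grad  # => true, carried over from the original
    q.equal?(p)      # => false; the clone is a distinct object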