RubyGems - torch-rb - Versions diffs - 0.8.1 → 0.9.1 - Mend

torch-rb 0.8.1 → 0.9.1

Files changed (30) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +20 -0
data/README.md +26 -44
data/codegen/generate_functions.rb +13 -5
data/codegen/native_functions.yaml +1103 -373
data/ext/torch/backends.cpp +2 -2
data/ext/torch/ruby_arg_parser.cpp +2 -2
data/ext/torch/ruby_arg_parser.h +19 -5
data/ext/torch/templates.h +0 -37
data/ext/torch/tensor.cpp +8 -8
data/ext/torch/utils.h +0 -6
data/lib/torch/inspector.rb +1 -1
data/lib/torch/nn/convnd.rb +2 -0
data/lib/torch/nn/functional.rb +1 -1
data/lib/torch/nn/functional_attention.rb +241 -0
data/lib/torch/nn/module.rb +30 -0
data/lib/torch/nn/module_list.rb +49 -0
data/lib/torch/nn/multihead_attention.rb +123 -0
data/lib/torch/nn/parameter.rb +6 -0
data/lib/torch/nn/transformer.rb +92 -0
data/lib/torch/nn/transformer_decoder.rb +25 -0
data/lib/torch/nn/transformer_decoder_layer.rb +43 -0
data/lib/torch/nn/transformer_encoder.rb +25 -0
data/lib/torch/nn/transformer_encoder_layer.rb +36 -0
data/lib/torch/nn/utils.rb +12 -0
data/lib/torch/tensor.rb +21 -8
data/lib/torch/utils/data/data_loader.rb +3 -1
data/lib/torch/version.rb +1 -1
data/lib/torch.rb +6 -45
metadata +11 -3

data/ext/torch/backends.cpp CHANGED Viewed

@@ -7,11 +7,11 @@
 void init_backends(Rice::Module& m) {
   auto rb_mBackends = Rice::define_module_under(m, "Backends");
-   Rice::define_module_under(rb_mBackends, "OpenMP")
+  Rice::define_module_under(rb_mBackends, "OpenMP")
     .add_handler<torch::Error>(handle_error)
     .define_singleton_function("available?", &torch::hasOpenMP);
-   Rice::define_module_under(rb_mBackends, "MKL")
+  Rice::define_module_under(rb_mBackends, "MKL")
     .add_handler<torch::Error>(handle_error)
     .define_singleton_function("available?", &torch::hasMKL);
 }

data/ext/torch/ruby_arg_parser.cpp CHANGED Viewed

@@ -472,12 +472,12 @@ static void extra_kwargs(FunctionSignature& signature, VALUE kwargs, ssize_t num
     auto param_idx = find_param(signature, key);
     if (param_idx < 0) {
       rb_raise(rb_eArgError, "%s() got an unexpected keyword argument '%s'",
-          signature.name.c_str(), THPUtils_unpackSymbol(key).c_str());
+          signature.name.c_str(), rb_id2name(rb_to_id(key)));
     }
     if (param_idx < num_pos_args) {
       rb_raise(rb_eArgError, "%s() got multiple values for argument '%s'",
-          signature.name.c_str(), THPUtils_unpackSymbol(key).c_str());
+          signature.name.c_str(), rb_id2name(rb_to_id(key)));
     }
   }

data/ext/torch/ruby_arg_parser.h CHANGED Viewed

@@ -75,7 +75,7 @@ struct RubyArgs {
   int idx;
   inline at::Tensor tensor(int i);
-  inline OptionalTensor optionalTensor(int i);
+  inline c10::optional<at::Tensor> optionalTensor(int i);
   inline at::Scalar scalar(int i);
   // inline at::Scalar scalarWithDefault(int i, at::Scalar default_scalar);
   inline std::vector<at::Scalar> scalarlist(int i);
@@ -109,6 +109,9 @@ struct RubyArgs {
   // inline at::QScheme toQScheme(int i);
   inline std::string string(int i);
   inline c10::optional<std::string> stringOptional(int i);
+  inline c10::string_view stringView(int i);
+  // inline c10::string_view stringViewWithDefault(int i, const c10::string_view default_str);
+  inline c10::optional<c10::string_view> stringViewOptional(int i);
   // inline PyObject* pyobject(int i);
   inline int64_t toInt64(int i);
   // inline int64_t toInt64WithDefault(int i, int64_t default_int);
@@ -125,8 +128,8 @@ inline at::Tensor RubyArgs::tensor(int i) {
   return Rice::detail::From_Ruby<torch::Tensor>().convert(args[i]);
 }
-inline OptionalTensor RubyArgs::optionalTensor(int i) {
-  if (NIL_P(args[i])) return OptionalTensor(Nil);
+inline c10::optional<at::Tensor> RubyArgs::optionalTensor(int i) {
+  if (NIL_P(args[i])) return c10::nullopt;
   return tensor(i);
 }
@@ -232,7 +235,7 @@ inline ScalarType RubyArgs::scalartype(int i) {
   auto it = dtype_map.find(args[i]);
   if (it == dtype_map.end()) {
-    rb_raise(rb_eArgError, "invalid dtype: %s", THPUtils_unpackSymbol(args[i]).c_str());
+    rb_raise(rb_eArgError, "invalid dtype: %s", rb_id2name(rb_to_id(args[i])));
   }
   return it->second;
 }
@@ -290,7 +293,7 @@ inline c10::optional<at::Layout> RubyArgs::layoutOptional(int i) {
   auto it = layout_map.find(args[i]);
   if (it == layout_map.end()) {
-    rb_raise(rb_eArgError, "invalid layout: %s", THPUtils_unpackSymbol(args[i]).c_str());
+    rb_raise(rb_eArgError, "invalid layout: %s", rb_id2name(rb_to_id(args[i])));
   }
   return it->second;
 }
@@ -322,6 +325,17 @@ inline c10::optional<std::string> RubyArgs::stringOptional(int i) {
   return Rice::detail::From_Ruby<std::string>().convert(args[i]);
 }
+// string_view does not own data
+inline c10::string_view RubyArgs::stringView(int i) {
+  return c10::string_view(RSTRING_PTR(args[i]), RSTRING_LEN(args[i]));
+}
+// string_view does not own data
+inline c10::optional<c10::string_view> RubyArgs::stringViewOptional(int i) {
+  if (NIL_P(args[i])) return c10::nullopt;
+  return c10::string_view(RSTRING_PTR(args[i]), RSTRING_LEN(args[i]));
+}
 inline int64_t RubyArgs::toInt64(int i) {
   if (NIL_P(args[i])) return signature.params[i].default_int;
   return Rice::detail::From_Ruby<int64_t>().convert(args[i]);

data/ext/torch/templates.h CHANGED Viewed

@@ -41,24 +41,6 @@ using torch::nn::init::NonlinearityType;
 #define RETURN_NIL                                                   \
   return Qnil;
-class OptionalTensor {
-  torch::Tensor value;
-  public:
-    OptionalTensor(Object o) {
-      if (o.is_nil()) {
-        value = {};
-      } else {
-        value = Rice::detail::From_Ruby<torch::Tensor>().convert(o.value());
-      }
-    }
-    OptionalTensor(torch::Tensor o) {
-      value = o;
-    }
-    operator torch::Tensor() const {
-      return value;
-    }
-};
 namespace Rice::detail
 {
   template<>
@@ -131,25 +113,6 @@ namespace Rice::detail
     }
   };
-  template<>
-  struct Type<OptionalTensor>
-  {
-    static bool verify()
-    {
-      return true;
-    }
-  };
-  template<>
-  class From_Ruby<OptionalTensor>
-  {
-  public:
-    OptionalTensor convert(VALUE x)
-    {
-      return OptionalTensor(x);
-    }
-  };
   template<>
   struct Type<Scalar>
   {

data/ext/torch/tensor.cpp CHANGED Viewed

@@ -107,7 +107,7 @@ static VALUE tensor__backward(int argc, VALUE* argv, VALUE self_)
   ParsedArgs<4> parsed_args;
   auto _r = parser.parse(self_, argc, argv, parsed_args);
   // _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
-  auto dispatch__backward = [](const Tensor & self, TensorList inputs, const OptionalTensor & gradient, c10::optional<bool> retain_graph, bool create_graph) -> void {
+  auto dispatch__backward = [](const Tensor & self, TensorList inputs, const c10::optional<at::Tensor> & gradient, c10::optional<bool> retain_graph, bool create_graph) -> void {
     // in future, release GVL
     self._backward(inputs, gradient, retain_graph, create_graph);
   };
@@ -125,13 +125,13 @@ void init_tensor(Rice::Module& m, Rice::Class& c, Rice::Class& rb_cTensorOptions
   rb_define_method(rb_cTensor, "backward", (VALUE (*)(...)) tensor__backward, -1);
   rb_cTensor
-    .define_method("cuda?", &torch::Tensor::is_cuda)
-    .define_method("sparse?", &torch::Tensor::is_sparse)
-    .define_method("quantized?", &torch::Tensor::is_quantized)
-    .define_method("dim", &torch::Tensor::dim)
-    .define_method("numel", &torch::Tensor::numel)
-    .define_method("element_size", &torch::Tensor::element_size)
-    .define_method("requires_grad", &torch::Tensor::requires_grad)
+    .define_method("cuda?", [](Tensor& self) { return self.is_cuda(); })
+    .define_method("sparse?", [](Tensor& self) { return self.is_sparse(); })
+    .define_method("quantized?", [](Tensor& self) { return self.is_quantized(); })
+    .define_method("dim", [](Tensor& self) { return self.dim(); })
+    .define_method("numel", [](Tensor& self) { return self.numel(); })
+    .define_method("element_size", [](Tensor& self) { return self.element_size(); })
+    .define_method("requires_grad", [](Tensor& self) { return self.requires_grad(); })
     .define_method(
       "_size",
       [](Tensor& self, int64_t dim) {

data/ext/torch/utils.h CHANGED Viewed

@@ -16,12 +16,6 @@ inline VALUE THPUtils_internSymbol(const std::string& str) {
   return Rice::Symbol(str);
 }
-inline std::string THPUtils_unpackSymbol(VALUE obj) {
-  Check_Type(obj, T_SYMBOL);
-  obj = rb_funcall(obj, rb_intern("to_s"), 0);
-  return std::string(RSTRING_PTR(obj), RSTRING_LEN(obj));
-}
 inline std::string THPUtils_unpackString(VALUE obj) {
   Check_Type(obj, T_STRING);
   return std::string(RSTRING_PTR(obj), RSTRING_LEN(obj));

data/lib/torch/inspector.rb CHANGED Viewed

@@ -247,7 +247,7 @@ module Torch
       # length includes spaces and comma between elements
       element_length = formatter.width + 2
       elements_per_line = [1, ((PRINT_OPTS[:linewidth] - indent) / element_length.to_f).floor.to_i].max
-      char_per_line = element_length * elements_per_line
+      _char_per_line = element_length * elements_per_line
       if summarize && slf.size(0) > 2 * PRINT_OPTS[:edgeitems]
         data = (

data/lib/torch/nn/convnd.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 module Torch
   module NN
     class ConvNd < Module
+      attr_reader :in_channels, :out_channels, :kernel_size, :stride, :padding, :dilation, :transposed, :output_paddding, :groups, :padding_mode
       def initialize(in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode)
         super()
         raise ArgumentError, "in_channels must be divisible by groups" if in_channels % groups != 0

data/lib/torch/nn/functional.rb CHANGED Viewed

@@ -571,7 +571,7 @@ module Torch
         end
         def _interp_output_size(closed_over_args)
-          input, size, scale_factor, recompute_scale_factor = closed_over_args
+          input, size, scale_factor, _recompute_scale_factor = closed_over_args
           dim = input.dim - 2
           if size.nil? && scale_factor.nil?
             raise ArgumentError, "either size or scale_factor should be defined"

data/lib/torch/nn/functional_attention.rb ADDED Viewed

@@ -0,0 +1,241 @@
+module Torch
+  module NN
+    class Functional
+      class << self
+        def in_projection_packed(q, k, v, w, b: nil)
+          e = q.size(-1)
+          if k.eql? v
+            if q.eql? k
+              # self-attention
+              return linear(q, w, b).chunk(3, dim: -1)
+            else
+              # encoder-decoder attention
+              w_q, w_kv = w.split_with_sizes([e, e * 2])
+              if b.nil?
+                b_q = b_kv = nil
+              else
+                b_q, b_kv = b.split_with_sizes([e, e * 2])
+              end
+              return [linear(q, w_q, b_q), *linear(k, w_kv, b_kv).chunk(2, dim: -1)]
+            end
+          else
+            w_q, w_k, w_v = w.chunk(3)
+            if b.nil?
+              b_q = b_k = b_v = nil
+            else
+              b_q, b_k, b_v = b.chunk(3)
+            end
+            return [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+          end
+        end
+        def in_projection(
+          q, k, v,
+          w_q, w_k, w_v,
+          b_q: nil, b_k: nil, b_v: nil
+        )
+          e_q, e_k, e_v = q.size(-1), k.size(-1), v.size(-1)
+          raise ArgumentError, "Expecting query weights shape of #{[e_q, e_q]}, but got #{w_q.shape}" unless w_q.shape == [e_q, e_q]
+          raise ArgumentError, "Expecting key weights shape of #{[e_k, e_k]}, but got #{w_k.shape}" unless w_k.shape == [e_k, e_k]
+          raise ArgumentError, "Expecting value weights shape of #{[e_v, e_v]}, but got #{w_v.shape}" unless w_v.shape == [e_v, e_v]
+          raise ArgumentError, "Expecting query bias shape of #{[e_q]}, but got #{b_q.shape}" if b_q && b_q.shape != [e_q]
+          raise ArgumentError, "Expecting key bias shape of #{[e_k]}, but got #{b_k.shape}" if b_k && b_k.shape != [e_k]
+          raise ArgumentError, "Expecting value bias shape of #{[e_v]}, but got #{b_v.shape}" if b_v && b_v.shape != [e_v]
+          [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+        end
+        def scaled_dot_product_attention(
+          q, k, v,
+          attn_mask: nil, dropout_p: 0.0
+        )
+          _b, _nt, e = q.shape
+          q = q / Math.sqrt(e)
+          attn = Torch.bmm(q, k.transpose(-2, -1))
+          attn += attn_mask if attn_mask
+          attn = softmax(attn, dim: -1)
+          attn = dropout(attn, p: dropout_p) if dropout_p > 0
+          output = Torch.bmm(attn, v)
+          [output, attn]
+        end
+        def multi_head_attention_forward(
+          query, key, value,
+          embed_dim_to_check, num_heads,
+          in_proj_weight, in_proj_bias,
+          bias_k, bias_v,
+          add_zero_attn,
+          dropout_p,
+          out_proj_weight, out_proj_bias,
+          training: true,
+          key_padding_mask: nil,
+          need_weights: true,
+          attn_mask: nil,
+          use_separate_proj_weight: false,
+          q_proj_weight: nil, k_proj_weight: nil, v_proj_weight: nil,
+          static_k: nil, static_v: nil
+        )
+          tgt_len, bsz, embed_dim = query.shape
+          src_len = key.shape.first
+          raise ArgumentError, "Was expecting embedding dimension of #{embed_dim_to_check}, but got #{embed_dim}" unless embed_dim == embed_dim_to_check
+          head_dim = if embed_dim.is_a?(Torch::Tensor)
+            embed_dim.div(num_heads, rounding_mode: 'trunc')
+          else
+            head_dim = embed_dim.div num_heads
+          end
+          if use_separate_proj_weight
+            raise ArgumentError, "Key's sequence and batch dims #{key.shape[0...2]} do not match value's #{value.shape[0...2]}" unless key.shape[0...2] == value.shape[0...2]
+          else
+            raise ArgumentError, "Key shape #{key.shape} does not match value shape #{value.shape}" unless key.shape == value.shape
+          end
+          # compute in-projection
+          q, k, v =
+            if use_separate_proj_weight
+              raise ArgumentError, "use_separate_proj_weight is true but q_proj_weight is nil" unless q_proj_weight
+              raise ArgumentError, "use_separate_proj_weight is true but k_proj_weight is nil" unless k_proj_weight
+              raise ArgumentError, "use_separate_proj_weight is true but v_proj_weight is nil" unless v_proj_weight
+              if in_proj_bias
+                b_q, b_k, b_v = in_proj_bias.chunk(3)
+              else
+                b_q = b_k = b_v = nil
+              end
+              in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q: b_q, b_k: b_k, b_v: b_v)
+            else
+              in_projection_packed(query, key, value, in_proj_weight, b: in_proj_bias)
+            end
+          # prep attention mask
+          if attn_mask
+            if attn_mask.dtype == :uint8
+              puts "[WARN] Byte tensor for attn_mask in Multihead Attention is deprecated. Use bool tensor instead."
+              attn_mask = attn_mask.bool
+            else
+              raise ArgumentError, "Only float, byte, and bool types are supported for attn_mask, not #{attn_mask.dtype}" unless attn_mask.floating_point? || attn_mask.dtype == :bool
+            end
+            if attn_mask.dim == 2
+              correct_2d_size = [tgt_len, src_len]
+              raise ArgumentError, "The shape of the 2D attn_mask is #{attn_mask.shape}, but should be #{correct_2d_size}." unless attn_mask.shape == correct_2d_size
+              attn_mask = attn_mask.unsqueeze(0)
+            elsif attn_mask.dim == 3
+              correct_3d_size = [bsz * num_heads, tgt_len, src_len]
+              raise ArgumentError, "The shape of the 3D attn_mask is #{attn_mask.shape}, but should be #{correct_3d_size}." unless attn_mask.shape == correct_3d_size
+            else
+              raise ArgumentError, "attn_mask's dimension #{attn_mask.dim} is not supported"
+            end
+          end
+          # prep key padding mask
+          if key_padding_mask && key_padding_mask.dtype == :uint8
+            puts "[WARN] Byte tensor for key_padding_mask in Multihead Attention is deprecated. Use bool tensor instead."
+            key_padding_mask = key_padding_mask.bool
+          end
+          # add bias along batch dimension (currently second)
+          if bias_k && bias_v
+            raise ArgumentError, "bias cannot be added to static key." if static_k
+            raise ArgumentError, "bias cannot be added to static value." if static_v
+            k = Torch.cat([k, bias_k.repeat(1, bsz, 1)])
+            v = Torch.cat([v, bias_v.repeat(1, bsz, 1)])
+            attn_mask = pad(attn_mask, [0, 1]) if attn_mask
+            key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask
+          else
+            raise ArgumentError unless bias_k.nil?
+            raise ArgumentError unless bias_v.nil?
+          end
+          # reshape q, k, v for multihead attention and make em batch first
+          q = q.contiguous.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+          if static_k.nil?
+            k = k.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+          else
+            raise ArgumentError, "Expecting static_k.size(0) of #{bsz * num_heads}, but got #{static_k.size(0)}" unless static_k.size(0) == bsz * num_heads
+            raise ArgumentError, "Expecting static_k.size(2) of #{head_dim}, but got #{static_k.size(2)}" unless static_k.size(2) == head_dim
+            k = static_k
+          end
+          if static_v.nil?
+            v = v.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+          else
+            raise ArgumentError, "Expecting static_v.size(0) of #{bsz * num_heads}, but got #{static_v.size(0)}" unless static_v.size(0) == bsz * num_heads
+            raise ArgumentError, "Expecting static_v.size(2) of #{head_dim}, but got #{static_v.size(2)}" unless static_v.size(2) == head_dim
+            v = static_v
+          end
+          # add zero attention along batch dimension (now first)
+          if add_zero_attn
+            zero_attn_shape = [bsz * num_heads, 1, head_dim]
+            k = Torch.cat([k, Torch.zeros(zero_attn_shape, dtype: k.dtype, device: k.device)], dim: 1)
+            v = Torch.cat([v, Torch.zeros(zero_attn_shape, dtype: v.dtype, device: v.device)], dim: 1)
+            attn_mask = pad(attn_mask, [0, 1]) if attn_mask
+            key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask
+          end
+          # update source sequence length after adjustments
+          src_len = k.size(1)
+          # merge key padding and attention masks
+          if key_padding_mask
+            raise ArgumentError, "Expecting key_padding_mask shape of #{[bsz, src_len]}, but got #{key_padding_mask.shape}" unless key_padding_mask.shape == [bsz, src_len]
+            key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
+            attn_mask = if attn_mask.nil?
+              key_padding_mask
+            elsif attn_mask.dtype == :bool
+              attn_mask.logical_or(key_padding_mask)
+            else
+              attn_mask.masked_fill(key_padding_mask, -Float::INFINITY)
+            end
+          end
+          # convert mask to float
+          if attn_mask && attn_mask.dtype == :bool
+            new_attn_mask = Torch.zeros_like(attn_mask, dtype: :float32)
+            attn_mask = new_attn_mask.masked_fill(attn_mask, -Float::INFINITY)
+          end
+          dropout_p = 0.0 unless training
+          # (deep breath) calculate attention and out projection
+          attn_output, attn_output_weights = scaled_dot_product_attention(q, k, v, attn_mask: attn_mask, dropout_p: dropout_p)
+          attn_output = attn_output.transpose(0, 1).contiguous.view(tgt_len, bsz, embed_dim)
+          attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+          if need_weights
+            # average attention weights over heads
+            attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+            [attn_output, attn_output_weights.sum(dim: 1) / num_heads]
+          else
+            [attn_output, nil]
+          end
+        end
+      end
+    end
+  end
+end

data/lib/torch/nn/module.rb CHANGED Viewed

@@ -3,6 +3,8 @@ module Torch
     class Module
       include Utils
+      attr_reader :training
       def initialize
         @training = true
         @parameters = {}
@@ -278,6 +280,11 @@ module Torch
         end
       end
+      def deep_dup
+        memo = {}
+        dup_value(self, memo)
+      end
       def method_missing(method, *args, &block)
         name = method.to_s
         if named_parameters.key?(name)
@@ -386,6 +393,29 @@ module Torch
           destination[prefix + k] = v
         end
       end
+      # keep memo hash like Python deepcopy
+      # https://docs.python.org/3/library/copy.html
+      def dup_value(v, memo)
+        memo[v.object_id] ||= begin
+          case v
+          when Method, UnboundMethod
+            v
+          when Hash
+            v.to_h { |k, v2| [dup_value(k, memo), dup_value(v2, memo)] }
+          when Array
+            v.map { |v2| dup_value(v2, memo) }
+          when Torch::NN::Module
+            copy = v.dup
+            v.instance_variables.each do |var|
+              copy.instance_variable_set(var, dup_value(v.instance_variable_get(var), memo))
+            end
+            copy
+          else
+            v.dup
+          end
+        end
+      end
     end
   end
 end

data/lib/torch/nn/module_list.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module Torch
+  module NN
+    class ModuleList < Module
+      include Enumerable
+      def initialize(mods = nil)
+        super()
+        self.concat(mods) if mods
+      end
+      def length
+        @modules.length
+      end
+      alias_method :count, :length
+      alias_method :size, :length
+      def concat(mods)
+        raise ArgumentError, "Modules should respond to #each" unless mods.respond_to?(:each)
+        mods.each { |m| append m }
+        self
+      end
+      def each(&block)
+        if block_given?
+          @modules.values.each(&block)
+        else
+          to_enum(:each)
+        end
+      end
+      def append(mod)
+        raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module)
+        add_module(length.to_s, mod)
+        self
+      end
+      def [](idx)
+        if idx.is_a?(Range)
+          self.class.new(@modules.values[idx])
+        else
+          @modules[idx.to_s]
+        end
+      end
+    end
+  end
+end

data/lib/torch/nn/multihead_attention.rb ADDED Viewed

@@ -0,0 +1,123 @@
+module Torch
+  module NN
+    class MultiheadAttention < Module
+      def initialize(
+        embed_dim, num_heads,
+        dropout: 0.0, bias: true, add_bias_kv: false, add_zero_attn: false,
+        kdim: nil, vdim: nil, batch_first: false, device: nil, dtype: nil
+      )
+        super()
+        @embed_dim = embed_dim
+        @kdim = kdim || @embed_dim
+        @vdim = vdim || @embed_dim
+        @qkv_same_embed_dim = @kdim == @embed_dim && @vdim == @embed_dim
+        @num_heads = num_heads
+        @dropout = dropout
+        @batch_first = batch_first
+        @head_dim = @embed_dim.div @num_heads
+        raise ArgumentError, "embed_dim must be divisible by num_heads" unless @head_dim * @num_heads == @embed_dim
+        if @qkv_same_embed_dim
+          @in_proj_weight = Parameter.new(Torch.empty([3 * @embed_dim, @embed_dim]))
+          %w(q k v).each { |x| register_parameter("#{x}_proj_weight", nil) }
+        else
+          @q_proj_weight = Parameter.new(Torch.empty([@embed_dim, @embed_dim]))
+          @k_proj_weight = Parameter.new(Torch.empty([@embed_dim, @kdim]))
+          @v_proj_weight = Parameter.new(Torch.empty([@embed_dim, @vdim]))
+          register_parameter('in_proj_weight', nil)
+        end
+        if bias
+          @in_proj_bias = Parameter.new(Torch.empty(3 * @embed_dim))
+        else
+          register_parameter('in_proj_bias', nil)
+        end
+        @out_proj = Linear.new(@embed_dim, @embed_dim, bias: bias)
+        if add_bias_kv
+          @bias_k = Parameter.new(Torch.empty([1, 1, @embed_dim]))
+          @bias_v = Parameter.new(Torch.empty([1, 1, @embed_dim]))
+        else
+          @bias_k = @bias_v = nil
+        end
+        @add_zero_attn = add_zero_attn
+        reset_parameters
+      end
+      def batch_first?
+        !!@batch_first
+      end
+      def reset_parameters
+        if @qkv_same_embed_dim
+          Init.xavier_uniform!(@in_proj_weight)
+        else
+          Init.xavier_uniform!(@q_proj_weight)
+          Init.xavier_uniform!(@k_proj_weight)
+          Init.xavier_uniform!(@v_proj_weight)
+        end
+        if @in_proj_bias
+          Init.constant!(@in_proj_bias, 0.0)
+          Init.constant!(@out_proj.bias, 0.0)
+        end
+        Init.xavier_uniform!(@bias_k) if @bias_k
+        Init.xavier_uniform!(@bias_v) if @bias_v
+      end
+      def forward(
+        query, key, value,
+        key_padding_mask: nil, need_weights: true, attn_mask: nil
+      )
+        if batch_first?
+          query, key, value = [query, key, value].map { |t| t.transpose(1, 0) }
+        end
+        attn_output, attn_output_weights =
+          if @qkv_same_embed_dim
+            F.multi_head_attention_forward(
+              query, key, value,
+              @embed_dim, @num_heads,
+              @in_proj_weight, @in_proj_bias,
+              @bias_k, @bias_v, @add_zero_attn,
+              @dropout, @out_proj.weight, @out_proj.bias,
+              training: @training,
+              key_padding_mask: key_padding_mask,
+              need_weights: need_weights,
+              attn_mask: attn_mask
+            )
+          else
+            F.multi_head_attention_forward(
+              query, key, value,
+              @embed_dim, @num_heads,
+              @in_proj_weight, @in_proj_bias,
+              @bias_k, @bias_v, @add_zero_attn,
+              @dropout, @out_proj.weight, @out_proj.bias,
+              training: @training,
+              key_padding_mask: key_padding_mask,
+              need_weights: need_weights,
+              attn_mask: attn_mask,
+              use_separate_proj_weight: true,
+              q_proj_weight: @q_proj_weight, k_proj_weight: @k_proj_weight, v_proj_weight: @v_proj_weight
+            )
+          end
+        attn_output = attn_output.transpose(1, 0) if batch_first?
+        [attn_output, attn_output_weights]
+      end
+    end
+  end
+end