torch-rb 0.6.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +21 -0
  3. data/README.md +23 -41
  4. data/codegen/function.rb +2 -0
  5. data/codegen/generate_functions.rb +43 -6
  6. data/codegen/native_functions.yaml +2007 -1327
  7. data/ext/torch/backends.cpp +17 -0
  8. data/ext/torch/cuda.cpp +5 -5
  9. data/ext/torch/device.cpp +13 -6
  10. data/ext/torch/ext.cpp +22 -5
  11. data/ext/torch/extconf.rb +1 -3
  12. data/ext/torch/fft.cpp +13 -0
  13. data/ext/torch/fft_functions.h +6 -0
  14. data/ext/torch/ivalue.cpp +31 -33
  15. data/ext/torch/linalg.cpp +13 -0
  16. data/ext/torch/linalg_functions.h +6 -0
  17. data/ext/torch/nn.cpp +34 -34
  18. data/ext/torch/random.cpp +5 -5
  19. data/ext/torch/ruby_arg_parser.cpp +2 -2
  20. data/ext/torch/ruby_arg_parser.h +23 -12
  21. data/ext/torch/special.cpp +13 -0
  22. data/ext/torch/special_functions.h +6 -0
  23. data/ext/torch/templates.h +111 -133
  24. data/ext/torch/tensor.cpp +80 -67
  25. data/ext/torch/torch.cpp +30 -21
  26. data/ext/torch/utils.h +3 -4
  27. data/ext/torch/wrap_outputs.h +72 -65
  28. data/lib/torch/inspector.rb +5 -2
  29. data/lib/torch/nn/convnd.rb +2 -0
  30. data/lib/torch/nn/functional_attention.rb +241 -0
  31. data/lib/torch/nn/module.rb +2 -0
  32. data/lib/torch/nn/module_list.rb +49 -0
  33. data/lib/torch/nn/multihead_attention.rb +123 -0
  34. data/lib/torch/nn/transformer.rb +92 -0
  35. data/lib/torch/nn/transformer_decoder.rb +25 -0
  36. data/lib/torch/nn/transformer_decoder_layer.rb +43 -0
  37. data/lib/torch/nn/transformer_encoder.rb +25 -0
  38. data/lib/torch/nn/transformer_encoder_layer.rb +36 -0
  39. data/lib/torch/nn/utils.rb +16 -0
  40. data/lib/torch/tensor.rb +2 -0
  41. data/lib/torch/utils/data/data_loader.rb +2 -0
  42. data/lib/torch/version.rb +1 -1
  43. data/lib/torch.rb +11 -0
  44. metadata +20 -5
data/ext/torch/torch.cpp CHANGED
@@ -1,6 +1,6 @@
 #include <torch/torch.h>
 
-#include <rice/Module.hpp>
+#include <rice/rice.hpp>
 
 #include "torch_functions.h"
 #include "templates.h"
@@ -9,69 +9,78 @@
 void init_torch(Rice::Module& m) {
   m.add_handler<torch::Error>(handle_error);
   add_torch_functions(m);
-  m.define_singleton_method(
+  m.define_singleton_function(
     "grad_enabled?",
-    *[]() {
+    []() {
       return torch::GradMode::is_enabled();
     })
-    .define_singleton_method(
+    .define_singleton_function(
      "_set_grad_enabled",
-      *[](bool enabled) {
+      [](bool enabled) {
        torch::GradMode::set_enabled(enabled);
      })
-    .define_singleton_method(
+    .define_singleton_function(
      "manual_seed",
-      *[](uint64_t seed) {
+      [](uint64_t seed) {
        return torch::manual_seed(seed);
      })
     // config
-    .define_singleton_method(
+    .define_singleton_function(
      "show_config",
-      *[] {
+      [] {
        return torch::show_config();
      })
-    .define_singleton_method(
+    .define_singleton_function(
      "parallel_info",
-      *[] {
+      [] {
        return torch::get_parallel_info();
      })
     // begin operations
-    .define_singleton_method(
+    .define_singleton_function(
      "_save",
-      *[](const torch::IValue &value) {
+      [](const torch::IValue &value) {
        auto v = torch::pickle_save(value);
        std::string str(v.begin(), v.end());
        return str;
      })
-    .define_singleton_method(
+    .define_singleton_function(
      "_load",
-      *[](const std::string &s) {
+      [](const std::string &s) {
        std::vector<char> v;
        std::copy(s.begin(), s.end(), std::back_inserter(v));
        // https://github.com/pytorch/pytorch/issues/20356#issuecomment-567663701
        return torch::pickle_load(v);
      })
-    .define_singleton_method(
+    .define_singleton_function(
      "_from_blob",
-      *[](Rice::String s, std::vector<int64_t> size, const torch::TensorOptions &options) {
+      [](Rice::String s, std::vector<int64_t> size, const torch::TensorOptions &options) {
        void *data = const_cast<char *>(s.c_str());
        return torch::from_blob(data, size, options);
      })
-    .define_singleton_method(
+    .define_singleton_function(
      "_tensor",
-      *[](Rice::Array a, std::vector<int64_t> size, const torch::TensorOptions &options) {
+      [](Rice::Array a, std::vector<int64_t> size, const torch::TensorOptions &options) {
        auto dtype = options.dtype();
        torch::Tensor t;
        if (dtype == torch::kBool) {
          std::vector<uint8_t> vec;
          for (long i = 0; i < a.size(); i++) {
-           vec.push_back(from_ruby<bool>(a[i]));
+           vec.push_back(Rice::detail::From_Ruby<bool>().convert(a[i].value()));
+         }
+         t = torch::tensor(vec, options);
+       } else if (dtype == torch::kComplexFloat || dtype == torch::kComplexDouble) {
+         // TODO use template
+         std::vector<c10::complex<double>> vec;
+         Object obj;
+         for (long i = 0; i < a.size(); i++) {
+           obj = a[i];
+           vec.push_back(c10::complex<double>(Rice::detail::From_Ruby<double>().convert(obj.call("real").value()), Rice::detail::From_Ruby<double>().convert(obj.call("imag").value())));
          }
          t = torch::tensor(vec, options);
        } else {
          std::vector<float> vec;
          for (long i = 0; i < a.size(); i++) {
-           vec.push_back(from_ruby<float>(a[i]));
+           vec.push_back(Rice::detail::From_Ruby<float>().convert(a[i].value()));
          }
          // hack for requires_grad error
          if (options.requires_grad()) {
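The new kComplexFloat/kComplexDouble branch means _tensor can now build complex tensors from Ruby Complex objects, reading each element's real and imag parts via obj.call. A minimal sketch of what this enables from Ruby (hedged: it assumes the gem maps the :complex64 dtype symbol to torch::kComplexFloat, consistent with the branch above):

    require "torch"

    # each Complex element is split into real/imag on the C++ side
    x = Torch.tensor([Complex(1, 2), Complex(3, -4)], dtype: :complex64)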
data/ext/torch/utils.h CHANGED
@@ -1,11 +1,10 @@
 #pragma once
 
-#include <rice/Exception.hpp>
-#include <rice/Symbol.hpp>
+#include <rice/rice.hpp>
+#include <rice/stl.hpp>
 
 // TODO find better place
-inline void handle_error(torch::Error const & ex)
-{
+inline void handle_error(torch::Error const & ex) {
   throw Rice::Exception(rb_eRuntimeError, ex.what_without_backtrace());
 }
 
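handle_error translates any torch::Error raised inside libtorch into a plain Ruby RuntimeError (minus the C++ backtrace), so native failures can be rescued normally. A small illustration (a sketch; the exact message text comes from libtorch):

    require "torch"

    begin
      Torch.ones(2, 3) + Torch.ones(4, 5) # shape mismatch raised inside libtorch
    rescue RuntimeError => e
      puts e.message # libtorch's error message, without the C++ backtrace
    end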
data/ext/torch/wrap_outputs.h CHANGED
@@ -1,99 +1,106 @@
 #pragma once
 
 #include <torch/torch.h>
-#include <rice/Object.hpp>
+#include <rice/rice.hpp>
 
-inline Object wrap(bool x) {
-  return to_ruby<bool>(x);
+inline VALUE wrap(bool x) {
+  return Rice::detail::To_Ruby<bool>().convert(x);
 }
 
-inline Object wrap(int64_t x) {
-  return to_ruby<int64_t>(x);
+inline VALUE wrap(int64_t x) {
+  return Rice::detail::To_Ruby<int64_t>().convert(x);
 }
 
-inline Object wrap(double x) {
-  return to_ruby<double>(x);
+inline VALUE wrap(double x) {
+  return Rice::detail::To_Ruby<double>().convert(x);
 }
 
-inline Object wrap(torch::Tensor x) {
-  return to_ruby<torch::Tensor>(x);
+inline VALUE wrap(torch::Tensor x) {
+  return Rice::detail::To_Ruby<torch::Tensor>().convert(x);
 }
 
-inline Object wrap(torch::Scalar x) {
-  return to_ruby<torch::Scalar>(x);
+inline VALUE wrap(torch::Scalar x) {
+  return Rice::detail::To_Ruby<torch::Scalar>().convert(x);
 }
 
-inline Object wrap(torch::ScalarType x) {
-  return to_ruby<torch::ScalarType>(x);
+inline VALUE wrap(torch::ScalarType x) {
+  return Rice::detail::To_Ruby<torch::ScalarType>().convert(x);
 }
 
-inline Object wrap(torch::QScheme x) {
-  return to_ruby<torch::QScheme>(x);
+inline VALUE wrap(torch::QScheme x) {
+  return Rice::detail::To_Ruby<torch::QScheme>().convert(x);
 }
 
-inline Object wrap(std::tuple<torch::Tensor, torch::Tensor> x) {
-  Array a;
-  a.push(to_ruby<torch::Tensor>(std::get<0>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<1>(x)));
-  return Object(a);
+inline VALUE wrap(std::tuple<torch::Tensor, torch::Tensor> x) {
+  return rb_ary_new3(
+    2,
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<0>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<1>(x))
+  );
 }
 
-inline Object wrap(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> x) {
-  Array a;
-  a.push(to_ruby<torch::Tensor>(std::get<0>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<1>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<2>(x)));
-  return Object(a);
+inline VALUE wrap(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> x) {
+  return rb_ary_new3(
+    3,
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<0>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<1>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<2>(x))
+  );
 }
 
-inline Object wrap(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> x) {
-  Array a;
-  a.push(to_ruby<torch::Tensor>(std::get<0>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<1>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<2>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<3>(x)));
-  return Object(a);
+inline VALUE wrap(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> x) {
+  return rb_ary_new3(
+    4,
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<0>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<1>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<2>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<3>(x))
+  );
 }
 
-inline Object wrap(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> x) {
-  Array a;
-  a.push(to_ruby<torch::Tensor>(std::get<0>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<1>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<2>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<3>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<4>(x)));
-  return Object(a);
+inline VALUE wrap(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> x) {
+  return rb_ary_new3(
+    5,
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<0>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<1>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<2>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<3>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<4>(x))
+  );
 }
 
-inline Object wrap(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, int64_t> x) {
-  Array a;
-  a.push(to_ruby<torch::Tensor>(std::get<0>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<1>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<2>(x)));
-  a.push(to_ruby<int64_t>(std::get<3>(x)));
-  return Object(a);
+inline VALUE wrap(std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, int64_t> x) {
+  return rb_ary_new3(
+    4,
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<0>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<1>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<2>(x)),
+    Rice::detail::To_Ruby<int64_t>().convert(std::get<3>(x))
+  );
 }
 
-inline Object wrap(std::tuple<torch::Tensor, torch::Tensor, double, int64_t> x) {
-  Array a;
-  a.push(to_ruby<torch::Tensor>(std::get<0>(x)));
-  a.push(to_ruby<torch::Tensor>(std::get<1>(x)));
-  a.push(to_ruby<double>(std::get<2>(x)));
-  a.push(to_ruby<int64_t>(std::get<3>(x)));
-  return Object(a);
+inline VALUE wrap(std::tuple<torch::Tensor, torch::Tensor, double, int64_t> x) {
+  return rb_ary_new3(
+    4,
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<0>(x)),
+    Rice::detail::To_Ruby<torch::Tensor>().convert(std::get<1>(x)),
+    Rice::detail::To_Ruby<double>().convert(std::get<2>(x)),
+    Rice::detail::To_Ruby<int64_t>().convert(std::get<3>(x))
+  );
 }
 
-inline Object wrap(torch::TensorList x) {
-  Array a;
-  for (auto& t : x) {
-    a.push(to_ruby<torch::Tensor>(t));
+inline VALUE wrap(torch::TensorList x) {
+  auto a = rb_ary_new2(x.size());
+  for (auto t : x) {
+    rb_ary_push(a, Rice::detail::To_Ruby<torch::Tensor>().convert(t));
   }
-  return Object(a);
+  return a;
 }
 
-inline Object wrap(std::tuple<double, double> x) {
-  Array a;
-  a.push(to_ruby<double>(std::get<0>(x)));
-  a.push(to_ruby<double>(std::get<1>(x)));
-  return Object(a);
+inline VALUE wrap(std::tuple<double, double> x) {
+  return rb_ary_new3(
+    2,
+    Rice::detail::To_Ruby<double>().convert(std::get<0>(x)),
+    Rice::detail::To_Ruby<double>().convert(std::get<1>(x))
+  );
 }
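Every wrap overload now returns a raw VALUE built with rb_ary_new3/rb_ary_push instead of a Rice::Object, but the Ruby-visible behavior is unchanged: native ops returning tuples still surface as plain arrays that destructure cleanly. A sketch (assuming the generated binding for topk, which returns a std::tuple<Tensor, Tensor>):

    require "torch"

    x = Torch.tensor([1.0, 5.0, 3.0])
    values, indices = x.topk(2) # the tuple arrives as a two-element Ruby Array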
data/lib/torch/inspector.rb CHANGED
@@ -96,8 +96,11 @@ module Torch
         ret = "%.#{PRINT_OPTS[:precision]}f" % value
       end
     elsif @complex_dtype
-      p = PRINT_OPTS[:precision]
-      raise NotImplementedYet
+      # TODO use float formatter for each part
+      precision = PRINT_OPTS[:precision]
+      imag = value.imag
+      sign = imag >= 0 ? "+" : "-"
+      ret = "%.#{precision}f#{sign}%.#{precision}fi" % [value.real, value.imag.abs]
     else
       ret = value.to_s
     end
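With this change, inspecting a complex tensor no longer raises NotImplementedYet; each element is rendered as real±imagi using the configured precision. Roughly (output is approximate and assumes the default precision of 4):

    require "torch"

    x = Torch.tensor([Complex(1, -2)], dtype: :complex64)
    puts x.inspect # => something like: tensor([1.0000-2.0000i])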
data/lib/torch/nn/convnd.rb CHANGED
@@ -1,6 +1,8 @@
 module Torch
   module NN
     class ConvNd < Module
+      attr_reader :in_channels, :out_channels, :kernel_size, :stride, :padding, :dilation, :transposed, :output_paddding, :groups, :padding_mode
+
       def initialize(in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode)
         super()
         raise ArgumentError, "in_channels must be divisible by groups" if in_channels % groups != 0
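The new readers expose constructor arguments that were previously only stored in instance variables (note that the output-padding reader is spelled :output_paddding in this release). A quick sketch against a Conv2d subclass:

    require "torch"

    conv = Torch::NN::Conv2d.new(3, 16, 3, stride: 2)
    conv.in_channels  # => 3
    conv.out_channels # => 16
    conv.stride       # => the stride stored as a pair, e.g. [2, 2]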
data/lib/torch/nn/functional_attention.rb ADDED
@@ -0,0 +1,241 @@
+module Torch
+  module NN
+    class Functional
+      class << self
+        def in_projection_packed(q, k, v, w, b: nil)
+          e = q.size(-1)
+
+          if k.eql? v
+            if q.eql? k
+              # self-attention
+              return linear(q, w, b).chunk(3, dim: -1)
+            else
+              # encoder-decoder attention
+              w_q, w_kv = w.split_with_sizes([e, e * 2])
+              if b.nil?
+                b_q = b_kv = nil
+              else
+                b_q, b_kv = b.split_with_sizes([e, e * 2])
+              end
+
+              return [linear(q, w_q, b_q), *linear(k, w_kv, b_kv).chunk(2, dim: -1)]
+            end
+          else
+            w_q, w_k, w_v = w.chunk(3)
+            if b.nil?
+              b_q = b_k = b_v = nil
+            else
+              b_q, b_k, b_v = b.chunk(3)
+            end
+
+            return [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+          end
+        end
+
+        def in_projection(
+          q, k, v,
+          w_q, w_k, w_v,
+          b_q: nil, b_k: nil, b_v: nil
+        )
+
+          e_q, e_k, e_v = q.size(-1), k.size(-1), v.size(-1)
+
+          raise ArgumentError, "Expecting query weights shape of #{[e_q, e_q]}, but got #{w_q.shape}" unless w_q.shape == [e_q, e_q]
+          raise ArgumentError, "Expecting key weights shape of #{[e_k, e_k]}, but got #{w_k.shape}" unless w_k.shape == [e_k, e_k]
+          raise ArgumentError, "Expecting value weights shape of #{[e_v, e_v]}, but got #{w_v.shape}" unless w_v.shape == [e_v, e_v]
+
+          raise ArgumentError, "Expecting query bias shape of #{[e_q]}, but got #{b_q.shape}" if b_q && b_q.shape != [e_q]
+          raise ArgumentError, "Expecting key bias shape of #{[e_k]}, but got #{b_k.shape}" if b_k && b_k.shape != [e_k]
+          raise ArgumentError, "Expecting value bias shape of #{[e_v]}, but got #{b_v.shape}" if b_v && b_v.shape != [e_v]
+
+          [linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)]
+        end
+
+        def scaled_dot_product_attention(
+          q, k, v,
+          attn_mask: nil, dropout_p: 0.0
+        )
+
+          b, nt, e = q.shape
+
+          q = q / Math.sqrt(e)
+
+          attn = Torch.bmm(q, k.transpose(-2, -1))
+          attn += attn_mask if attn_mask
+          attn = softmax(attn, dim: -1)
+          attn = dropout(attn, p: dropout_p) if dropout_p > 0
+
+          output = Torch.bmm(attn, v)
+
+          [output, attn]
+        end
+
+        def multi_head_attention_forward(
+          query, key, value,
+          embed_dim_to_check, num_heads,
+          in_proj_weight, in_proj_bias,
+          bias_k, bias_v,
+          add_zero_attn,
+          dropout_p,
+          out_proj_weight, out_proj_bias,
+          training: true,
+          key_padding_mask: nil,
+          need_weights: true,
+          attn_mask: nil,
+          use_separate_proj_weight: false,
+          q_proj_weight: nil, k_proj_weight: nil, v_proj_weight: nil,
+          static_k: nil, static_v: nil
+        )
+
+          tgt_len, bsz, embed_dim = query.shape
+          src_len = key.shape.first
+
+          raise ArgumentError, "Was expecting embedding dimension of #{embed_dim_to_check}, but got #{embed_dim}" unless embed_dim == embed_dim_to_check
+
+          head_dim = if embed_dim.is_a?(Torch::Tensor)
+            embed_dim.div(num_heads, rounding_mode: 'trunc')
+          else
+            head_dim = embed_dim.div num_heads
+          end
+
+          if use_separate_proj_weight
+            raise ArgumentError, "Key's sequence and batch dims #{key.shape[0...2]} do not match value's #{value.shape[0...2]}" unless key.shape[0...2] == value.shape[0...2]
+          else
+            raise ArgumentError, "Key shape #{key.shape} does not match value shape #{value.shape}" unless key.shape == value.shape
+          end
+
+          # compute in-projection
+          q, k, v =
+            if use_separate_proj_weight
+              raise ArgumentError, "use_separate_proj_weight is true but q_proj_weight is nil" unless q_proj_weight
+              raise ArgumentError, "use_separate_proj_weight is true but k_proj_weight is nil" unless k_proj_weight
+              raise ArgumentError, "use_separate_proj_weight is true but v_proj_weight is nil" unless v_proj_weight
+
+              if in_proj_bias
+                b_q, b_k, b_v = in_proj_bias.chunk(3)
+              else
+                b_q = b_k = b_v = nil
+              end
+
+              in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q: b_q, b_k: b_k, b_v: b_v)
+            else
+              in_projection_packed(query, key, value, in_proj_weight, b: in_proj_bias)
+            end
+
+          # prep attention mask
+          if attn_mask
+            if attn_mask.dtype == :uint8
+              puts "[WARN] Byte tensor for attn_mask in Multihead Attention is deprecated. Use bool tensor instead."
+              attn_mask = attn_mask.bool
+            else
+              raise ArgumentError, "Only float, byte, and bool types are supported for attn_mask, not #{attn_mask.dtype}" unless attn_mask.floating_point? || attn_mask.dtype == :bool
+            end
+
+            if attn_mask.dim == 2
+              correct_2d_size = [tgt_len, src_len]
+              raise ArgumentError, "The shape of the 2D attn_mask is #{attn_mask.shape}, but should be #{correct_2d_size}." unless attn_mask.shape == correct_2d_size
+
+              attn_mask = attn_mask.unsqueeze(0)
+            elsif attn_mask.dim == 3
+              correct_3d_size = [bsz * num_heads, tgt_len, src_len]
+              raise ArgumentError, "The shape of the 3D attn_mask is #{attn_mask.shape}, but should be #{correct_3d_size}." unless attn_mask.shape == correct_3d_size
+            else
+              raise ArgumentError, "attn_mask's dimension #{attn_mask.dim} is not supported"
+            end
+          end
+
+          # prep key padding mask
+          if key_padding_mask && key_padding_mask.dtype == :uint8
+            puts "[WARN] Byte tensor for key_padding_mask in Multihead Attention is deprecated. Use bool tensor instead."
+            key_padding_mask = key_padding_mask.bool
+          end
+
+          # add bias along batch dimension (currently second)
+          if bias_k && bias_v
+            raise ArgumentError, "bias cannot be added to static key." if static_k
+            raise ArgumentError, "bias cannot be added to static value." if static_v
+
+            k = Torch.cat([k, bias_k.repeat(1, bsz, 1)])
+            v = Torch.cat([v, bias_v.repeat(1, bsz, 1)])
+
+            attn_mask = pad(attn_mask, [0, 1]) if attn_mask
+            key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask
+          else
+            raise ArgumentError unless bias_k.nil?
+            raise ArgumentError unless bias_v.nil?
+          end
+
+          # reshape q, k, v for multihead attention and make em batch first
+          q = q.contiguous.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+
+          if static_k.nil?
+            k = k.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+          else
+            raise ArgumentError, "Expecting static_k.size(0) of #{bsz * num_heads}, but got #{static_k.size(0)}" unless static_k.size(0) == bsz * num_heads
+            raise ArgumentError, "Expecting static_k.size(2) of #{head_dim}, but got #{static_k.size(2)}" unless static_k.size(2) == head_dim
+
+            k = static_k
+          end
+
+          if static_v.nil?
+            v = v.contiguous.view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+          else
+            raise ArgumentError, "Expecting static_v.size(0) of #{bsz * num_heads}, but got #{static_v.size(0)}" unless static_v.size(0) == bsz * num_heads
+            raise ArgumentError, "Expecting static_v.size(2) of #{head_dim}, but got #{static_v.size(2)}" unless static_v.size(2) == head_dim
+
+            v = static_v
+          end
+
+          # add zero attention along batch dimension (now first)
+          if add_zero_attn
+            zero_attn_shape = [bsz * num_heads, 1, head_dim]
+            k = Torch.cat([k, Torch.zeros(zero_attn_shape, dtype: k.dtype, device: k.device)], dim: 1)
+            v = Torch.cat([v, Torch.zeros(zero_attn_shape, dtype: v.dtype, device: v.device)], dim: 1)
+
+            attn_mask = pad(attn_mask, [0, 1]) if attn_mask
+            key_padding_mask = pad(key_padding_mask, [0, 1]) if key_padding_mask
+          end
+
+          # update source sequence length after adjustments
+          src_len = k.size(1)
+
+          # merge key padding and attention masks
+          if key_padding_mask
+            raise ArgumentError, "Expecting key_padding_mask shape of #{[bsz, src_len]}, but got #{key_padding_mask.shape}" unless key_padding_mask.shape == [bsz, src_len]
+
+            key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
+
+            attn_mask = if attn_mask.nil?
+              key_padding_mask
+            elsif attn_mask.dtype == :bool
+              attn_mask.logical_or(key_padding_mask)
+            else
+              attn_mask.masked_fill(key_padding_mask, -Float::INFINITY)
+            end
+          end
+
+          # convert mask to float
+          if attn_mask && attn_mask.dtype == :bool
+            new_attn_mask = Torch.zeros_like(attn_mask, dtype: :float32)
+            attn_mask = new_attn_mask.masked_fill(attn_mask, -Float::INFINITY)
+          end
+
+          dropout_p = 0.0 unless training
+
+          # (deep breath) calculate attention and out projection
+          attn_output, attn_output_weights = scaled_dot_product_attention(q, k, v, attn_mask: attn_mask, dropout_p: dropout_p)
+          attn_output = attn_output.transpose(0, 1).contiguous.view(tgt_len, bsz, embed_dim)
+          attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+
+          if need_weights
+            # average attention weights over heads
+            attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+            [attn_output, attn_output_weights.sum(dim: 1) / num_heads]
+          else
+            [attn_output, nil]
+          end
+        end
+      end
+    end
+  end
+end
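The core of this new file is scaled_dot_product_attention, which computes softmax(q·kᵀ/√e)·v over batched 3-D inputs and returns both the output and the attention weights. A minimal sketch of calling it directly (assuming [batch, sequence, embed]-shaped inputs, as the b, nt, e destructuring above implies):

    require "torch"

    q = Torch.randn(2, 4, 8) # [batch, target length, embed dim]
    k = Torch.randn(2, 6, 8) # [batch, source length, embed dim]
    v = Torch.randn(2, 6, 8)

    output, attn = Torch::NN::Functional.scaled_dot_product_attention(q, k, v)
    output.shape # => [2, 4, 8]
    attn.shape   # => [2, 4, 6]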
data/lib/torch/nn/module.rb CHANGED
@@ -3,6 +3,8 @@ module Torch
     class Module
       include Utils
 
+      attr_reader :training
+
       def initialize
         @training = true
         @parameters = {}
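attr_reader :training makes the train/eval state queryable, matching PyTorch's module.training flag:

    require "torch"

    model = Torch::NN::Linear.new(10, 2)
    model.training # => true
    model.eval
    model.training # => false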
data/lib/torch/nn/module_list.rb ADDED
@@ -0,0 +1,49 @@
+module Torch
+  module NN
+    class ModuleList < Module
+      include Enumerable
+
+      def initialize(mods = nil)
+        super()
+
+        self.concat(mods) if mods
+      end
+
+      def length
+        @modules.length
+      end
+      alias_method :count, :length
+      alias_method :size, :length
+
+      def concat(mods)
+        raise ArgumentError, "Modules should respond to #each" unless mods.respond_to?(:each)
+
+        mods.each { |m| append m }
+
+        self
+      end
+
+      def each(&block)
+        if block_given?
+          @modules.values.each(&block)
+        else
+          to_enum(:each)
+        end
+      end
+
+      def append(mod)
+        raise ArgumentError, "Provided element is not a module" unless mod.is_a?(Module)
+        add_module(length.to_s, mod)
+        self
+      end
+
+      def [](idx)
+        if idx.is_a?(Range)
+          self.class.new(@modules.values[idx])
+        else
+          @modules[idx.to_s]
+        end
+      end
+    end
+  end
+end
+ end