RubyGems - cumo - Versions diffs - 0.4.3 → 0.5.1 - Mend

cumo 0.4.3 → 0.5.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (137)

checksums.yaml +4 -4
data/.gitignore +3 -0
data/.rubocop.yml +15 -0
data/.rubocop_todo.yml +1252 -0
data/3rd_party/mkmf-cu/Gemfile +2 -0
data/3rd_party/mkmf-cu/Rakefile +2 -1
data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +2 -0
data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +43 -7
data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +51 -45
data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +2 -0
data/3rd_party/mkmf-cu/mkmf-cu.gemspec +3 -1
data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +5 -3
data/CHANGELOG.md +85 -0
data/Dockerfile +34 -0
data/Gemfile +6 -1
data/README.md +2 -10
data/Rakefile +8 -11
data/bench/broadcast_fp32.rb +28 -26
data/bench/cumo_bench.rb +18 -16
data/bench/numo_bench.rb +18 -16
data/bench/reduction_fp32.rb +14 -12
data/bin/console +1 -0
data/cumo.gemspec +6 -9
data/docker-build.sh +4 -0
data/docker-launch.sh +4 -0
data/docs/src-tree.md +1 -1
data/ext/cumo/cuda/cudnn.c +2 -2
data/ext/cumo/cuda/cudnn_impl.cpp +25 -3
data/ext/cumo/cuda/driver.c +8 -0
data/ext/cumo/cumo.c +7 -3
data/ext/cumo/depend.erb +15 -13
data/ext/cumo/extconf.rb +33 -47
data/ext/cumo/include/cumo/cuda/cudnn.h +3 -1
data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +13 -6
data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +3 -3
data/ext/cumo/include/cumo/intern.h +1 -0
data/ext/cumo/include/cumo/narray.h +13 -1
data/ext/cumo/include/cumo/template.h +2 -4
data/ext/cumo/include/cumo/types/complex_macro.h +1 -1
data/ext/cumo/include/cumo/types/complex_macro_kernel.h +15 -4
data/ext/cumo/include/cumo/types/float_macro.h +2 -2
data/ext/cumo/include/cumo/types/real_accum_kernel.h +15 -4
data/ext/cumo/include/cumo/types/xint_macro.h +3 -2
data/ext/cumo/include/cumo/types/xint_macro_kernel.h +11 -3
data/ext/cumo/include/cumo.h +2 -2
data/ext/cumo/narray/array.c +8 -6
data/ext/cumo/narray/data.c +48 -28
data/ext/cumo/narray/gen/cogen.rb +8 -7
data/ext/cumo/narray/gen/cogen_kernel.rb +8 -7
data/ext/cumo/narray/gen/def/bit.rb +3 -1
data/ext/cumo/narray/gen/def/dcomplex.rb +2 -0
data/ext/cumo/narray/gen/def/dfloat.rb +2 -0
data/ext/cumo/narray/gen/def/int16.rb +2 -0
data/ext/cumo/narray/gen/def/int32.rb +2 -0
data/ext/cumo/narray/gen/def/int64.rb +2 -0
data/ext/cumo/narray/gen/def/int8.rb +2 -0
data/ext/cumo/narray/gen/def/robject.rb +2 -0
data/ext/cumo/narray/gen/def/scomplex.rb +2 -0
data/ext/cumo/narray/gen/def/sfloat.rb +2 -0
data/ext/cumo/narray/gen/def/uint16.rb +2 -0
data/ext/cumo/narray/gen/def/uint32.rb +2 -0
data/ext/cumo/narray/gen/def/uint64.rb +2 -0
data/ext/cumo/narray/gen/def/uint8.rb +2 -0
data/ext/cumo/narray/gen/erbln.rb +9 -7
data/ext/cumo/narray/gen/erbpp2.rb +26 -24
data/ext/cumo/narray/gen/narray_def.rb +13 -11
data/ext/cumo/narray/gen/spec.rb +58 -55
data/ext/cumo/narray/gen/tmpl/accum.c +2 -2
data/ext/cumo/narray/gen/tmpl/accum_binary.c +1 -1
data/ext/cumo/narray/gen/tmpl/alloc_func.c +1 -1
data/ext/cumo/narray/gen/tmpl/aref.c +18 -18
data/ext/cumo/narray/gen/tmpl/aset.c +16 -16
data/ext/cumo/narray/gen/tmpl/at.c +34 -0
data/ext/cumo/narray/gen/tmpl/batch_norm.c +5 -2
data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c +6 -3
data/ext/cumo/narray/gen/tmpl/bincount.c +7 -7
data/ext/cumo/narray/gen/tmpl/clip.c +11 -15
data/ext/cumo/narray/gen/tmpl/conv.c +1 -1
data/ext/cumo/narray/gen/tmpl/conv_grad_w.c +3 -1
data/ext/cumo/narray/gen/tmpl/conv_transpose.c +1 -1
data/ext/cumo/narray/gen/tmpl/cum.c +1 -1
data/ext/cumo/narray/gen/tmpl/each.c +4 -2
data/ext/cumo/narray/gen/tmpl/each_with_index.c +5 -2
data/ext/cumo/narray/gen/tmpl/fixed_batch_norm.c +5 -2
data/ext/cumo/narray/gen/tmpl/init_class.c +1 -0
data/ext/cumo/narray/gen/tmpl/logseq.c +6 -5
data/ext/cumo/narray/gen/tmpl/map_with_index.c +5 -6
data/ext/cumo/narray/gen/tmpl/median.c +2 -2
data/ext/cumo/narray/gen/tmpl/minmax.c +1 -1
data/ext/cumo/narray/gen/tmpl/poly.c +4 -4
data/ext/cumo/narray/gen/tmpl/pooling_backward.c +1 -1
data/ext/cumo/narray/gen/tmpl/pooling_forward.c +1 -1
data/ext/cumo/narray/gen/tmpl/qsort.c +1 -5
data/ext/cumo/narray/gen/tmpl/rand.c +8 -6
data/ext/cumo/narray/gen/tmpl/rand_norm.c +18 -16
data/ext/cumo/narray/gen/tmpl/seq.c +5 -4
data/ext/cumo/narray/gen/tmpl/sort.c +3 -3
data/ext/cumo/narray/gen/tmpl/sort_index.c +2 -2
data/ext/cumo/narray/gen/tmpl_bit/aref.c +26 -32
data/ext/cumo/narray/gen/tmpl_bit/aset.c +18 -30
data/ext/cumo/narray/gen/tmpl_bit/binary.c +42 -14
data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +5 -0
data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +5 -0
data/ext/cumo/narray/gen/tmpl_bit/mask.c +27 -7
data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +21 -7
data/ext/cumo/narray/gen/tmpl_bit/unary.c +21 -7
data/ext/cumo/narray/index.c +244 -40
data/ext/cumo/narray/index_kernel.cu +84 -0
data/ext/cumo/narray/narray.c +57 -19
data/ext/cumo/narray/ndloop.c +1 -1
data/ext/cumo/narray/struct.c +1 -1
data/lib/cumo/cuda/compile_error.rb +1 -1
data/lib/cumo/cuda/compiler.rb +23 -22
data/lib/cumo/cuda/cudnn.rb +1 -1
data/lib/cumo/cuda/device.rb +1 -1
data/lib/cumo/cuda/link_state.rb +2 -2
data/lib/cumo/cuda/module.rb +1 -2
data/lib/cumo/cuda/nvrtc_program.rb +3 -2
data/lib/cumo/cuda.rb +2 -0
data/lib/cumo/linalg.rb +2 -0
data/lib/cumo/narray/extra.rb +297 -341
data/lib/cumo/narray.rb +2 -0
data/lib/cumo.rb +3 -1
data/test/bit_test.rb +157 -0
data/test/cuda/compiler_test.rb +69 -0
data/test/cuda/device_test.rb +31 -0
data/test/cuda/memory_pool_test.rb +45 -0
data/test/cuda/nvrtc_test.rb +51 -0
data/test/cuda/runtime_test.rb +28 -0
data/test/cudnn_test.rb +498 -0
data/test/cumo_test.rb +27 -0
data/test/narray_test.rb +745 -0
data/test/ractor_test.rb +52 -0
data/test/test_helper.rb +31 -0
metadata +34 -54
data/.travis.yml +0 -5
data/numo-narray-version +0 -1

data/bench/broadcast_fp32.rb CHANGED Viewed

@@ -1,11 +1,13 @@
+# frozen_string_literal: true
 require 'benchmark'
 require 'cumo/narray'
 num_iteration = 1000
 Benchmark.bm 20 do |r|
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x.inplace + y" do
     num_iteration.times do
       x.inplace + y
@@ -13,8 +15,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x + y" do
     num_iteration.times do
       (x + y).free
@@ -22,8 +24,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x.inplace + 1.0" do
     num_iteration.times do
       x.inplace + 1.0
@@ -31,8 +33,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  z = Cumo::SFloat.ones([1000,1])
+  x = Cumo::SFloat.ones([1000, 784])
+  z = Cumo::SFloat.ones([1000, 1])
   r.report "x.inplace + z" do
     num_iteration.times do
       x.inplace + z
@@ -40,8 +42,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x.inplace - y" do
     num_iteration.times do
       x.inplace - y
@@ -49,8 +51,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x.inplace - 1.0" do
     num_iteration.times do
       x.inplace - 1.0
@@ -58,8 +60,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  z = Cumo::SFloat.ones([1000,1])
+  x = Cumo::SFloat.ones([1000, 784])
+  z = Cumo::SFloat.ones([1000, 1])
   r.report "x.inplace - z" do
     num_iteration.times do
       x.inplace - z
@@ -67,8 +69,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x.inplace * y" do
     num_iteration.times do
       x.inplace * y
@@ -76,8 +78,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x.inplace * 1.0" do
     num_iteration.times do
       x.inplace * 1.0
@@ -85,8 +87,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  z = Cumo::SFloat.ones([1000,1])
+  x = Cumo::SFloat.ones([1000, 784])
+  z = Cumo::SFloat.ones([1000, 1])
   r.report "x.inplace * z" do
     num_iteration.times do
       x.inplace * z
@@ -94,8 +96,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x.inplace / y" do
     num_iteration.times do
       x.inplace / y
@@ -103,8 +105,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  y = Cumo::SFloat.ones([1000,784])
+  x = Cumo::SFloat.ones([1000, 784])
+  y = Cumo::SFloat.ones([1000, 784])
   r.report "x.inplace / 1.0" do
     num_iteration.times do
       x.inplace / 1.0
@@ -112,8 +114,8 @@ Benchmark.bm 20 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([1000,784])
-  z = Cumo::SFloat.ones([1000,1])
+  x = Cumo::SFloat.ones([1000, 784])
+  z = Cumo::SFloat.ones([1000, 1])
   r.report "x.inplace / z" do
     num_iteration.times do
       x.inplace / z

data/bench/cumo_bench.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'cumo/narray'
 require 'benchmark'
@@ -5,7 +7,7 @@ NUM = (ARGV.first || 100).to_i
 # warm up
 a = Cumo::Float32.new(10).seq(1)
-b = Cumo::Float32.new(10).seq(10,10)
+b = Cumo::Float32.new(10).seq(10, 10)
 c = a + b
 c.free
@@ -14,7 +16,7 @@ def elementwise(num = nil)
   puts "elementwise(#{num})"
   Benchmark.bm do |r|
     a = Cumo::Float32.new(10000).seq(1)
-    b = Cumo::Float32.new(10000).seq(10,10)
+    b = Cumo::Float32.new(10000).seq(10, 10)
     (a + b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**4') do
@@ -25,7 +27,7 @@ def elementwise(num = nil)
     end
     a = Cumo::Float32.new(100000).seq(1)
-    b = Cumo::Float32.new(100000).seq(10,10)
+    b = Cumo::Float32.new(100000).seq(10, 10)
     (a + b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**5') do
@@ -36,7 +38,7 @@ def elementwise(num = nil)
     end
     a = Cumo::Float32.new(1000000).seq(1)
-    b = Cumo::Float32.new(1000000).seq(10,10)
+    b = Cumo::Float32.new(1000000).seq(10, 10)
     (a + b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**6') do
@@ -47,7 +49,7 @@ def elementwise(num = nil)
     end
     a = Cumo::Float32.new(10000000).seq(1)
-    b = Cumo::Float32.new(10000000).seq(10,10)
+    b = Cumo::Float32.new(10000000).seq(10, 10)
     (a + b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**7') do
@@ -58,7 +60,7 @@ def elementwise(num = nil)
     end
     a = Cumo::Float32.new(100000000).seq(1)
-    b = Cumo::Float32.new(100000000).seq(10,10)
+    b = Cumo::Float32.new(100000000).seq(10, 10)
     (a + b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**8') do
@@ -130,8 +132,8 @@ def dot(num = nil)
   num ||= 1
   puts "dot(#{num})"
   Benchmark.bm do |r|
-    a = Cumo::Float32.new(100,100).seq(1)
-    b = Cumo::Float32.new(100,100).seq(10,10)
+    a = Cumo::Float32.new(100, 100).seq(1)
+    b = Cumo::Float32.new(100, 100).seq(10, 10)
     a.dot(b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**4') do
@@ -141,8 +143,8 @@ def dot(num = nil)
       end
     end
-    a = Cumo::Float32.new(100,1000).seq(1)
-    b = Cumo::Float32.new(1000,100).seq(10,10)
+    a = Cumo::Float32.new(100, 1000).seq(1)
+    b = Cumo::Float32.new(1000, 100).seq(10, 10)
     a.dot(b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**5') do
@@ -152,8 +154,8 @@ def dot(num = nil)
       end
     end
-    a = Cumo::Float32.new(100,10000).seq(1)
-    b = Cumo::Float32.new(10000,100).seq(10,10)
+    a = Cumo::Float32.new(100, 10000).seq(1)
+    b = Cumo::Float32.new(10000, 100).seq(10, 10)
     a.dot(b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**6') do
@@ -163,8 +165,8 @@ def dot(num = nil)
       end
     end
-    a = Cumo::Float32.new(100,100000).seq(1)
-    b = Cumo::Float32.new(100000,100).seq(10,10)
+    a = Cumo::Float32.new(100, 100000).seq(1)
+    b = Cumo::Float32.new(100000, 100).seq(10, 10)
     a.dot(b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**7') do
@@ -174,8 +176,8 @@ def dot(num = nil)
       end
     end
-    a = Cumo::Float32.new(100,1000000).seq(1)
-    b = Cumo::Float32.new(1000000,100).seq(10,10)
+    a = Cumo::Float32.new(100, 1000000).seq(1)
+    b = Cumo::Float32.new(1000000, 100).seq(10, 10)
     a.dot(b).free # warm up
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
     r.report('10**8') do

data/bench/numo_bench.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'numo/narray'
 require 'benchmark'
@@ -5,7 +7,7 @@ NUM = (ARGV.first || 100).to_i
 # warm up
 a = Numo::Float32.new(10).seq(1)
-b = Numo::Float32.new(10).seq(10,10)
+b = Numo::Float32.new(10).seq(10, 10)
 c = a + b
 def elementwise(num = nil)
@@ -13,31 +15,31 @@ def elementwise(num = nil)
   puts "elementwise(#{num})"
   Benchmark.bm do |r|
     a = Numo::Float32.new(10000).seq(1)
-    b = Numo::Float32.new(10000).seq(10,10)
+    b = Numo::Float32.new(10000).seq(10, 10)
     r.report('10**4') do
       NUM.times { (a + b) }
     end
     a = Numo::Float32.new(100000).seq(1)
-    b = Numo::Float32.new(100000).seq(10,10)
+    b = Numo::Float32.new(100000).seq(10, 10)
     r.report('10**5') do
       NUM.times { (a + b) }
     end
     a = Numo::Float32.new(1000000).seq(1)
-    b = Numo::Float32.new(1000000).seq(10,10)
+    b = Numo::Float32.new(1000000).seq(10, 10)
     r.report('10**6') do
       NUM.times { (a + b) }
     end
     a = Numo::Float32.new(10000000).seq(1)
-    b = Numo::Float32.new(10000000).seq(10,10)
+    b = Numo::Float32.new(10000000).seq(10, 10)
     r.report('10**7') do
       NUM.times { (a + b) }
     end
     a = Numo::Float32.new(100000000).seq(1)
-    b = Numo::Float32.new(100000000).seq(10,10)
+    b = Numo::Float32.new(100000000).seq(10, 10)
     r.report('10**8') do
       NUM.times { (a + b) }
     end
@@ -79,32 +81,32 @@ def dot(num = nil)
   num ||= 1
   puts "dot(#{num})"
   Benchmark.bm do |r|
-    a = Numo::Float32.new(100,100).seq(1)
-    b = Numo::Float32.new(100,100).seq(10,10)
+    a = Numo::Float32.new(100, 100).seq(1)
+    b = Numo::Float32.new(100, 100).seq(10, 10)
     r.report('10**4') do
       num.times { a.dot(b) }
     end
-    a = Numo::Float32.new(100,1000).seq(1)
-    b = Numo::Float32.new(1000,100).seq(10,10)
+    a = Numo::Float32.new(100, 1000).seq(1)
+    b = Numo::Float32.new(1000, 100).seq(10, 10)
     r.report('10**5') do
       num.times { a.dot(b) }
     end
-    a = Numo::Float32.new(100,10000).seq(1)
-    b = Numo::Float32.new(10000,100).seq(10,10)
+    a = Numo::Float32.new(100, 10000).seq(1)
+    b = Numo::Float32.new(10000, 100).seq(10, 10)
     r.report('10**6') do
       num.times { a.dot(b) }
     end
-    a = Numo::Float32.new(100,100000).seq(1)
-    b = Numo::Float32.new(100000,100).seq(10,10)
+    a = Numo::Float32.new(100, 100000).seq(1)
+    b = Numo::Float32.new(100000, 100).seq(10, 10)
     r.report('10**7') do
       num.times { a.dot(b) }
     end
-    a = Numo::Float32.new(100,1000000).seq(1)
-    b = Numo::Float32.new(1000000,100).seq(10,10)
+    a = Numo::Float32.new(100, 1000000).seq(1)
+    b = Numo::Float32.new(1000000, 100).seq(10, 10)
     r.report('10**8') do
       num.times { a.dot(b) }
     end

data/bench/reduction_fp32.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'benchmark'
 require 'cumo/narray'
@@ -5,7 +7,7 @@ num_iteration = 100
 Cumo::CUDA::Runtime.cudaDeviceSynchronize
 Benchmark.bm 30 do |r|
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.sum" do
     num_iteration.times do
       x.sum
@@ -13,7 +15,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.sum(axis: 0)" do
     num_iteration.times do
       x.sum(axis: 0)
@@ -21,7 +23,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.sum(axis: 1)" do
     num_iteration.times do
       x.sum(axis: 1)
@@ -29,7 +31,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.sum(keepdims: true)" do
     num_iteration.times do
       x.sum(keepdims: true)
@@ -37,7 +39,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.sum(axis: 0, keepdims: true)" do
     num_iteration.times do
       x.sum(axis: 0, keepdims: true)
@@ -45,7 +47,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.sum(axis: 1, keepdims: true)" do
     num_iteration.times do
       x.sum(axis: 1, keepdims: true)
@@ -53,7 +55,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.max" do
     num_iteration.times do
       x.max
@@ -61,7 +63,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.max(axis: 0)" do
     num_iteration.times do
       x.max(axis: 0)
@@ -69,7 +71,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.max(axis: 1)" do
     num_iteration.times do
       x.max(axis: 1)
@@ -77,7 +79,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.max(keepdims: true)" do
     num_iteration.times do
       x.max(keepdims: true)
@@ -85,7 +87,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.max(axis: 0, keepdims: true)" do
     num_iteration.times do
       x.max(axis: 0, keepdims: true)
@@ -93,7 +95,7 @@ Benchmark.bm 30 do |r|
     Cumo::CUDA::Runtime.cudaDeviceSynchronize
   end
-  x = Cumo::SFloat.ones([500,500])
+  x = Cumo::SFloat.ones([500, 500])
   r.report "x.max(axis: 1, keepdims: true)" do
     num_iteration.times do
       x.max(axis: 1, keepdims: true)

data/bin/console CHANGED Viewed

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 require "bundler/setup"
 require "cumo"

data/cumo.gemspec CHANGED Viewed

@@ -1,9 +1,9 @@
-# coding: utf-8
+# frozen_string_literal: true
 lib = File.expand_path("../lib", __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([^"]+)"/)[1]
-numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip
 Gem::Specification.new do |spec|
   spec.name          = "cumo"
@@ -16,17 +16,14 @@ Gem::Specification.new do |spec|
   spec.homepage      = "https://github.com/sonots/cumo"
   spec.license       = "BSD-3-Clause"
-  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
-    f.match(%r{^(test|spec|features)/})
+  spec.required_ruby_version = ">= 3.0.0"
+  spec.files = `git ls-files -z`.split("\x0").reject do |f|
+    f.match(%r{^(test|spec|features|docker)/})
   end
   spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
   spec.bindir        = "exe"
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
   spec.extensions    = ["ext/cumo/extconf.rb"]
-  spec.add_runtime_dependency "numo-narray", numo_narray_version
-  spec.add_development_dependency "bundler", "~> 1.15"
-  spec.add_development_dependency "rake", "~> 10.0"
 end

data/docker-build.sh ADDED Viewed

@@ -0,0 +1,4 @@
+#!/bin/bash
+script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd)
+docker build -t cumo-dev ${script_dir}

data/docker-launch.sh ADDED Viewed

@@ -0,0 +1,4 @@
+#!/bin/bash
+project_dir="$(cd $(dirname ${BASH_SOURCE:-$0}); pwd)/"
+docker run --gpus all -v $project_dir:/workspace -it cumo-dev bash

data/docs/src-tree.md CHANGED Viewed

@@ -6,7 +6,7 @@
     * Technically, it is not possible to use CRuby API such as `VALUE` in .cu files.
         * CRuby API is not callable from CUDA kernel because they do not have `__device__` modifier.
         * nvcc does not support `#include RUBY_EXTCONF_H`, so can not include `ruby.h`.
-    * (RULE) It is allowed to use C++14 codes in .cu files.
+    * (RULE) It is allowed to use C++17 codes in .cu files.
 * Rest of `*.{h,c}` files are for host (CPU).
     * Call C wrapper functions defined in .cu files.
     * It can use CRuby API.

data/ext/cumo/cuda/cudnn.c CHANGED Viewed

@@ -50,7 +50,7 @@ cumo_cuda_cudnn_handle()
   @return [Boolean] Returns true if cuDNN is available
  */
 static VALUE
-rb_cudnn_available_p()
+rb_cudnn_available_p(VALUE self)
 {
 #if CUDNN_FOUND
     return Qtrue;
@@ -72,7 +72,7 @@ Init_cumo_cuda_cudnn(void)
     rb_define_const(mCUDA, "Cudnn", mCUDNN); // alias
     eCUDNNError = rb_define_class_under(mCUDA, "CUDNNError", rb_eStandardError);
-    rb_define_singleton_method(mCUDNN, "available?", RUBY_METHOD_FUNC(rb_cudnn_available_p), 0);
+    rb_define_singleton_method(mCUDNN, "available?", rb_cudnn_available_p, 0);
 #ifdef CUDNN_FOUND
     rb_define_const(mCUDNN, "CUDNN_POOLING_MAX", INT2NUM(CUDNN_POOLING_MAX));
     rb_define_const(mCUDNN, "CUDNN_POOLING_MAX_DETERMINISTIC", INT2NUM(CUDNN_POOLING_MAX_DETERMINISTIC));

data/ext/cumo/cuda/cudnn_impl.cpp CHANGED Viewed

@@ -74,6 +74,25 @@ cumo_cuda_cudnn_CreateTensorDescriptor(
         status = cudnnSetTensor4dDescriptor(
                 *desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
     }
+    else if (ndim < 4) {
+        // cuDNN 9 fix: Force 4D (N, C, H, W)
+        int pad_shape[4] = {1, 1, 1, 1};
+        if (ndim == 1) {
+            // 1D: arrays are treated as "Channel" (1, C, 1, 1)
+            pad_shape[1] = (int)(shape[0]);
+        } else {
+            // 2D: [N, C] -> [N, C, 1, 1]
+            // 3D: [N, C, H] -> [N, C, H, 1]
+            for (int idim = 0; idim < ndim; ++idim) {
+                pad_shape[idim] = (int)(shape[idim]);
+            }
+        }
+        status = cudnnSetTensor4dDescriptor(
+                *desc, CUDNN_TENSOR_NCHW, cudnn_dtype,
+                pad_shape[0], pad_shape[1], pad_shape[2], pad_shape[3]);
+    }
     else {
         int int_shape[CUMO_NA_MAX_DIMENSION];
         for (int idim = 0; idim < ndim; ++idim) {
@@ -514,8 +533,11 @@ cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
 // TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
 cudnnBatchNormMode_t
 cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
-    if (ndim == 1 && axis[0] == 0) {  // (1, channels, (depth, )height, width)
-        return CUDNN_BATCHNORM_PER_ACTIVATION;
+    if (ndim == 1) {
+        return CUDNN_BATCHNORM_SPATIAL;
+    }
+    if (ndim == 2) {
+        return CUDNN_BATCHNORM_SPATIAL;
     }
     if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
         (ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) {  // (1, channels, (1, )1, 1)
@@ -533,7 +555,7 @@ cumo_cuda_cudnn_CreateBNTensorDescriptor(
 {
     cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
     status = cudnnCreateTensorDescriptor(desc);
-    if (status = CUDNN_STATUS_SUCCESS) return status;
+    if (status == CUDNN_STATUS_SUCCESS) return status;
     status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
     return status;

data/ext/cumo/cuda/driver.c CHANGED Viewed

@@ -33,7 +33,11 @@ rb_cuCtxCreate(VALUE self, VALUE flags, VALUE dev)
     CUcontext _pctx;
     CUresult status;
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
+    status = cuCtxCreate(&_pctx, NULL, _flags, _dev);
+#else
     status = cuCtxCreate(&_pctx, _flags, _dev);
+#endif
     check_status(status);
     return SIZET2NUM((size_t)_pctx);
@@ -418,5 +422,9 @@ Init_cumo_cuda_driver()
     cuInit(0);
     cuDeviceGet(&cuDevice, 0);
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
+    cuCtxCreate(&context, NULL, 0, cuDevice);
+#else
     cuCtxCreate(&context, 0, cuDevice);
+#endif
 }

data/ext/cumo/cumo.c CHANGED Viewed

@@ -114,13 +114,17 @@ Init_cumo()
     const char* env;
     VALUE mCumo;
+#ifdef HAVE_RB_EXT_RACTOR_SAFE
+    rb_ext_ractor_safe(true);
+#endif
     mCumo = rb_define_module("Cumo");
     rb_define_const(mCumo, "VERSION", rb_str_new2(CUMO_VERSION));
-    rb_define_singleton_method(mCumo, "enable_compatible_mode", RUBY_METHOD_FUNC(rb_enable_compatible_mode), 0);
-    rb_define_singleton_method(mCumo, "disable_compatible_mode", RUBY_METHOD_FUNC(rb_disable_compatible_mode), 0);
-    rb_define_singleton_method(mCumo, "compatible_mode_enabled?", RUBY_METHOD_FUNC(rb_compatible_mode_enabled_p), 0);
+    rb_define_singleton_method(mCumo, "enable_compatible_mode", rb_enable_compatible_mode, 0);
+    rb_define_singleton_method(mCumo, "disable_compatible_mode", rb_disable_compatible_mode, 0);
+    rb_define_singleton_method(mCumo, "compatible_mode_enabled?", rb_compatible_mode_enabled_p, 0);
     // default is false
     env = getenv("CUMO_COMPATIBLE_MODE");