cumo 0.4.3 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +15 -0
  4. data/.rubocop_todo.yml +1252 -0
  5. data/3rd_party/mkmf-cu/Gemfile +2 -0
  6. data/3rd_party/mkmf-cu/Rakefile +2 -1
  7. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +2 -0
  8. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +43 -7
  9. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +51 -45
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +2 -0
  11. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +3 -1
  12. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +5 -3
  13. data/CHANGELOG.md +85 -0
  14. data/Dockerfile +34 -0
  15. data/Gemfile +6 -1
  16. data/README.md +2 -10
  17. data/Rakefile +8 -11
  18. data/bench/broadcast_fp32.rb +28 -26
  19. data/bench/cumo_bench.rb +18 -16
  20. data/bench/numo_bench.rb +18 -16
  21. data/bench/reduction_fp32.rb +14 -12
  22. data/bin/console +1 -0
  23. data/cumo.gemspec +6 -9
  24. data/docker-build.sh +4 -0
  25. data/docker-launch.sh +4 -0
  26. data/docs/src-tree.md +1 -1
  27. data/ext/cumo/cuda/cudnn.c +2 -2
  28. data/ext/cumo/cuda/cudnn_impl.cpp +25 -3
  29. data/ext/cumo/cuda/driver.c +8 -0
  30. data/ext/cumo/cumo.c +7 -3
  31. data/ext/cumo/depend.erb +15 -13
  32. data/ext/cumo/extconf.rb +33 -47
  33. data/ext/cumo/include/cumo/cuda/cudnn.h +3 -1
  34. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +13 -6
  35. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +3 -3
  36. data/ext/cumo/include/cumo/intern.h +1 -0
  37. data/ext/cumo/include/cumo/narray.h +13 -1
  38. data/ext/cumo/include/cumo/template.h +2 -4
  39. data/ext/cumo/include/cumo/types/complex_macro.h +1 -1
  40. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +15 -4
  41. data/ext/cumo/include/cumo/types/float_macro.h +2 -2
  42. data/ext/cumo/include/cumo/types/real_accum_kernel.h +15 -4
  43. data/ext/cumo/include/cumo/types/xint_macro.h +3 -2
  44. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +11 -3
  45. data/ext/cumo/include/cumo.h +2 -2
  46. data/ext/cumo/narray/array.c +8 -6
  47. data/ext/cumo/narray/data.c +48 -28
  48. data/ext/cumo/narray/gen/cogen.rb +8 -7
  49. data/ext/cumo/narray/gen/cogen_kernel.rb +8 -7
  50. data/ext/cumo/narray/gen/def/bit.rb +3 -1
  51. data/ext/cumo/narray/gen/def/dcomplex.rb +2 -0
  52. data/ext/cumo/narray/gen/def/dfloat.rb +2 -0
  53. data/ext/cumo/narray/gen/def/int16.rb +2 -0
  54. data/ext/cumo/narray/gen/def/int32.rb +2 -0
  55. data/ext/cumo/narray/gen/def/int64.rb +2 -0
  56. data/ext/cumo/narray/gen/def/int8.rb +2 -0
  57. data/ext/cumo/narray/gen/def/robject.rb +2 -0
  58. data/ext/cumo/narray/gen/def/scomplex.rb +2 -0
  59. data/ext/cumo/narray/gen/def/sfloat.rb +2 -0
  60. data/ext/cumo/narray/gen/def/uint16.rb +2 -0
  61. data/ext/cumo/narray/gen/def/uint32.rb +2 -0
  62. data/ext/cumo/narray/gen/def/uint64.rb +2 -0
  63. data/ext/cumo/narray/gen/def/uint8.rb +2 -0
  64. data/ext/cumo/narray/gen/erbln.rb +9 -7
  65. data/ext/cumo/narray/gen/erbpp2.rb +26 -24
  66. data/ext/cumo/narray/gen/narray_def.rb +13 -11
  67. data/ext/cumo/narray/gen/spec.rb +58 -55
  68. data/ext/cumo/narray/gen/tmpl/accum.c +2 -2
  69. data/ext/cumo/narray/gen/tmpl/accum_binary.c +1 -1
  70. data/ext/cumo/narray/gen/tmpl/alloc_func.c +1 -1
  71. data/ext/cumo/narray/gen/tmpl/aref.c +18 -18
  72. data/ext/cumo/narray/gen/tmpl/aset.c +16 -16
  73. data/ext/cumo/narray/gen/tmpl/at.c +34 -0
  74. data/ext/cumo/narray/gen/tmpl/batch_norm.c +5 -2
  75. data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c +6 -3
  76. data/ext/cumo/narray/gen/tmpl/bincount.c +7 -7
  77. data/ext/cumo/narray/gen/tmpl/clip.c +11 -15
  78. data/ext/cumo/narray/gen/tmpl/conv.c +1 -1
  79. data/ext/cumo/narray/gen/tmpl/conv_grad_w.c +3 -1
  80. data/ext/cumo/narray/gen/tmpl/conv_transpose.c +1 -1
  81. data/ext/cumo/narray/gen/tmpl/cum.c +1 -1
  82. data/ext/cumo/narray/gen/tmpl/each.c +4 -2
  83. data/ext/cumo/narray/gen/tmpl/each_with_index.c +5 -2
  84. data/ext/cumo/narray/gen/tmpl/fixed_batch_norm.c +5 -2
  85. data/ext/cumo/narray/gen/tmpl/init_class.c +1 -0
  86. data/ext/cumo/narray/gen/tmpl/logseq.c +6 -5
  87. data/ext/cumo/narray/gen/tmpl/map_with_index.c +5 -6
  88. data/ext/cumo/narray/gen/tmpl/median.c +2 -2
  89. data/ext/cumo/narray/gen/tmpl/minmax.c +1 -1
  90. data/ext/cumo/narray/gen/tmpl/poly.c +4 -4
  91. data/ext/cumo/narray/gen/tmpl/pooling_backward.c +1 -1
  92. data/ext/cumo/narray/gen/tmpl/pooling_forward.c +1 -1
  93. data/ext/cumo/narray/gen/tmpl/qsort.c +1 -5
  94. data/ext/cumo/narray/gen/tmpl/rand.c +8 -6
  95. data/ext/cumo/narray/gen/tmpl/rand_norm.c +18 -16
  96. data/ext/cumo/narray/gen/tmpl/seq.c +5 -4
  97. data/ext/cumo/narray/gen/tmpl/sort.c +3 -3
  98. data/ext/cumo/narray/gen/tmpl/sort_index.c +2 -2
  99. data/ext/cumo/narray/gen/tmpl_bit/aref.c +26 -32
  100. data/ext/cumo/narray/gen/tmpl_bit/aset.c +18 -30
  101. data/ext/cumo/narray/gen/tmpl_bit/binary.c +42 -14
  102. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +5 -0
  103. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +5 -0
  104. data/ext/cumo/narray/gen/tmpl_bit/mask.c +27 -7
  105. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +21 -7
  106. data/ext/cumo/narray/gen/tmpl_bit/unary.c +21 -7
  107. data/ext/cumo/narray/index.c +244 -40
  108. data/ext/cumo/narray/index_kernel.cu +84 -0
  109. data/ext/cumo/narray/narray.c +57 -19
  110. data/ext/cumo/narray/ndloop.c +1 -1
  111. data/ext/cumo/narray/struct.c +1 -1
  112. data/lib/cumo/cuda/compile_error.rb +1 -1
  113. data/lib/cumo/cuda/compiler.rb +23 -22
  114. data/lib/cumo/cuda/cudnn.rb +1 -1
  115. data/lib/cumo/cuda/device.rb +1 -1
  116. data/lib/cumo/cuda/link_state.rb +2 -2
  117. data/lib/cumo/cuda/module.rb +1 -2
  118. data/lib/cumo/cuda/nvrtc_program.rb +3 -2
  119. data/lib/cumo/cuda.rb +2 -0
  120. data/lib/cumo/linalg.rb +2 -0
  121. data/lib/cumo/narray/extra.rb +297 -341
  122. data/lib/cumo/narray.rb +2 -0
  123. data/lib/cumo.rb +3 -1
  124. data/test/bit_test.rb +157 -0
  125. data/test/cuda/compiler_test.rb +69 -0
  126. data/test/cuda/device_test.rb +31 -0
  127. data/test/cuda/memory_pool_test.rb +45 -0
  128. data/test/cuda/nvrtc_test.rb +51 -0
  129. data/test/cuda/runtime_test.rb +28 -0
  130. data/test/cudnn_test.rb +498 -0
  131. data/test/cumo_test.rb +27 -0
  132. data/test/narray_test.rb +745 -0
  133. data/test/ractor_test.rb +52 -0
  134. data/test/test_helper.rb +31 -0
  135. metadata +34 -54
  136. data/.travis.yml +0 -5
  137. data/numo-narray-version +0 -1
data/bench/broadcast_fp32.rb CHANGED
@@ -1,11 +1,13 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'benchmark'
2
4
  require 'cumo/narray'
3
5
 
4
6
  num_iteration = 1000
5
7
 
6
8
  Benchmark.bm 20 do |r|
7
- x = Cumo::SFloat.ones([1000,784])
8
- y = Cumo::SFloat.ones([1000,784])
9
+ x = Cumo::SFloat.ones([1000, 784])
10
+ y = Cumo::SFloat.ones([1000, 784])
9
11
  r.report "x.inplace + y" do
10
12
  num_iteration.times do
11
13
  x.inplace + y
@@ -13,8 +15,8 @@ Benchmark.bm 20 do |r|
13
15
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
14
16
  end
15
17
 
16
- x = Cumo::SFloat.ones([1000,784])
17
- y = Cumo::SFloat.ones([1000,784])
18
+ x = Cumo::SFloat.ones([1000, 784])
19
+ y = Cumo::SFloat.ones([1000, 784])
18
20
  r.report "x + y" do
19
21
  num_iteration.times do
20
22
  (x + y).free
@@ -22,8 +24,8 @@ Benchmark.bm 20 do |r|
22
24
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
23
25
  end
24
26
 
25
- x = Cumo::SFloat.ones([1000,784])
26
- y = Cumo::SFloat.ones([1000,784])
27
+ x = Cumo::SFloat.ones([1000, 784])
28
+ y = Cumo::SFloat.ones([1000, 784])
27
29
  r.report "x.inplace + 1.0" do
28
30
  num_iteration.times do
29
31
  x.inplace + 1.0
@@ -31,8 +33,8 @@ Benchmark.bm 20 do |r|
31
33
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
32
34
  end
33
35
 
34
- x = Cumo::SFloat.ones([1000,784])
35
- z = Cumo::SFloat.ones([1000,1])
36
+ x = Cumo::SFloat.ones([1000, 784])
37
+ z = Cumo::SFloat.ones([1000, 1])
36
38
  r.report "x.inplace + z" do
37
39
  num_iteration.times do
38
40
  x.inplace + z
@@ -40,8 +42,8 @@ Benchmark.bm 20 do |r|
40
42
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
41
43
  end
42
44
 
43
- x = Cumo::SFloat.ones([1000,784])
44
- y = Cumo::SFloat.ones([1000,784])
45
+ x = Cumo::SFloat.ones([1000, 784])
46
+ y = Cumo::SFloat.ones([1000, 784])
45
47
  r.report "x.inplace - y" do
46
48
  num_iteration.times do
47
49
  x.inplace - y
@@ -49,8 +51,8 @@ Benchmark.bm 20 do |r|
49
51
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
50
52
  end
51
53
 
52
- x = Cumo::SFloat.ones([1000,784])
53
- y = Cumo::SFloat.ones([1000,784])
54
+ x = Cumo::SFloat.ones([1000, 784])
55
+ y = Cumo::SFloat.ones([1000, 784])
54
56
  r.report "x.inplace - 1.0" do
55
57
  num_iteration.times do
56
58
  x.inplace - 1.0
@@ -58,8 +60,8 @@ Benchmark.bm 20 do |r|
58
60
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
59
61
  end
60
62
 
61
- x = Cumo::SFloat.ones([1000,784])
62
- z = Cumo::SFloat.ones([1000,1])
63
+ x = Cumo::SFloat.ones([1000, 784])
64
+ z = Cumo::SFloat.ones([1000, 1])
63
65
  r.report "x.inplace - z" do
64
66
  num_iteration.times do
65
67
  x.inplace - z
@@ -67,8 +69,8 @@ Benchmark.bm 20 do |r|
67
69
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
68
70
  end
69
71
 
70
- x = Cumo::SFloat.ones([1000,784])
71
- y = Cumo::SFloat.ones([1000,784])
72
+ x = Cumo::SFloat.ones([1000, 784])
73
+ y = Cumo::SFloat.ones([1000, 784])
72
74
  r.report "x.inplace * y" do
73
75
  num_iteration.times do
74
76
  x.inplace * y
@@ -76,8 +78,8 @@ Benchmark.bm 20 do |r|
76
78
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
77
79
  end
78
80
 
79
- x = Cumo::SFloat.ones([1000,784])
80
- y = Cumo::SFloat.ones([1000,784])
81
+ x = Cumo::SFloat.ones([1000, 784])
82
+ y = Cumo::SFloat.ones([1000, 784])
81
83
  r.report "x.inplace * 1.0" do
82
84
  num_iteration.times do
83
85
  x.inplace * 1.0
@@ -85,8 +87,8 @@ Benchmark.bm 20 do |r|
85
87
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
86
88
  end
87
89
 
88
- x = Cumo::SFloat.ones([1000,784])
89
- z = Cumo::SFloat.ones([1000,1])
90
+ x = Cumo::SFloat.ones([1000, 784])
91
+ z = Cumo::SFloat.ones([1000, 1])
90
92
  r.report "x.inplace * z" do
91
93
  num_iteration.times do
92
94
  x.inplace * z
@@ -94,8 +96,8 @@ Benchmark.bm 20 do |r|
94
96
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
95
97
  end
96
98
 
97
- x = Cumo::SFloat.ones([1000,784])
98
- y = Cumo::SFloat.ones([1000,784])
99
+ x = Cumo::SFloat.ones([1000, 784])
100
+ y = Cumo::SFloat.ones([1000, 784])
99
101
  r.report "x.inplace / y" do
100
102
  num_iteration.times do
101
103
  x.inplace / y
@@ -103,8 +105,8 @@ Benchmark.bm 20 do |r|
103
105
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
104
106
  end
105
107
 
106
- x = Cumo::SFloat.ones([1000,784])
107
- y = Cumo::SFloat.ones([1000,784])
108
+ x = Cumo::SFloat.ones([1000, 784])
109
+ y = Cumo::SFloat.ones([1000, 784])
108
110
  r.report "x.inplace / 1.0" do
109
111
  num_iteration.times do
110
112
  x.inplace / 1.0
@@ -112,8 +114,8 @@ Benchmark.bm 20 do |r|
112
114
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
113
115
  end
114
116
 
115
- x = Cumo::SFloat.ones([1000,784])
116
- z = Cumo::SFloat.ones([1000,1])
117
+ x = Cumo::SFloat.ones([1000, 784])
118
+ z = Cumo::SFloat.ones([1000, 1])
117
119
  r.report "x.inplace / z" do
118
120
  num_iteration.times do
119
121
  x.inplace / z
data/bench/cumo_bench.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'cumo/narray'
2
4
  require 'benchmark'
3
5
 
@@ -5,7 +7,7 @@ NUM = (ARGV.first || 100).to_i
5
7
 
6
8
  # warm up
7
9
  a = Cumo::Float32.new(10).seq(1)
8
- b = Cumo::Float32.new(10).seq(10,10)
10
+ b = Cumo::Float32.new(10).seq(10, 10)
9
11
  c = a + b
10
12
  c.free
11
13
 
@@ -14,7 +16,7 @@ def elementwise(num = nil)
14
16
  puts "elementwise(#{num})"
15
17
  Benchmark.bm do |r|
16
18
  a = Cumo::Float32.new(10000).seq(1)
17
- b = Cumo::Float32.new(10000).seq(10,10)
19
+ b = Cumo::Float32.new(10000).seq(10, 10)
18
20
  (a + b).free # warm up
19
21
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
20
22
  r.report('10**4') do
@@ -25,7 +27,7 @@ def elementwise(num = nil)
25
27
  end
26
28
 
27
29
  a = Cumo::Float32.new(100000).seq(1)
28
- b = Cumo::Float32.new(100000).seq(10,10)
30
+ b = Cumo::Float32.new(100000).seq(10, 10)
29
31
  (a + b).free # warm up
30
32
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
31
33
  r.report('10**5') do
@@ -36,7 +38,7 @@ def elementwise(num = nil)
36
38
  end
37
39
 
38
40
  a = Cumo::Float32.new(1000000).seq(1)
39
- b = Cumo::Float32.new(1000000).seq(10,10)
41
+ b = Cumo::Float32.new(1000000).seq(10, 10)
40
42
  (a + b).free # warm up
41
43
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
42
44
  r.report('10**6') do
@@ -47,7 +49,7 @@ def elementwise(num = nil)
47
49
  end
48
50
 
49
51
  a = Cumo::Float32.new(10000000).seq(1)
50
- b = Cumo::Float32.new(10000000).seq(10,10)
52
+ b = Cumo::Float32.new(10000000).seq(10, 10)
51
53
  (a + b).free # warm up
52
54
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
53
55
  r.report('10**7') do
@@ -58,7 +60,7 @@ def elementwise(num = nil)
58
60
  end
59
61
 
60
62
  a = Cumo::Float32.new(100000000).seq(1)
61
- b = Cumo::Float32.new(100000000).seq(10,10)
63
+ b = Cumo::Float32.new(100000000).seq(10, 10)
62
64
  (a + b).free # warm up
63
65
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
64
66
  r.report('10**8') do
@@ -130,8 +132,8 @@ def dot(num = nil)
130
132
  num ||= 1
131
133
  puts "dot(#{num})"
132
134
  Benchmark.bm do |r|
133
- a = Cumo::Float32.new(100,100).seq(1)
134
- b = Cumo::Float32.new(100,100).seq(10,10)
135
+ a = Cumo::Float32.new(100, 100).seq(1)
136
+ b = Cumo::Float32.new(100, 100).seq(10, 10)
135
137
  a.dot(b).free # warm up
136
138
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
137
139
  r.report('10**4') do
@@ -141,8 +143,8 @@ def dot(num = nil)
141
143
  end
142
144
  end
143
145
 
144
- a = Cumo::Float32.new(100,1000).seq(1)
145
- b = Cumo::Float32.new(1000,100).seq(10,10)
146
+ a = Cumo::Float32.new(100, 1000).seq(1)
147
+ b = Cumo::Float32.new(1000, 100).seq(10, 10)
146
148
  a.dot(b).free # warm up
147
149
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
148
150
  r.report('10**5') do
@@ -152,8 +154,8 @@ def dot(num = nil)
152
154
  end
153
155
  end
154
156
 
155
- a = Cumo::Float32.new(100,10000).seq(1)
156
- b = Cumo::Float32.new(10000,100).seq(10,10)
157
+ a = Cumo::Float32.new(100, 10000).seq(1)
158
+ b = Cumo::Float32.new(10000, 100).seq(10, 10)
157
159
  a.dot(b).free # warm up
158
160
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
159
161
  r.report('10**6') do
@@ -163,8 +165,8 @@ def dot(num = nil)
163
165
  end
164
166
  end
165
167
 
166
- a = Cumo::Float32.new(100,100000).seq(1)
167
- b = Cumo::Float32.new(100000,100).seq(10,10)
168
+ a = Cumo::Float32.new(100, 100000).seq(1)
169
+ b = Cumo::Float32.new(100000, 100).seq(10, 10)
168
170
  a.dot(b).free # warm up
169
171
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
170
172
  r.report('10**7') do
@@ -174,8 +176,8 @@ def dot(num = nil)
174
176
  end
175
177
  end
176
178
 
177
- a = Cumo::Float32.new(100,1000000).seq(1)
178
- b = Cumo::Float32.new(1000000,100).seq(10,10)
179
+ a = Cumo::Float32.new(100, 1000000).seq(1)
180
+ b = Cumo::Float32.new(1000000, 100).seq(10, 10)
179
181
  a.dot(b).free # warm up
180
182
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
181
183
  r.report('10**8') do
data/bench/numo_bench.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'numo/narray'
2
4
  require 'benchmark'
3
5
 
@@ -5,7 +7,7 @@ NUM = (ARGV.first || 100).to_i
5
7
 
6
8
  # warm up
7
9
  a = Numo::Float32.new(10).seq(1)
8
- b = Numo::Float32.new(10).seq(10,10)
10
+ b = Numo::Float32.new(10).seq(10, 10)
9
11
  c = a + b
10
12
 
11
13
  def elementwise(num = nil)
@@ -13,31 +15,31 @@ def elementwise(num = nil)
13
15
  puts "elementwise(#{num})"
14
16
  Benchmark.bm do |r|
15
17
  a = Numo::Float32.new(10000).seq(1)
16
- b = Numo::Float32.new(10000).seq(10,10)
18
+ b = Numo::Float32.new(10000).seq(10, 10)
17
19
  r.report('10**4') do
18
20
  NUM.times { (a + b) }
19
21
  end
20
22
 
21
23
  a = Numo::Float32.new(100000).seq(1)
22
- b = Numo::Float32.new(100000).seq(10,10)
24
+ b = Numo::Float32.new(100000).seq(10, 10)
23
25
  r.report('10**5') do
24
26
  NUM.times { (a + b) }
25
27
  end
26
28
 
27
29
  a = Numo::Float32.new(1000000).seq(1)
28
- b = Numo::Float32.new(1000000).seq(10,10)
30
+ b = Numo::Float32.new(1000000).seq(10, 10)
29
31
  r.report('10**6') do
30
32
  NUM.times { (a + b) }
31
33
  end
32
34
 
33
35
  a = Numo::Float32.new(10000000).seq(1)
34
- b = Numo::Float32.new(10000000).seq(10,10)
36
+ b = Numo::Float32.new(10000000).seq(10, 10)
35
37
  r.report('10**7') do
36
38
  NUM.times { (a + b) }
37
39
  end
38
40
 
39
41
  a = Numo::Float32.new(100000000).seq(1)
40
- b = Numo::Float32.new(100000000).seq(10,10)
42
+ b = Numo::Float32.new(100000000).seq(10, 10)
41
43
  r.report('10**8') do
42
44
  NUM.times { (a + b) }
43
45
  end
@@ -79,32 +81,32 @@ def dot(num = nil)
79
81
  num ||= 1
80
82
  puts "dot(#{num})"
81
83
  Benchmark.bm do |r|
82
- a = Numo::Float32.new(100,100).seq(1)
83
- b = Numo::Float32.new(100,100).seq(10,10)
84
+ a = Numo::Float32.new(100, 100).seq(1)
85
+ b = Numo::Float32.new(100, 100).seq(10, 10)
84
86
  r.report('10**4') do
85
87
  num.times { a.dot(b) }
86
88
  end
87
89
 
88
- a = Numo::Float32.new(100,1000).seq(1)
89
- b = Numo::Float32.new(1000,100).seq(10,10)
90
+ a = Numo::Float32.new(100, 1000).seq(1)
91
+ b = Numo::Float32.new(1000, 100).seq(10, 10)
90
92
  r.report('10**5') do
91
93
  num.times { a.dot(b) }
92
94
  end
93
95
 
94
- a = Numo::Float32.new(100,10000).seq(1)
95
- b = Numo::Float32.new(10000,100).seq(10,10)
96
+ a = Numo::Float32.new(100, 10000).seq(1)
97
+ b = Numo::Float32.new(10000, 100).seq(10, 10)
96
98
  r.report('10**6') do
97
99
  num.times { a.dot(b) }
98
100
  end
99
101
 
100
- a = Numo::Float32.new(100,100000).seq(1)
101
- b = Numo::Float32.new(100000,100).seq(10,10)
102
+ a = Numo::Float32.new(100, 100000).seq(1)
103
+ b = Numo::Float32.new(100000, 100).seq(10, 10)
102
104
  r.report('10**7') do
103
105
  num.times { a.dot(b) }
104
106
  end
105
107
 
106
- a = Numo::Float32.new(100,1000000).seq(1)
107
- b = Numo::Float32.new(1000000,100).seq(10,10)
108
+ a = Numo::Float32.new(100, 1000000).seq(1)
109
+ b = Numo::Float32.new(1000000, 100).seq(10, 10)
108
110
  r.report('10**8') do
109
111
  num.times { a.dot(b) }
110
112
  end
data/bench/reduction_fp32.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'benchmark'
2
4
  require 'cumo/narray'
3
5
 
@@ -5,7 +7,7 @@ num_iteration = 100
5
7
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
6
8
 
7
9
  Benchmark.bm 30 do |r|
8
- x = Cumo::SFloat.ones([500,500])
10
+ x = Cumo::SFloat.ones([500, 500])
9
11
  r.report "x.sum" do
10
12
  num_iteration.times do
11
13
  x.sum
@@ -13,7 +15,7 @@ Benchmark.bm 30 do |r|
13
15
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
14
16
  end
15
17
 
16
- x = Cumo::SFloat.ones([500,500])
18
+ x = Cumo::SFloat.ones([500, 500])
17
19
  r.report "x.sum(axis: 0)" do
18
20
  num_iteration.times do
19
21
  x.sum(axis: 0)
@@ -21,7 +23,7 @@ Benchmark.bm 30 do |r|
21
23
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
22
24
  end
23
25
 
24
- x = Cumo::SFloat.ones([500,500])
26
+ x = Cumo::SFloat.ones([500, 500])
25
27
  r.report "x.sum(axis: 1)" do
26
28
  num_iteration.times do
27
29
  x.sum(axis: 1)
@@ -29,7 +31,7 @@ Benchmark.bm 30 do |r|
29
31
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
30
32
  end
31
33
 
32
- x = Cumo::SFloat.ones([500,500])
34
+ x = Cumo::SFloat.ones([500, 500])
33
35
  r.report "x.sum(keepdims: true)" do
34
36
  num_iteration.times do
35
37
  x.sum(keepdims: true)
@@ -37,7 +39,7 @@ Benchmark.bm 30 do |r|
37
39
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
38
40
  end
39
41
 
40
- x = Cumo::SFloat.ones([500,500])
42
+ x = Cumo::SFloat.ones([500, 500])
41
43
  r.report "x.sum(axis: 0, keepdims: true)" do
42
44
  num_iteration.times do
43
45
  x.sum(axis: 0, keepdims: true)
@@ -45,7 +47,7 @@ Benchmark.bm 30 do |r|
45
47
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
46
48
  end
47
49
 
48
- x = Cumo::SFloat.ones([500,500])
50
+ x = Cumo::SFloat.ones([500, 500])
49
51
  r.report "x.sum(axis: 1, keepdims: true)" do
50
52
  num_iteration.times do
51
53
  x.sum(axis: 1, keepdims: true)
@@ -53,7 +55,7 @@ Benchmark.bm 30 do |r|
53
55
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
54
56
  end
55
57
 
56
- x = Cumo::SFloat.ones([500,500])
58
+ x = Cumo::SFloat.ones([500, 500])
57
59
  r.report "x.max" do
58
60
  num_iteration.times do
59
61
  x.max
@@ -61,7 +63,7 @@ Benchmark.bm 30 do |r|
61
63
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
62
64
  end
63
65
 
64
- x = Cumo::SFloat.ones([500,500])
66
+ x = Cumo::SFloat.ones([500, 500])
65
67
  r.report "x.max(axis: 0)" do
66
68
  num_iteration.times do
67
69
  x.max(axis: 0)
@@ -69,7 +71,7 @@ Benchmark.bm 30 do |r|
69
71
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
70
72
  end
71
73
 
72
- x = Cumo::SFloat.ones([500,500])
74
+ x = Cumo::SFloat.ones([500, 500])
73
75
  r.report "x.max(axis: 1)" do
74
76
  num_iteration.times do
75
77
  x.max(axis: 1)
@@ -77,7 +79,7 @@ Benchmark.bm 30 do |r|
77
79
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
78
80
  end
79
81
 
80
- x = Cumo::SFloat.ones([500,500])
82
+ x = Cumo::SFloat.ones([500, 500])
81
83
  r.report "x.max(keepdims: true)" do
82
84
  num_iteration.times do
83
85
  x.max(keepdims: true)
@@ -85,7 +87,7 @@ Benchmark.bm 30 do |r|
85
87
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
86
88
  end
87
89
 
88
- x = Cumo::SFloat.ones([500,500])
90
+ x = Cumo::SFloat.ones([500, 500])
89
91
  r.report "x.max(axis: 0, keepdims: true)" do
90
92
  num_iteration.times do
91
93
  x.max(axis: 0, keepdims: true)
@@ -93,7 +95,7 @@ Benchmark.bm 30 do |r|
93
95
  Cumo::CUDA::Runtime.cudaDeviceSynchronize
94
96
  end
95
97
 
96
- x = Cumo::SFloat.ones([500,500])
98
+ x = Cumo::SFloat.ones([500, 500])
97
99
  r.report "x.max(axis: 1, keepdims: true)" do
98
100
  num_iteration.times do
99
101
  x.max(axis: 1, keepdims: true)
data/bin/console CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require "bundler/setup"
4
5
  require "cumo"
data/cumo.gemspec CHANGED
@@ -1,9 +1,9 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  lib = File.expand_path("../lib", __FILE__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
5
 
5
6
  cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([^"]+)"/)[1]
6
- numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip
7
7
 
8
8
  Gem::Specification.new do |spec|
9
9
  spec.name = "cumo"
@@ -16,17 +16,14 @@ Gem::Specification.new do |spec|
16
16
  spec.homepage = "https://github.com/sonots/cumo"
17
17
  spec.license = "BSD-3-Clause"
18
18
 
19
- spec.files = `git ls-files -z`.split("\x0").reject do |f|
20
- f.match(%r{^(test|spec|features)/})
19
+ spec.required_ruby_version = ">= 3.0.0"
20
+
21
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
22
+ f.match(%r{^(test|spec|features|docker)/})
21
23
  end
22
24
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
23
25
  spec.bindir = "exe"
24
26
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25
27
  spec.require_paths = ["lib"]
26
28
  spec.extensions = ["ext/cumo/extconf.rb"]
27
-
28
- spec.add_runtime_dependency "numo-narray", numo_narray_version
29
-
30
- spec.add_development_dependency "bundler", "~> 1.15"
31
- spec.add_development_dependency "rake", "~> 10.0"
32
29
  end
data/docker-build.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+
3
+ script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd)
4
+ docker build -t cumo-dev ${script_dir}
data/docker-launch.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+
3
+ project_dir="$(cd $(dirname ${BASH_SOURCE:-$0}); pwd)/"
4
+ docker run --gpus all -v $project_dir:/workspace -it cumo-dev bash
data/docs/src-tree.md CHANGED
@@ -6,7 +6,7 @@
6
6
  * Technically, it is not possible to use CRuby API such as `VALUE` in .cu files.
7
7
  * CRuby API is not callable from CUDA kernel because they do not have `__device__` modifier.
8
8
  * nvcc does not support `#include RUBY_EXTCONF_H`, so can not include `ruby.h`.
9
- * (RULE) It is allowed to use C++14 codes in .cu files.
9
+ * (RULE) It is allowed to use C++17 codes in .cu files.
10
10
  * Rest of `*.{h,c}` files are for host (CPU).
11
11
  * Call C wrapper functions defined in .cu files.
12
12
  * It can use CRuby API.
data/ext/cumo/cuda/cudnn.c CHANGED
@@ -50,7 +50,7 @@ cumo_cuda_cudnn_handle()
50
50
  @return [Boolean] Returns true if cuDNN is available
51
51
  */
52
52
  static VALUE
53
- rb_cudnn_available_p()
53
+ rb_cudnn_available_p(VALUE self)
54
54
  {
55
55
  #if CUDNN_FOUND
56
56
  return Qtrue;
@@ -72,7 +72,7 @@ Init_cumo_cuda_cudnn(void)
72
72
  rb_define_const(mCUDA, "Cudnn", mCUDNN); // alias
73
73
  eCUDNNError = rb_define_class_under(mCUDA, "CUDNNError", rb_eStandardError);
74
74
 
75
- rb_define_singleton_method(mCUDNN, "available?", RUBY_METHOD_FUNC(rb_cudnn_available_p), 0);
75
+ rb_define_singleton_method(mCUDNN, "available?", rb_cudnn_available_p, 0);
76
76
  #ifdef CUDNN_FOUND
77
77
  rb_define_const(mCUDNN, "CUDNN_POOLING_MAX", INT2NUM(CUDNN_POOLING_MAX));
78
78
  rb_define_const(mCUDNN, "CUDNN_POOLING_MAX_DETERMINISTIC", INT2NUM(CUDNN_POOLING_MAX_DETERMINISTIC));
data/ext/cumo/cuda/cudnn_impl.cpp CHANGED
@@ -74,6 +74,25 @@ cumo_cuda_cudnn_CreateTensorDescriptor(
74
74
  status = cudnnSetTensor4dDescriptor(
75
75
  *desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
76
76
  }
77
+ else if (ndim < 4) {
78
+ // cuDNN 9 fix: Force 4D (N, C, H, W)
79
+ int pad_shape[4] = {1, 1, 1, 1};
80
+
81
+ if (ndim == 1) {
82
+ // 1D: arrays are treated as "Channel" (1, C, 1, 1)
83
+ pad_shape[1] = (int)(shape[0]);
84
+ } else {
85
+ // 2D: [N, C] -> [N, C, 1, 1]
86
+ // 3D: [N, C, H] -> [N, C, H, 1]
87
+ for (int idim = 0; idim < ndim; ++idim) {
88
+ pad_shape[idim] = (int)(shape[idim]);
89
+ }
90
+ }
91
+
92
+ status = cudnnSetTensor4dDescriptor(
93
+ *desc, CUDNN_TENSOR_NCHW, cudnn_dtype,
94
+ pad_shape[0], pad_shape[1], pad_shape[2], pad_shape[3]);
95
+ }
77
96
  else {
78
97
  int int_shape[CUMO_NA_MAX_DIMENSION];
79
98
  for (int idim = 0; idim < ndim; ++idim) {
@@ -514,8 +533,11 @@ cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
514
533
  // TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
515
534
  cudnnBatchNormMode_t
516
535
  cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
517
- if (ndim == 1 && axis[0] == 0) { // (1, channels, (depth, )height, width)
518
- return CUDNN_BATCHNORM_PER_ACTIVATION;
536
+ if (ndim == 1) {
537
+ return CUDNN_BATCHNORM_SPATIAL;
538
+ }
539
+ if (ndim == 2) {
540
+ return CUDNN_BATCHNORM_SPATIAL;
519
541
  }
520
542
  if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
521
543
  (ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) { // (1, channels, (1, )1, 1)
@@ -533,7 +555,7 @@ cumo_cuda_cudnn_CreateBNTensorDescriptor(
533
555
  {
534
556
  cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
535
557
  status = cudnnCreateTensorDescriptor(desc);
536
- if (status = CUDNN_STATUS_SUCCESS) return status;
558
+ if (status == CUDNN_STATUS_SUCCESS) return status;
537
559
 
538
560
  status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
539
561
  return status;
data/ext/cumo/cuda/driver.c CHANGED
@@ -33,7 +33,11 @@ rb_cuCtxCreate(VALUE self, VALUE flags, VALUE dev)
33
33
  CUcontext _pctx;
34
34
  CUresult status;
35
35
 
36
+ #if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
37
+ status = cuCtxCreate(&_pctx, NULL, _flags, _dev);
38
+ #else
36
39
  status = cuCtxCreate(&_pctx, _flags, _dev);
40
+ #endif
37
41
 
38
42
  check_status(status);
39
43
  return SIZET2NUM((size_t)_pctx);
@@ -418,5 +422,9 @@ Init_cumo_cuda_driver()
418
422
 
419
423
  cuInit(0);
420
424
  cuDeviceGet(&cuDevice, 0);
425
+ #if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
426
+ cuCtxCreate(&context, NULL, 0, cuDevice);
427
+ #else
421
428
  cuCtxCreate(&context, 0, cuDevice);
429
+ #endif
422
430
  }
data/ext/cumo/cumo.c CHANGED
@@ -114,13 +114,17 @@ Init_cumo()
114
114
  const char* env;
115
115
  VALUE mCumo;
116
116
 
117
+ #ifdef HAVE_RB_EXT_RACTOR_SAFE
118
+ rb_ext_ractor_safe(true);
119
+ #endif
120
+
117
121
  mCumo = rb_define_module("Cumo");
118
122
 
119
123
  rb_define_const(mCumo, "VERSION", rb_str_new2(CUMO_VERSION));
120
124
 
121
- rb_define_singleton_method(mCumo, "enable_compatible_mode", RUBY_METHOD_FUNC(rb_enable_compatible_mode), 0);
122
- rb_define_singleton_method(mCumo, "disable_compatible_mode", RUBY_METHOD_FUNC(rb_disable_compatible_mode), 0);
123
- rb_define_singleton_method(mCumo, "compatible_mode_enabled?", RUBY_METHOD_FUNC(rb_compatible_mode_enabled_p), 0);
125
+ rb_define_singleton_method(mCumo, "enable_compatible_mode", rb_enable_compatible_mode, 0);
126
+ rb_define_singleton_method(mCumo, "disable_compatible_mode", rb_disable_compatible_mode, 0);
127
+ rb_define_singleton_method(mCumo, "compatible_mode_enabled?", rb_compatible_mode_enabled_p, 0);
124
128
 
125
129
  // default is false
126
130
  env = getenv("CUMO_COMPATIBLE_MODE");