cumo 0.4.3 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +15 -0
- data/.rubocop_todo.yml +1252 -0
- data/3rd_party/mkmf-cu/Gemfile +2 -0
- data/3rd_party/mkmf-cu/Rakefile +2 -1
- data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +2 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +43 -7
- data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +51 -45
- data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +2 -0
- data/3rd_party/mkmf-cu/mkmf-cu.gemspec +3 -1
- data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +5 -3
- data/CHANGELOG.md +85 -0
- data/Dockerfile +34 -0
- data/Gemfile +6 -1
- data/README.md +2 -10
- data/Rakefile +8 -11
- data/bench/broadcast_fp32.rb +28 -26
- data/bench/cumo_bench.rb +18 -16
- data/bench/numo_bench.rb +18 -16
- data/bench/reduction_fp32.rb +14 -12
- data/bin/console +1 -0
- data/cumo.gemspec +6 -9
- data/docker-build.sh +4 -0
- data/docker-launch.sh +4 -0
- data/docs/src-tree.md +1 -1
- data/ext/cumo/cuda/cudnn.c +2 -2
- data/ext/cumo/cuda/cudnn_impl.cpp +25 -3
- data/ext/cumo/cuda/driver.c +8 -0
- data/ext/cumo/cumo.c +7 -3
- data/ext/cumo/depend.erb +15 -13
- data/ext/cumo/extconf.rb +33 -47
- data/ext/cumo/include/cumo/cuda/cudnn.h +3 -1
- data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +13 -6
- data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +3 -3
- data/ext/cumo/include/cumo/intern.h +1 -0
- data/ext/cumo/include/cumo/narray.h +13 -1
- data/ext/cumo/include/cumo/template.h +2 -4
- data/ext/cumo/include/cumo/types/complex_macro.h +1 -1
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +15 -4
- data/ext/cumo/include/cumo/types/float_macro.h +2 -2
- data/ext/cumo/include/cumo/types/real_accum_kernel.h +15 -4
- data/ext/cumo/include/cumo/types/xint_macro.h +3 -2
- data/ext/cumo/include/cumo/types/xint_macro_kernel.h +11 -3
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/narray/array.c +8 -6
- data/ext/cumo/narray/data.c +48 -28
- data/ext/cumo/narray/gen/cogen.rb +8 -7
- data/ext/cumo/narray/gen/cogen_kernel.rb +8 -7
- data/ext/cumo/narray/gen/def/bit.rb +3 -1
- data/ext/cumo/narray/gen/def/dcomplex.rb +2 -0
- data/ext/cumo/narray/gen/def/dfloat.rb +2 -0
- data/ext/cumo/narray/gen/def/int16.rb +2 -0
- data/ext/cumo/narray/gen/def/int32.rb +2 -0
- data/ext/cumo/narray/gen/def/int64.rb +2 -0
- data/ext/cumo/narray/gen/def/int8.rb +2 -0
- data/ext/cumo/narray/gen/def/robject.rb +2 -0
- data/ext/cumo/narray/gen/def/scomplex.rb +2 -0
- data/ext/cumo/narray/gen/def/sfloat.rb +2 -0
- data/ext/cumo/narray/gen/def/uint16.rb +2 -0
- data/ext/cumo/narray/gen/def/uint32.rb +2 -0
- data/ext/cumo/narray/gen/def/uint64.rb +2 -0
- data/ext/cumo/narray/gen/def/uint8.rb +2 -0
- data/ext/cumo/narray/gen/erbln.rb +9 -7
- data/ext/cumo/narray/gen/erbpp2.rb +26 -24
- data/ext/cumo/narray/gen/narray_def.rb +13 -11
- data/ext/cumo/narray/gen/spec.rb +58 -55
- data/ext/cumo/narray/gen/tmpl/accum.c +2 -2
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +1 -1
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +1 -1
- data/ext/cumo/narray/gen/tmpl/aref.c +18 -18
- data/ext/cumo/narray/gen/tmpl/aset.c +16 -16
- data/ext/cumo/narray/gen/tmpl/at.c +34 -0
- data/ext/cumo/narray/gen/tmpl/batch_norm.c +5 -2
- data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c +6 -3
- data/ext/cumo/narray/gen/tmpl/bincount.c +7 -7
- data/ext/cumo/narray/gen/tmpl/clip.c +11 -15
- data/ext/cumo/narray/gen/tmpl/conv.c +1 -1
- data/ext/cumo/narray/gen/tmpl/conv_grad_w.c +3 -1
- data/ext/cumo/narray/gen/tmpl/conv_transpose.c +1 -1
- data/ext/cumo/narray/gen/tmpl/cum.c +1 -1
- data/ext/cumo/narray/gen/tmpl/each.c +4 -2
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +5 -2
- data/ext/cumo/narray/gen/tmpl/fixed_batch_norm.c +5 -2
- data/ext/cumo/narray/gen/tmpl/init_class.c +1 -0
- data/ext/cumo/narray/gen/tmpl/logseq.c +6 -5
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +5 -6
- data/ext/cumo/narray/gen/tmpl/median.c +2 -2
- data/ext/cumo/narray/gen/tmpl/minmax.c +1 -1
- data/ext/cumo/narray/gen/tmpl/poly.c +4 -4
- data/ext/cumo/narray/gen/tmpl/pooling_backward.c +1 -1
- data/ext/cumo/narray/gen/tmpl/pooling_forward.c +1 -1
- data/ext/cumo/narray/gen/tmpl/qsort.c +1 -5
- data/ext/cumo/narray/gen/tmpl/rand.c +8 -6
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +18 -16
- data/ext/cumo/narray/gen/tmpl/seq.c +5 -4
- data/ext/cumo/narray/gen/tmpl/sort.c +3 -3
- data/ext/cumo/narray/gen/tmpl/sort_index.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +26 -32
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +18 -30
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +42 -14
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +5 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +5 -0
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +27 -7
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +21 -7
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +21 -7
- data/ext/cumo/narray/index.c +244 -40
- data/ext/cumo/narray/index_kernel.cu +84 -0
- data/ext/cumo/narray/narray.c +57 -19
- data/ext/cumo/narray/ndloop.c +1 -1
- data/ext/cumo/narray/struct.c +1 -1
- data/lib/cumo/cuda/compile_error.rb +1 -1
- data/lib/cumo/cuda/compiler.rb +23 -22
- data/lib/cumo/cuda/cudnn.rb +1 -1
- data/lib/cumo/cuda/device.rb +1 -1
- data/lib/cumo/cuda/link_state.rb +2 -2
- data/lib/cumo/cuda/module.rb +1 -2
- data/lib/cumo/cuda/nvrtc_program.rb +3 -2
- data/lib/cumo/cuda.rb +2 -0
- data/lib/cumo/linalg.rb +2 -0
- data/lib/cumo/narray/extra.rb +297 -341
- data/lib/cumo/narray.rb +2 -0
- data/lib/cumo.rb +3 -1
- data/test/bit_test.rb +157 -0
- data/test/cuda/compiler_test.rb +69 -0
- data/test/cuda/device_test.rb +31 -0
- data/test/cuda/memory_pool_test.rb +45 -0
- data/test/cuda/nvrtc_test.rb +51 -0
- data/test/cuda/runtime_test.rb +28 -0
- data/test/cudnn_test.rb +498 -0
- data/test/cumo_test.rb +27 -0
- data/test/narray_test.rb +745 -0
- data/test/ractor_test.rb +52 -0
- data/test/test_helper.rb +31 -0
- metadata +34 -54
- data/.travis.yml +0 -5
- data/numo-narray-version +0 -1
data/bench/broadcast_fp32.rb
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'benchmark'
|
|
2
4
|
require 'cumo/narray'
|
|
3
5
|
|
|
4
6
|
num_iteration = 1000
|
|
5
7
|
|
|
6
8
|
Benchmark.bm 20 do |r|
|
|
7
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
8
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
9
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
10
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
9
11
|
r.report "x.inplace + y" do
|
|
10
12
|
num_iteration.times do
|
|
11
13
|
x.inplace + y
|
|
@@ -13,8 +15,8 @@ Benchmark.bm 20 do |r|
|
|
|
13
15
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
14
16
|
end
|
|
15
17
|
|
|
16
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
17
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
18
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
19
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
18
20
|
r.report "x + y" do
|
|
19
21
|
num_iteration.times do
|
|
20
22
|
(x + y).free
|
|
@@ -22,8 +24,8 @@ Benchmark.bm 20 do |r|
|
|
|
22
24
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
23
25
|
end
|
|
24
26
|
|
|
25
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
26
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
27
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
28
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
27
29
|
r.report "x.inplace + 1.0" do
|
|
28
30
|
num_iteration.times do
|
|
29
31
|
x.inplace + 1.0
|
|
@@ -31,8 +33,8 @@ Benchmark.bm 20 do |r|
|
|
|
31
33
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
32
34
|
end
|
|
33
35
|
|
|
34
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
35
|
-
z = Cumo::SFloat.ones([1000,1])
|
|
36
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
37
|
+
z = Cumo::SFloat.ones([1000, 1])
|
|
36
38
|
r.report "x.inplace + z" do
|
|
37
39
|
num_iteration.times do
|
|
38
40
|
x.inplace + z
|
|
@@ -40,8 +42,8 @@ Benchmark.bm 20 do |r|
|
|
|
40
42
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
41
43
|
end
|
|
42
44
|
|
|
43
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
44
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
45
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
46
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
45
47
|
r.report "x.inplace - y" do
|
|
46
48
|
num_iteration.times do
|
|
47
49
|
x.inplace - y
|
|
@@ -49,8 +51,8 @@ Benchmark.bm 20 do |r|
|
|
|
49
51
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
50
52
|
end
|
|
51
53
|
|
|
52
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
53
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
54
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
55
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
54
56
|
r.report "x.inplace - 1.0" do
|
|
55
57
|
num_iteration.times do
|
|
56
58
|
x.inplace - 1.0
|
|
@@ -58,8 +60,8 @@ Benchmark.bm 20 do |r|
|
|
|
58
60
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
59
61
|
end
|
|
60
62
|
|
|
61
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
62
|
-
z = Cumo::SFloat.ones([1000,1])
|
|
63
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
64
|
+
z = Cumo::SFloat.ones([1000, 1])
|
|
63
65
|
r.report "x.inplace - z" do
|
|
64
66
|
num_iteration.times do
|
|
65
67
|
x.inplace - z
|
|
@@ -67,8 +69,8 @@ Benchmark.bm 20 do |r|
|
|
|
67
69
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
68
70
|
end
|
|
69
71
|
|
|
70
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
71
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
72
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
73
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
72
74
|
r.report "x.inplace * y" do
|
|
73
75
|
num_iteration.times do
|
|
74
76
|
x.inplace * y
|
|
@@ -76,8 +78,8 @@ Benchmark.bm 20 do |r|
|
|
|
76
78
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
77
79
|
end
|
|
78
80
|
|
|
79
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
80
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
81
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
82
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
81
83
|
r.report "x.inplace * 1.0" do
|
|
82
84
|
num_iteration.times do
|
|
83
85
|
x.inplace * 1.0
|
|
@@ -85,8 +87,8 @@ Benchmark.bm 20 do |r|
|
|
|
85
87
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
86
88
|
end
|
|
87
89
|
|
|
88
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
89
|
-
z = Cumo::SFloat.ones([1000,1])
|
|
90
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
91
|
+
z = Cumo::SFloat.ones([1000, 1])
|
|
90
92
|
r.report "x.inplace * z" do
|
|
91
93
|
num_iteration.times do
|
|
92
94
|
x.inplace * z
|
|
@@ -94,8 +96,8 @@ Benchmark.bm 20 do |r|
|
|
|
94
96
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
95
97
|
end
|
|
96
98
|
|
|
97
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
98
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
99
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
100
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
99
101
|
r.report "x.inplace / y" do
|
|
100
102
|
num_iteration.times do
|
|
101
103
|
x.inplace / y
|
|
@@ -103,8 +105,8 @@ Benchmark.bm 20 do |r|
|
|
|
103
105
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
104
106
|
end
|
|
105
107
|
|
|
106
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
107
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
108
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
109
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
108
110
|
r.report "x.inplace / 1.0" do
|
|
109
111
|
num_iteration.times do
|
|
110
112
|
x.inplace / 1.0
|
|
@@ -112,8 +114,8 @@ Benchmark.bm 20 do |r|
|
|
|
112
114
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
113
115
|
end
|
|
114
116
|
|
|
115
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
116
|
-
z = Cumo::SFloat.ones([1000,1])
|
|
117
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
118
|
+
z = Cumo::SFloat.ones([1000, 1])
|
|
117
119
|
r.report "x.inplace / z" do
|
|
118
120
|
num_iteration.times do
|
|
119
121
|
x.inplace / z
|
data/bench/cumo_bench.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'cumo/narray'
|
|
2
4
|
require 'benchmark'
|
|
3
5
|
|
|
@@ -5,7 +7,7 @@ NUM = (ARGV.first || 100).to_i
|
|
|
5
7
|
|
|
6
8
|
# warm up
|
|
7
9
|
a = Cumo::Float32.new(10).seq(1)
|
|
8
|
-
b = Cumo::Float32.new(10).seq(10,10)
|
|
10
|
+
b = Cumo::Float32.new(10).seq(10, 10)
|
|
9
11
|
c = a + b
|
|
10
12
|
c.free
|
|
11
13
|
|
|
@@ -14,7 +16,7 @@ def elementwise(num = nil)
|
|
|
14
16
|
puts "elementwise(#{num})"
|
|
15
17
|
Benchmark.bm do |r|
|
|
16
18
|
a = Cumo::Float32.new(10000).seq(1)
|
|
17
|
-
b = Cumo::Float32.new(10000).seq(10,10)
|
|
19
|
+
b = Cumo::Float32.new(10000).seq(10, 10)
|
|
18
20
|
(a + b).free # warm up
|
|
19
21
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
20
22
|
r.report('10**4') do
|
|
@@ -25,7 +27,7 @@ def elementwise(num = nil)
|
|
|
25
27
|
end
|
|
26
28
|
|
|
27
29
|
a = Cumo::Float32.new(100000).seq(1)
|
|
28
|
-
b = Cumo::Float32.new(100000).seq(10,10)
|
|
30
|
+
b = Cumo::Float32.new(100000).seq(10, 10)
|
|
29
31
|
(a + b).free # warm up
|
|
30
32
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
31
33
|
r.report('10**5') do
|
|
@@ -36,7 +38,7 @@ def elementwise(num = nil)
|
|
|
36
38
|
end
|
|
37
39
|
|
|
38
40
|
a = Cumo::Float32.new(1000000).seq(1)
|
|
39
|
-
b = Cumo::Float32.new(1000000).seq(10,10)
|
|
41
|
+
b = Cumo::Float32.new(1000000).seq(10, 10)
|
|
40
42
|
(a + b).free # warm up
|
|
41
43
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
42
44
|
r.report('10**6') do
|
|
@@ -47,7 +49,7 @@ def elementwise(num = nil)
|
|
|
47
49
|
end
|
|
48
50
|
|
|
49
51
|
a = Cumo::Float32.new(10000000).seq(1)
|
|
50
|
-
b = Cumo::Float32.new(10000000).seq(10,10)
|
|
52
|
+
b = Cumo::Float32.new(10000000).seq(10, 10)
|
|
51
53
|
(a + b).free # warm up
|
|
52
54
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
53
55
|
r.report('10**7') do
|
|
@@ -58,7 +60,7 @@ def elementwise(num = nil)
|
|
|
58
60
|
end
|
|
59
61
|
|
|
60
62
|
a = Cumo::Float32.new(100000000).seq(1)
|
|
61
|
-
b = Cumo::Float32.new(100000000).seq(10,10)
|
|
63
|
+
b = Cumo::Float32.new(100000000).seq(10, 10)
|
|
62
64
|
(a + b).free # warm up
|
|
63
65
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
64
66
|
r.report('10**8') do
|
|
@@ -130,8 +132,8 @@ def dot(num = nil)
|
|
|
130
132
|
num ||= 1
|
|
131
133
|
puts "dot(#{num})"
|
|
132
134
|
Benchmark.bm do |r|
|
|
133
|
-
a = Cumo::Float32.new(100,100).seq(1)
|
|
134
|
-
b = Cumo::Float32.new(100,100).seq(10,10)
|
|
135
|
+
a = Cumo::Float32.new(100, 100).seq(1)
|
|
136
|
+
b = Cumo::Float32.new(100, 100).seq(10, 10)
|
|
135
137
|
a.dot(b).free # warm up
|
|
136
138
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
137
139
|
r.report('10**4') do
|
|
@@ -141,8 +143,8 @@ def dot(num = nil)
|
|
|
141
143
|
end
|
|
142
144
|
end
|
|
143
145
|
|
|
144
|
-
a = Cumo::Float32.new(100,1000).seq(1)
|
|
145
|
-
b = Cumo::Float32.new(1000,100).seq(10,10)
|
|
146
|
+
a = Cumo::Float32.new(100, 1000).seq(1)
|
|
147
|
+
b = Cumo::Float32.new(1000, 100).seq(10, 10)
|
|
146
148
|
a.dot(b).free # warm up
|
|
147
149
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
148
150
|
r.report('10**5') do
|
|
@@ -152,8 +154,8 @@ def dot(num = nil)
|
|
|
152
154
|
end
|
|
153
155
|
end
|
|
154
156
|
|
|
155
|
-
a = Cumo::Float32.new(100,10000).seq(1)
|
|
156
|
-
b = Cumo::Float32.new(10000,100).seq(10,10)
|
|
157
|
+
a = Cumo::Float32.new(100, 10000).seq(1)
|
|
158
|
+
b = Cumo::Float32.new(10000, 100).seq(10, 10)
|
|
157
159
|
a.dot(b).free # warm up
|
|
158
160
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
159
161
|
r.report('10**6') do
|
|
@@ -163,8 +165,8 @@ def dot(num = nil)
|
|
|
163
165
|
end
|
|
164
166
|
end
|
|
165
167
|
|
|
166
|
-
a = Cumo::Float32.new(100,100000).seq(1)
|
|
167
|
-
b = Cumo::Float32.new(100000,100).seq(10,10)
|
|
168
|
+
a = Cumo::Float32.new(100, 100000).seq(1)
|
|
169
|
+
b = Cumo::Float32.new(100000, 100).seq(10, 10)
|
|
168
170
|
a.dot(b).free # warm up
|
|
169
171
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
170
172
|
r.report('10**7') do
|
|
@@ -174,8 +176,8 @@ def dot(num = nil)
|
|
|
174
176
|
end
|
|
175
177
|
end
|
|
176
178
|
|
|
177
|
-
a = Cumo::Float32.new(100,1000000).seq(1)
|
|
178
|
-
b = Cumo::Float32.new(1000000,100).seq(10,10)
|
|
179
|
+
a = Cumo::Float32.new(100, 1000000).seq(1)
|
|
180
|
+
b = Cumo::Float32.new(1000000, 100).seq(10, 10)
|
|
179
181
|
a.dot(b).free # warm up
|
|
180
182
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
181
183
|
r.report('10**8') do
|
data/bench/numo_bench.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'numo/narray'
|
|
2
4
|
require 'benchmark'
|
|
3
5
|
|
|
@@ -5,7 +7,7 @@ NUM = (ARGV.first || 100).to_i
|
|
|
5
7
|
|
|
6
8
|
# warm up
|
|
7
9
|
a = Numo::Float32.new(10).seq(1)
|
|
8
|
-
b = Numo::Float32.new(10).seq(10,10)
|
|
10
|
+
b = Numo::Float32.new(10).seq(10, 10)
|
|
9
11
|
c = a + b
|
|
10
12
|
|
|
11
13
|
def elementwise(num = nil)
|
|
@@ -13,31 +15,31 @@ def elementwise(num = nil)
|
|
|
13
15
|
puts "elementwise(#{num})"
|
|
14
16
|
Benchmark.bm do |r|
|
|
15
17
|
a = Numo::Float32.new(10000).seq(1)
|
|
16
|
-
b = Numo::Float32.new(10000).seq(10,10)
|
|
18
|
+
b = Numo::Float32.new(10000).seq(10, 10)
|
|
17
19
|
r.report('10**4') do
|
|
18
20
|
NUM.times { (a + b) }
|
|
19
21
|
end
|
|
20
22
|
|
|
21
23
|
a = Numo::Float32.new(100000).seq(1)
|
|
22
|
-
b = Numo::Float32.new(100000).seq(10,10)
|
|
24
|
+
b = Numo::Float32.new(100000).seq(10, 10)
|
|
23
25
|
r.report('10**5') do
|
|
24
26
|
NUM.times { (a + b) }
|
|
25
27
|
end
|
|
26
28
|
|
|
27
29
|
a = Numo::Float32.new(1000000).seq(1)
|
|
28
|
-
b = Numo::Float32.new(1000000).seq(10,10)
|
|
30
|
+
b = Numo::Float32.new(1000000).seq(10, 10)
|
|
29
31
|
r.report('10**6') do
|
|
30
32
|
NUM.times { (a + b) }
|
|
31
33
|
end
|
|
32
34
|
|
|
33
35
|
a = Numo::Float32.new(10000000).seq(1)
|
|
34
|
-
b = Numo::Float32.new(10000000).seq(10,10)
|
|
36
|
+
b = Numo::Float32.new(10000000).seq(10, 10)
|
|
35
37
|
r.report('10**7') do
|
|
36
38
|
NUM.times { (a + b) }
|
|
37
39
|
end
|
|
38
40
|
|
|
39
41
|
a = Numo::Float32.new(100000000).seq(1)
|
|
40
|
-
b = Numo::Float32.new(100000000).seq(10,10)
|
|
42
|
+
b = Numo::Float32.new(100000000).seq(10, 10)
|
|
41
43
|
r.report('10**8') do
|
|
42
44
|
NUM.times { (a + b) }
|
|
43
45
|
end
|
|
@@ -79,32 +81,32 @@ def dot(num = nil)
|
|
|
79
81
|
num ||= 1
|
|
80
82
|
puts "dot(#{num})"
|
|
81
83
|
Benchmark.bm do |r|
|
|
82
|
-
a = Numo::Float32.new(100,100).seq(1)
|
|
83
|
-
b = Numo::Float32.new(100,100).seq(10,10)
|
|
84
|
+
a = Numo::Float32.new(100, 100).seq(1)
|
|
85
|
+
b = Numo::Float32.new(100, 100).seq(10, 10)
|
|
84
86
|
r.report('10**4') do
|
|
85
87
|
num.times { a.dot(b) }
|
|
86
88
|
end
|
|
87
89
|
|
|
88
|
-
a = Numo::Float32.new(100,1000).seq(1)
|
|
89
|
-
b = Numo::Float32.new(1000,100).seq(10,10)
|
|
90
|
+
a = Numo::Float32.new(100, 1000).seq(1)
|
|
91
|
+
b = Numo::Float32.new(1000, 100).seq(10, 10)
|
|
90
92
|
r.report('10**5') do
|
|
91
93
|
num.times { a.dot(b) }
|
|
92
94
|
end
|
|
93
95
|
|
|
94
|
-
a = Numo::Float32.new(100,10000).seq(1)
|
|
95
|
-
b = Numo::Float32.new(10000,100).seq(10,10)
|
|
96
|
+
a = Numo::Float32.new(100, 10000).seq(1)
|
|
97
|
+
b = Numo::Float32.new(10000, 100).seq(10, 10)
|
|
96
98
|
r.report('10**6') do
|
|
97
99
|
num.times { a.dot(b) }
|
|
98
100
|
end
|
|
99
101
|
|
|
100
|
-
a = Numo::Float32.new(100,100000).seq(1)
|
|
101
|
-
b = Numo::Float32.new(100000,100).seq(10,10)
|
|
102
|
+
a = Numo::Float32.new(100, 100000).seq(1)
|
|
103
|
+
b = Numo::Float32.new(100000, 100).seq(10, 10)
|
|
102
104
|
r.report('10**7') do
|
|
103
105
|
num.times { a.dot(b) }
|
|
104
106
|
end
|
|
105
107
|
|
|
106
|
-
a = Numo::Float32.new(100,1000000).seq(1)
|
|
107
|
-
b = Numo::Float32.new(1000000,100).seq(10,10)
|
|
108
|
+
a = Numo::Float32.new(100, 1000000).seq(1)
|
|
109
|
+
b = Numo::Float32.new(1000000, 100).seq(10, 10)
|
|
108
110
|
r.report('10**8') do
|
|
109
111
|
num.times { a.dot(b) }
|
|
110
112
|
end
|
data/bench/reduction_fp32.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'benchmark'
|
|
2
4
|
require 'cumo/narray'
|
|
3
5
|
|
|
@@ -5,7 +7,7 @@ num_iteration = 100
|
|
|
5
7
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
6
8
|
|
|
7
9
|
Benchmark.bm 30 do |r|
|
|
8
|
-
x = Cumo::SFloat.ones([500,500])
|
|
10
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
9
11
|
r.report "x.sum" do
|
|
10
12
|
num_iteration.times do
|
|
11
13
|
x.sum
|
|
@@ -13,7 +15,7 @@ Benchmark.bm 30 do |r|
|
|
|
13
15
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
14
16
|
end
|
|
15
17
|
|
|
16
|
-
x = Cumo::SFloat.ones([500,500])
|
|
18
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
17
19
|
r.report "x.sum(axis: 0)" do
|
|
18
20
|
num_iteration.times do
|
|
19
21
|
x.sum(axis: 0)
|
|
@@ -21,7 +23,7 @@ Benchmark.bm 30 do |r|
|
|
|
21
23
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
22
24
|
end
|
|
23
25
|
|
|
24
|
-
x = Cumo::SFloat.ones([500,500])
|
|
26
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
25
27
|
r.report "x.sum(axis: 1)" do
|
|
26
28
|
num_iteration.times do
|
|
27
29
|
x.sum(axis: 1)
|
|
@@ -29,7 +31,7 @@ Benchmark.bm 30 do |r|
|
|
|
29
31
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
30
32
|
end
|
|
31
33
|
|
|
32
|
-
x = Cumo::SFloat.ones([500,500])
|
|
34
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
33
35
|
r.report "x.sum(keepdims: true)" do
|
|
34
36
|
num_iteration.times do
|
|
35
37
|
x.sum(keepdims: true)
|
|
@@ -37,7 +39,7 @@ Benchmark.bm 30 do |r|
|
|
|
37
39
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
38
40
|
end
|
|
39
41
|
|
|
40
|
-
x = Cumo::SFloat.ones([500,500])
|
|
42
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
41
43
|
r.report "x.sum(axis: 0, keepdims: true)" do
|
|
42
44
|
num_iteration.times do
|
|
43
45
|
x.sum(axis: 0, keepdims: true)
|
|
@@ -45,7 +47,7 @@ Benchmark.bm 30 do |r|
|
|
|
45
47
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
46
48
|
end
|
|
47
49
|
|
|
48
|
-
x = Cumo::SFloat.ones([500,500])
|
|
50
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
49
51
|
r.report "x.sum(axis: 1, keepdims: true)" do
|
|
50
52
|
num_iteration.times do
|
|
51
53
|
x.sum(axis: 1, keepdims: true)
|
|
@@ -53,7 +55,7 @@ Benchmark.bm 30 do |r|
|
|
|
53
55
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
54
56
|
end
|
|
55
57
|
|
|
56
|
-
x = Cumo::SFloat.ones([500,500])
|
|
58
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
57
59
|
r.report "x.max" do
|
|
58
60
|
num_iteration.times do
|
|
59
61
|
x.max
|
|
@@ -61,7 +63,7 @@ Benchmark.bm 30 do |r|
|
|
|
61
63
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
62
64
|
end
|
|
63
65
|
|
|
64
|
-
x = Cumo::SFloat.ones([500,500])
|
|
66
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
65
67
|
r.report "x.max(axis: 0)" do
|
|
66
68
|
num_iteration.times do
|
|
67
69
|
x.max(axis: 0)
|
|
@@ -69,7 +71,7 @@ Benchmark.bm 30 do |r|
|
|
|
69
71
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
70
72
|
end
|
|
71
73
|
|
|
72
|
-
x = Cumo::SFloat.ones([500,500])
|
|
74
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
73
75
|
r.report "x.max(axis: 1)" do
|
|
74
76
|
num_iteration.times do
|
|
75
77
|
x.max(axis: 1)
|
|
@@ -77,7 +79,7 @@ Benchmark.bm 30 do |r|
|
|
|
77
79
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
78
80
|
end
|
|
79
81
|
|
|
80
|
-
x = Cumo::SFloat.ones([500,500])
|
|
82
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
81
83
|
r.report "x.max(keepdims: true)" do
|
|
82
84
|
num_iteration.times do
|
|
83
85
|
x.max(keepdims: true)
|
|
@@ -85,7 +87,7 @@ Benchmark.bm 30 do |r|
|
|
|
85
87
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
86
88
|
end
|
|
87
89
|
|
|
88
|
-
x = Cumo::SFloat.ones([500,500])
|
|
90
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
89
91
|
r.report "x.max(axis: 0, keepdims: true)" do
|
|
90
92
|
num_iteration.times do
|
|
91
93
|
x.max(axis: 0, keepdims: true)
|
|
@@ -93,7 +95,7 @@ Benchmark.bm 30 do |r|
|
|
|
93
95
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
94
96
|
end
|
|
95
97
|
|
|
96
|
-
x = Cumo::SFloat.ones([500,500])
|
|
98
|
+
x = Cumo::SFloat.ones([500, 500])
|
|
97
99
|
r.report "x.max(axis: 1, keepdims: true)" do
|
|
98
100
|
num_iteration.times do
|
|
99
101
|
x.max(axis: 1, keepdims: true)
|
data/bin/console
CHANGED
data/cumo.gemspec
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
#
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
2
3
|
lib = File.expand_path("../lib", __FILE__)
|
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
5
|
|
|
5
6
|
cumo_version = File.read(File.join(__dir__, "ext/cumo/include/cumo.h")).match(/CUMO_VERSION "([^"]+)"/)[1]
|
|
6
|
-
numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip
|
|
7
7
|
|
|
8
8
|
Gem::Specification.new do |spec|
|
|
9
9
|
spec.name = "cumo"
|
|
@@ -16,17 +16,14 @@ Gem::Specification.new do |spec|
|
|
|
16
16
|
spec.homepage = "https://github.com/sonots/cumo"
|
|
17
17
|
spec.license = "BSD-3-Clause"
|
|
18
18
|
|
|
19
|
-
spec.
|
|
20
|
-
|
|
19
|
+
spec.required_ruby_version = ">= 3.0.0"
|
|
20
|
+
|
|
21
|
+
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
|
22
|
+
f.match(%r{^(test|spec|features|docker)/})
|
|
21
23
|
end
|
|
22
24
|
spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
|
23
25
|
spec.bindir = "exe"
|
|
24
26
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
25
27
|
spec.require_paths = ["lib"]
|
|
26
28
|
spec.extensions = ["ext/cumo/extconf.rb"]
|
|
27
|
-
|
|
28
|
-
spec.add_runtime_dependency "numo-narray", numo_narray_version
|
|
29
|
-
|
|
30
|
-
spec.add_development_dependency "bundler", "~> 1.15"
|
|
31
|
-
spec.add_development_dependency "rake", "~> 10.0"
|
|
32
29
|
end
|
data/docker-build.sh
ADDED
data/docker-launch.sh
ADDED
data/docs/src-tree.md
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* Technically, it is not possible to use CRuby API such as `VALUE` in .cu files.
|
|
7
7
|
* CRuby API is not callable from CUDA kernel because they do not have `__device__` modifier.
|
|
8
8
|
* nvcc does not support `#include RUBY_EXTCONF_H`, so can not include `ruby.h`.
|
|
9
|
-
* (RULE) It is allowed to use C++
|
|
9
|
+
* (RULE) It is allowed to use C++17 codes in .cu files.
|
|
10
10
|
* Rest of `*.{h,c}` files are for host (CPU).
|
|
11
11
|
* Call C wrapper functions defined in .cu files.
|
|
12
12
|
* It can use CRuby API.
|
data/ext/cumo/cuda/cudnn.c
CHANGED
|
@@ -50,7 +50,7 @@ cumo_cuda_cudnn_handle()
|
|
|
50
50
|
@return [Boolean] Returns true if cuDNN is available
|
|
51
51
|
*/
|
|
52
52
|
static VALUE
|
|
53
|
-
rb_cudnn_available_p()
|
|
53
|
+
rb_cudnn_available_p(VALUE self)
|
|
54
54
|
{
|
|
55
55
|
#if CUDNN_FOUND
|
|
56
56
|
return Qtrue;
|
|
@@ -72,7 +72,7 @@ Init_cumo_cuda_cudnn(void)
|
|
|
72
72
|
rb_define_const(mCUDA, "Cudnn", mCUDNN); // alias
|
|
73
73
|
eCUDNNError = rb_define_class_under(mCUDA, "CUDNNError", rb_eStandardError);
|
|
74
74
|
|
|
75
|
-
rb_define_singleton_method(mCUDNN, "available?",
|
|
75
|
+
rb_define_singleton_method(mCUDNN, "available?", rb_cudnn_available_p, 0);
|
|
76
76
|
#ifdef CUDNN_FOUND
|
|
77
77
|
rb_define_const(mCUDNN, "CUDNN_POOLING_MAX", INT2NUM(CUDNN_POOLING_MAX));
|
|
78
78
|
rb_define_const(mCUDNN, "CUDNN_POOLING_MAX_DETERMINISTIC", INT2NUM(CUDNN_POOLING_MAX_DETERMINISTIC));
|
data/ext/cumo/cuda/cudnn_impl.cpp
CHANGED
|
@@ -74,6 +74,25 @@ cumo_cuda_cudnn_CreateTensorDescriptor(
|
|
|
74
74
|
status = cudnnSetTensor4dDescriptor(
|
|
75
75
|
*desc, CUDNN_TENSOR_NCHW, cudnn_dtype, shape[0], shape[1], shape[2], shape[3]);
|
|
76
76
|
}
|
|
77
|
+
else if (ndim < 4) {
|
|
78
|
+
// cuDNN 9 fix: Force 4D (N, C, H, W)
|
|
79
|
+
int pad_shape[4] = {1, 1, 1, 1};
|
|
80
|
+
|
|
81
|
+
if (ndim == 1) {
|
|
82
|
+
// 1D: arrays are treated as "Channel" (1, C, 1, 1)
|
|
83
|
+
pad_shape[1] = (int)(shape[0]);
|
|
84
|
+
} else {
|
|
85
|
+
// 2D: [N, C] -> [N, C, 1, 1]
|
|
86
|
+
// 3D: [N, C, H] -> [N, C, H, 1]
|
|
87
|
+
for (int idim = 0; idim < ndim; ++idim) {
|
|
88
|
+
pad_shape[idim] = (int)(shape[idim]);
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
status = cudnnSetTensor4dDescriptor(
|
|
93
|
+
*desc, CUDNN_TENSOR_NCHW, cudnn_dtype,
|
|
94
|
+
pad_shape[0], pad_shape[1], pad_shape[2], pad_shape[3]);
|
|
95
|
+
}
|
|
77
96
|
else {
|
|
78
97
|
int int_shape[CUMO_NA_MAX_DIMENSION];
|
|
79
98
|
for (int idim = 0; idim < ndim; ++idim) {
|
|
@@ -514,8 +533,11 @@ cumo_cuda_cudnn_FindConvolutionBackwardFilterAlgorithm(
|
|
|
514
533
|
// TODO(sonots): Support other than 4, 5 dimensional arrays by reshaping into 4-dimensional arrays as Chainer does.
|
|
515
534
|
cudnnBatchNormMode_t
|
|
516
535
|
cumo_cuda_cudnn_GetBatchNormMode(size_t ndim, int* axis) {
|
|
517
|
-
if (ndim == 1
|
|
518
|
-
return
|
|
536
|
+
if (ndim == 1) {
|
|
537
|
+
return CUDNN_BATCHNORM_SPATIAL;
|
|
538
|
+
}
|
|
539
|
+
if (ndim == 2) {
|
|
540
|
+
return CUDNN_BATCHNORM_SPATIAL;
|
|
519
541
|
}
|
|
520
542
|
if ((ndim == 3 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3) ||
|
|
521
543
|
(ndim == 4 && axis[0] == 0 && axis[1] == 2 && axis[2] == 3 && axis[3] == 4)) { // (1, channels, (1, )1, 1)
|
|
@@ -533,7 +555,7 @@ cumo_cuda_cudnn_CreateBNTensorDescriptor(
|
|
|
533
555
|
{
|
|
534
556
|
cudnnStatus_t status = CUDNN_STATUS_SUCCESS;
|
|
535
557
|
status = cudnnCreateTensorDescriptor(desc);
|
|
536
|
-
if (status
|
|
558
|
+
if (status != CUDNN_STATUS_SUCCESS) return status;
|
|
537
559
|
|
|
538
560
|
status = cudnnDeriveBNTensorDescriptor(*desc, x_desc, mode);
|
|
539
561
|
return status;
|
data/ext/cumo/cuda/driver.c
CHANGED
|
@@ -33,7 +33,11 @@ rb_cuCtxCreate(VALUE self, VALUE flags, VALUE dev)
|
|
|
33
33
|
CUcontext _pctx;
|
|
34
34
|
CUresult status;
|
|
35
35
|
|
|
36
|
+
#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
|
|
37
|
+
status = cuCtxCreate(&_pctx, NULL, _flags, _dev);
|
|
38
|
+
#else
|
|
36
39
|
status = cuCtxCreate(&_pctx, _flags, _dev);
|
|
40
|
+
#endif
|
|
37
41
|
|
|
38
42
|
check_status(status);
|
|
39
43
|
return SIZET2NUM((size_t)_pctx);
|
|
@@ -418,5 +422,9 @@ Init_cumo_cuda_driver()
|
|
|
418
422
|
|
|
419
423
|
cuInit(0);
|
|
420
424
|
cuDeviceGet(&cuDevice, 0);
|
|
425
|
+
#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
|
|
426
|
+
cuCtxCreate(&context, NULL, 0, cuDevice);
|
|
427
|
+
#else
|
|
421
428
|
cuCtxCreate(&context, 0, cuDevice);
|
|
429
|
+
#endif
|
|
422
430
|
}
|
data/ext/cumo/cumo.c
CHANGED
|
@@ -114,13 +114,17 @@ Init_cumo()
|
|
|
114
114
|
const char* env;
|
|
115
115
|
VALUE mCumo;
|
|
116
116
|
|
|
117
|
+
#ifdef HAVE_RB_EXT_RACTOR_SAFE
|
|
118
|
+
rb_ext_ractor_safe(true);
|
|
119
|
+
#endif
|
|
120
|
+
|
|
117
121
|
mCumo = rb_define_module("Cumo");
|
|
118
122
|
|
|
119
123
|
rb_define_const(mCumo, "VERSION", rb_str_new2(CUMO_VERSION));
|
|
120
124
|
|
|
121
|
-
rb_define_singleton_method(mCumo, "enable_compatible_mode",
|
|
122
|
-
rb_define_singleton_method(mCumo, "disable_compatible_mode",
|
|
123
|
-
rb_define_singleton_method(mCumo, "compatible_mode_enabled?",
|
|
125
|
+
rb_define_singleton_method(mCumo, "enable_compatible_mode", rb_enable_compatible_mode, 0);
|
|
126
|
+
rb_define_singleton_method(mCumo, "disable_compatible_mode", rb_disable_compatible_mode, 0);
|
|
127
|
+
rb_define_singleton_method(mCumo, "compatible_mode_enabled?", rb_compatible_mode_enabled_p, 0);
|
|
124
128
|
|
|
125
129
|
// default is false
|
|
126
130
|
env = getenv("CUMO_COMPATIBLE_MODE");
|