cumo 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +15 -0
- data/.rubocop_todo.yml +1272 -0
- data/3rd_party/mkmf-cu/Gemfile +2 -0
- data/3rd_party/mkmf-cu/Rakefile +2 -1
- data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +2 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +36 -7
- data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +51 -45
- data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +2 -0
- data/3rd_party/mkmf-cu/mkmf-cu.gemspec +3 -1
- data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +5 -3
- data/CHANGELOG.md +69 -0
- data/Gemfile +6 -1
- data/README.md +2 -10
- data/Rakefile +8 -11
- data/bench/broadcast_fp32.rb +28 -26
- data/bench/cumo_bench.rb +18 -16
- data/bench/numo_bench.rb +18 -16
- data/bench/reduction_fp32.rb +14 -12
- data/bin/console +1 -0
- data/cumo.gemspec +5 -8
- data/ext/cumo/cuda/cudnn.c +2 -2
- data/ext/cumo/cumo.c +7 -3
- data/ext/cumo/depend.erb +15 -13
- data/ext/cumo/extconf.rb +32 -46
- data/ext/cumo/include/cumo/cuda/cudnn.h +3 -1
- data/ext/cumo/include/cumo/intern.h +1 -0
- data/ext/cumo/include/cumo/narray.h +13 -1
- data/ext/cumo/include/cumo/template.h +2 -4
- data/ext/cumo/include/cumo/types/complex_macro.h +1 -1
- data/ext/cumo/include/cumo/types/float_macro.h +2 -2
- data/ext/cumo/include/cumo/types/xint_macro.h +3 -2
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/narray/array.c +3 -3
- data/ext/cumo/narray/data.c +23 -2
- data/ext/cumo/narray/gen/cogen.rb +8 -7
- data/ext/cumo/narray/gen/cogen_kernel.rb +8 -7
- data/ext/cumo/narray/gen/def/bit.rb +3 -1
- data/ext/cumo/narray/gen/def/dcomplex.rb +2 -0
- data/ext/cumo/narray/gen/def/dfloat.rb +2 -0
- data/ext/cumo/narray/gen/def/int16.rb +2 -0
- data/ext/cumo/narray/gen/def/int32.rb +2 -0
- data/ext/cumo/narray/gen/def/int64.rb +2 -0
- data/ext/cumo/narray/gen/def/int8.rb +2 -0
- data/ext/cumo/narray/gen/def/robject.rb +2 -0
- data/ext/cumo/narray/gen/def/scomplex.rb +2 -0
- data/ext/cumo/narray/gen/def/sfloat.rb +2 -0
- data/ext/cumo/narray/gen/def/uint16.rb +2 -0
- data/ext/cumo/narray/gen/def/uint32.rb +2 -0
- data/ext/cumo/narray/gen/def/uint64.rb +2 -0
- data/ext/cumo/narray/gen/def/uint8.rb +2 -0
- data/ext/cumo/narray/gen/erbln.rb +9 -7
- data/ext/cumo/narray/gen/erbpp2.rb +26 -24
- data/ext/cumo/narray/gen/narray_def.rb +13 -11
- data/ext/cumo/narray/gen/spec.rb +58 -55
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +1 -1
- data/ext/cumo/narray/gen/tmpl/at.c +34 -0
- data/ext/cumo/narray/gen/tmpl/batch_norm.c +1 -1
- data/ext/cumo/narray/gen/tmpl/batch_norm_backward.c +2 -2
- data/ext/cumo/narray/gen/tmpl/conv.c +1 -1
- data/ext/cumo/narray/gen/tmpl/conv_grad_w.c +3 -1
- data/ext/cumo/narray/gen/tmpl/conv_transpose.c +1 -1
- data/ext/cumo/narray/gen/tmpl/fixed_batch_norm.c +1 -1
- data/ext/cumo/narray/gen/tmpl/init_class.c +1 -0
- data/ext/cumo/narray/gen/tmpl/pooling_backward.c +1 -1
- data/ext/cumo/narray/gen/tmpl/pooling_forward.c +1 -1
- data/ext/cumo/narray/gen/tmpl/qsort.c +1 -5
- data/ext/cumo/narray/gen/tmpl/sort.c +1 -1
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +42 -14
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +5 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +5 -0
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +27 -7
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +21 -7
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +21 -7
- data/ext/cumo/narray/index.c +243 -39
- data/ext/cumo/narray/index_kernel.cu +84 -0
- data/ext/cumo/narray/narray.c +38 -1
- data/ext/cumo/narray/ndloop.c +1 -1
- data/ext/cumo/narray/struct.c +1 -1
- data/lib/cumo/cuda/compile_error.rb +1 -1
- data/lib/cumo/cuda/compiler.rb +23 -22
- data/lib/cumo/cuda/cudnn.rb +1 -1
- data/lib/cumo/cuda/device.rb +1 -1
- data/lib/cumo/cuda/link_state.rb +2 -2
- data/lib/cumo/cuda/module.rb +1 -2
- data/lib/cumo/cuda/nvrtc_program.rb +3 -2
- data/lib/cumo/cuda.rb +2 -0
- data/lib/cumo/linalg.rb +2 -0
- data/lib/cumo/narray/extra.rb +137 -185
- data/lib/cumo/narray.rb +2 -0
- data/lib/cumo.rb +3 -1
- data/test/bit_test.rb +157 -0
- data/test/cuda/compiler_test.rb +69 -0
- data/test/cuda/device_test.rb +30 -0
- data/test/cuda/memory_pool_test.rb +45 -0
- data/test/cuda/nvrtc_test.rb +51 -0
- data/test/cuda/runtime_test.rb +28 -0
- data/test/cudnn_test.rb +498 -0
- data/test/cumo_test.rb +27 -0
- data/test/narray_test.rb +745 -0
- data/test/ractor_test.rb +52 -0
- data/test/test_helper.rb +31 -0
- metadata +31 -54
- data/.travis.yml +0 -5
- data/numo-narray-version +0 -1
data/3rd_party/mkmf-cu/Gemfile
CHANGED
data/3rd_party/mkmf-cu/Rakefile
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require "open3"
|
|
2
4
|
require_relative "nvcc"
|
|
3
5
|
|
|
@@ -6,7 +8,7 @@ module MakeMakefileCuda
|
|
|
6
8
|
attr_reader :argv
|
|
7
9
|
|
|
8
10
|
def initialize(argv)
|
|
9
|
-
@argv = argv.map{|e| e.dup }
|
|
11
|
+
@argv = argv.map { |e| e.dup }
|
|
10
12
|
end
|
|
11
13
|
|
|
12
14
|
def run
|
|
@@ -32,17 +34,35 @@ module MakeMakefileCuda
|
|
|
32
34
|
# TODO(sonots): Make it possible to configure "nvcc" and additional arguments
|
|
33
35
|
def nvcc_command
|
|
34
36
|
s = MakeMakefileCuda::Nvcc.generate(argv)
|
|
35
|
-
cmd = "nvcc "
|
|
37
|
+
cmd = "nvcc #{s}"
|
|
36
38
|
if ENV['CUMO_NVCC_GENERATE_CODE']
|
|
37
39
|
cmd << " --generate-code=#{ENV['CUMO_NVCC_GENERATE_CODE']}"
|
|
38
40
|
elsif ENV['DEBUG']
|
|
39
41
|
cmd << " -arch=sm_35"
|
|
40
42
|
else
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
43
|
+
# Ref. https://en.wikipedia.org/wiki/CUDA
|
|
44
|
+
if cuda_version >= Gem::Version.new("13.0")
|
|
45
|
+
# CUDA 13.0
|
|
46
|
+
capability = [75, 87, 89, 90, 121]
|
|
47
|
+
elsif cuda_version >= Gem::Version.new("12.9")
|
|
48
|
+
# CUDA 12.9
|
|
49
|
+
capability = [50, 60, 70, 75, 87, 89, 90, 121]
|
|
50
|
+
elsif cuda_version >= Gem::Version.new("12.8")
|
|
51
|
+
# CUDA 12.8
|
|
52
|
+
capability = [50, 60, 70, 75, 87, 89, 90, 120]
|
|
53
|
+
elsif cuda_version >= Gem::Version.new("12.0")
|
|
54
|
+
# CUDA 12.0 – 12.6
|
|
55
|
+
capability = [50, 60, 70, 75, 87, 89, 90]
|
|
56
|
+
elsif cuda_version >= Gem::Version.new("11.8")
|
|
57
|
+
# CUDA 11.8
|
|
58
|
+
capability = [35, 50, 60, 70, 75, 87, 89, 90]
|
|
59
|
+
else
|
|
60
|
+
# CUDA 11.0
|
|
61
|
+
capability = [35, 50, 60, 70, 75, 80]
|
|
62
|
+
end
|
|
63
|
+
capability.each do |arch|
|
|
64
|
+
cmd << " --generate-code=arch=compute_#{arch},code=sm_#{arch}"
|
|
65
|
+
end
|
|
46
66
|
end
|
|
47
67
|
cmd
|
|
48
68
|
end
|
|
@@ -88,5 +108,14 @@ module MakeMakefileCuda
|
|
|
88
108
|
raise "#{color_code} is not supported" unless COLOR_CODES[code]
|
|
89
109
|
"\e[#{COLOR_CODES[code]}m#{str}\e[0m"
|
|
90
110
|
end
|
|
111
|
+
|
|
112
|
+
def cuda_version
|
|
113
|
+
@cuda_version ||= begin
|
|
114
|
+
output = `nvcc --version`
|
|
115
|
+
if output =~ /Cuda compilation tools, release ([^,]*),/
|
|
116
|
+
Gem::Version.new($1)
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
end
|
|
91
120
|
end
|
|
92
121
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require "optparse"
|
|
2
4
|
require "rbconfig"
|
|
3
5
|
|
|
@@ -29,50 +31,50 @@ module MakeMakefileCuda
|
|
|
29
31
|
|
|
30
32
|
def build_optparser
|
|
31
33
|
opt = OptionParser.new
|
|
32
|
-
opt_h = Hash.new{|h, k| h[k] = [] }
|
|
33
|
-
|
|
34
|
-
opt.on("--arch arg") {|v| opt_h["-arch"] << v }
|
|
35
|
-
opt.on("--std arg") {|v| opt_h["-std"] << v }
|
|
36
|
-
opt.on("--stdlib arg") {|v| opt_h["-stdlib"] << v }
|
|
37
|
-
|
|
38
|
-
opt.on("--Wl arg") {|v| opt_h["-Wl"] << v }
|
|
39
|
-
|
|
40
|
-
opt.on('--profile') {|v| opt_h["-pg"] << "" }
|
|
41
|
-
opt.on('-g') {|v| opt_h["-g"] << "" }
|
|
42
|
-
opt.on('-G', "--device-debug") {|v| opt_h["-G"] << "" }
|
|
43
|
-
|
|
44
|
-
opt.on('-I path') {|v| opt_h["-I"] << v }
|
|
45
|
-
opt.on('-D flag') {|v| opt_h["-D"] << v }
|
|
46
|
-
opt.on('-W flag') {|v| opt_h["-W"] << v }
|
|
47
|
-
opt.on('-o output') {|v| opt_h["-o"] << v }
|
|
48
|
-
opt.on('-c file') {|v| opt_h["-c"] << v }
|
|
49
|
-
opt.on('-f flag') {|v| opt_h["-f"] << v }
|
|
50
|
-
opt.on('-l file') {|v| opt_h["-l"] << v }
|
|
51
|
-
opt.on('-L path') {|v| opt_h["-L"] << v }
|
|
52
|
-
opt.on('-x pat', "--x pat") {|v| opt_h["-x"] << v }
|
|
53
|
-
opt.on('-O num'){|v| opt_h["-O"] << v if /[0-9]/ =~ v }
|
|
54
|
-
opt.on('--mkmf-cu-ext ext'){|v| opt_h["--mkmf-cu-ext"] << v}
|
|
34
|
+
opt_h = Hash.new { |h, k| h[k] = [] }
|
|
35
|
+
|
|
36
|
+
opt.on("--arch arg") { |v| opt_h["-arch"] << v }
|
|
37
|
+
opt.on("--std arg") { |v| opt_h["-std"] << v }
|
|
38
|
+
opt.on("--stdlib arg") { |v| opt_h["-stdlib"] << v }
|
|
39
|
+
|
|
40
|
+
opt.on("--Wl arg") { |v| opt_h["-Wl"] << v }
|
|
41
|
+
|
|
42
|
+
opt.on('--profile') { |v| opt_h["-pg"] << "" }
|
|
43
|
+
opt.on('-g') { |v| opt_h["-g"] << "" }
|
|
44
|
+
opt.on('-G', "--device-debug") { |v| opt_h["-G"] << "" }
|
|
45
|
+
|
|
46
|
+
opt.on('-I path') { |v| opt_h["-I"] << quote(v) }
|
|
47
|
+
opt.on('-D flag') { |v| opt_h["-D"] << v }
|
|
48
|
+
opt.on('-W flag') { |v| opt_h["-W"] << v }
|
|
49
|
+
opt.on('-o output') { |v| opt_h["-o"] << quote(v) }
|
|
50
|
+
opt.on('-c file') { |v| opt_h["-c"] << quote(v) }
|
|
51
|
+
opt.on('-f flag') { |v| opt_h["-f"] << v }
|
|
52
|
+
opt.on('-l file') { |v| opt_h["-l"] << quote(v) }
|
|
53
|
+
opt.on('-L path') { |v| opt_h["-L"] << quote(v) }
|
|
54
|
+
opt.on('-x pat', "--x pat") { |v| opt_h["-x"] << v }
|
|
55
|
+
opt.on('-O num') { |v| opt_h["-O"] << v if /[0-9]/ =~ v }
|
|
56
|
+
opt.on('--mkmf-cu-ext ext') { |v| opt_h["--mkmf-cu-ext"] << v }
|
|
55
57
|
|
|
56
58
|
return [opt, opt_h]
|
|
57
59
|
end
|
|
58
60
|
|
|
59
61
|
def parse_ill_short(argv, opt_h)
|
|
60
|
-
["-shared", "-rdynamic", "-dynamic", "-bundle",
|
|
62
|
+
["-shared", "-rdynamic", "-dynamic", "-bundle", "-pipe", "-pg", "-ggdb3"].each { |opt|
|
|
61
63
|
if ind = argv.find_index(opt)
|
|
62
64
|
opt_h[opt] << ""
|
|
63
65
|
argv.delete_at(ind)
|
|
64
66
|
end
|
|
65
67
|
}
|
|
66
|
-
["-arch", "-std", "-stdlib"].each{|opt|
|
|
68
|
+
["-arch", "-std", "-stdlib"].each { |opt|
|
|
67
69
|
if ind = argv.find_index(opt)
|
|
68
70
|
argv[ind] = "-" + opt
|
|
69
71
|
end
|
|
70
72
|
}
|
|
71
73
|
end
|
|
72
74
|
|
|
73
|
-
def parse_ill_short_with_arg(argv, opt_h)
|
|
74
|
-
[/\A(\-stdlib)=(.*)/, /\A(\-std)=(.*)/, /\A(\-Wl),(.*)/].each{|reg|
|
|
75
|
-
argv.each{|e|
|
|
75
|
+
def parse_ill_short_with_arg(argv, opt_h)
|
|
76
|
+
[/\A(\-stdlib)=(.*)/, /\A(\-std)=(.*)/, /\A(\-Wl),(.*)/].each { |reg|
|
|
77
|
+
argv.each { |e|
|
|
76
78
|
if reg =~ e
|
|
77
79
|
e[0..-1] = "-" + $1 + '=' + $2
|
|
78
80
|
end
|
|
@@ -81,14 +83,14 @@ module MakeMakefileCuda
|
|
|
81
83
|
end
|
|
82
84
|
|
|
83
85
|
def compiler_option(opt_h)
|
|
84
|
-
ret = ""
|
|
85
|
-
["-f", "-W", "-pipe"].each{|op|
|
|
86
|
-
opt_h[op].each{|e|
|
|
86
|
+
ret = +""
|
|
87
|
+
["-f", "-W", "-pipe"].each { |op|
|
|
88
|
+
opt_h[op].each { |e|
|
|
87
89
|
ret << " --compiler-options " + "#{op}#{e}"
|
|
88
90
|
}
|
|
89
91
|
}
|
|
90
|
-
["-stdlib", "-std"].each{|op|
|
|
91
|
-
opt_h[op].each{|e|
|
|
92
|
+
["-stdlib", "-std"].each { |op|
|
|
93
|
+
opt_h[op].each { |e|
|
|
92
94
|
ret << " --compiler-options " + "#{op}=#{e}"
|
|
93
95
|
}
|
|
94
96
|
}
|
|
@@ -96,13 +98,13 @@ module MakeMakefileCuda
|
|
|
96
98
|
end
|
|
97
99
|
|
|
98
100
|
def linker_option(opt_h)
|
|
99
|
-
ret = " -shared "
|
|
100
|
-
["-dynamic", "-bundle"].each{|op|
|
|
101
|
-
opt_h[op].each{|e|
|
|
101
|
+
ret = +" -shared "
|
|
102
|
+
["-dynamic", "-bundle"].each { |op|
|
|
103
|
+
opt_h[op].each { |e|
|
|
102
104
|
ret << " --linker-options " + op
|
|
103
105
|
}
|
|
104
106
|
}
|
|
105
|
-
opt_h["-Wl"].each{|e|
|
|
107
|
+
opt_h["-Wl"].each { |e|
|
|
106
108
|
ret << " --linker-options " + e
|
|
107
109
|
}
|
|
108
110
|
return ret
|
|
@@ -110,17 +112,17 @@ module MakeMakefileCuda
|
|
|
110
112
|
|
|
111
113
|
def compiler_bin(opt_h)
|
|
112
114
|
if opt_h["--mkmf-cu-ext"][0] == "c"
|
|
113
|
-
" --compiler-bindir " + RbConfig::CONFIG["CC"]
|
|
115
|
+
" --compiler-bindir " + ENV.fetch("NVCC_CCBIN", RbConfig::CONFIG["CC"])
|
|
114
116
|
elsif opt_h["--mkmf-cu-ext"][0] == "cxx"
|
|
115
|
-
" --compiler-bindir " + RbConfig::CONFIG["CXX"]
|
|
117
|
+
" --compiler-bindir " + ENV.fetch("NVCC_CCBIN", RbConfig::CONFIG["CXX"])
|
|
116
118
|
end
|
|
117
119
|
end
|
|
118
120
|
|
|
119
121
|
def generate_compiling_command_line(opt_h)
|
|
120
|
-
s = ""
|
|
122
|
+
s = +""
|
|
121
123
|
# options nvcc can understand
|
|
122
|
-
["-std", "-pg", "-g", "-G", "-x", "-I", "-D", "-o", "-c", "-O"].each{|op|
|
|
123
|
-
opt_h[op].each{|e|
|
|
124
|
+
["-std", "-pg", "-g", "-G", "-x", "-I", "-D", "-o", "-c", "-O"].each { |op|
|
|
125
|
+
opt_h[op].each { |e|
|
|
124
126
|
case op
|
|
125
127
|
when "-o", "-c", "-x", "-std"
|
|
126
128
|
s << " #{op} #{e}"
|
|
@@ -136,9 +138,9 @@ module MakeMakefileCuda
|
|
|
136
138
|
end
|
|
137
139
|
|
|
138
140
|
def generate_linking_command_line(argv, opt_h)
|
|
139
|
-
s = ""
|
|
140
|
-
["-L", "-l", "-o", "-c", "-O"].each{|op|
|
|
141
|
-
opt_h[op].each{|e|
|
|
141
|
+
s = +""
|
|
142
|
+
["-L", "-l", "-o", "-c", "-O"].each { |op|
|
|
143
|
+
opt_h[op].each { |e|
|
|
142
144
|
case op
|
|
143
145
|
when "-o", "-c"
|
|
144
146
|
s << " #{op} #{e}"
|
|
@@ -153,5 +155,9 @@ module MakeMakefileCuda
|
|
|
153
155
|
s << compiler_bin(opt_h)
|
|
154
156
|
return s
|
|
155
157
|
end
|
|
158
|
+
|
|
159
|
+
def quote(str)
|
|
160
|
+
"\"#{str}\""
|
|
161
|
+
end
|
|
156
162
|
end
|
|
157
163
|
end
|
|
@@ -1,10 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
Gem::Specification.new do |s|
|
|
2
4
|
s.name = 'mkmf-cu'
|
|
3
5
|
s.version = '0.1.2'
|
|
4
6
|
s.date = '2016-03-26'
|
|
5
7
|
s.summary = "Write Ruby extension in C/C++ with NVIDIA CUDA."
|
|
6
8
|
s.description =
|
|
7
|
-
|
|
9
|
+
"Write Ruby extension in C/C++ with NVIDIA CUDA. A simple wrapper command for nvcc and a monkey patch for mkmf."
|
|
8
10
|
s.authors = ["Takashi Tamura"]
|
|
9
11
|
s.email = ''
|
|
10
12
|
s.files = ["lib/mkmf-cu.rb", "lib/mkmf-cu/opt.rb", "LICENSE", "README.md"]
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require "test/unit"
|
|
2
4
|
require "mkmf-cu/opt"
|
|
3
5
|
require "mkmf-cu"
|
|
@@ -23,7 +25,7 @@ class TestMkmfCuOpt < Test::Unit::TestCase
|
|
|
23
25
|
end
|
|
24
26
|
|
|
25
27
|
def test_compiler_option
|
|
26
|
-
@opt_h.merge!({"-shared"=>[""], "-pipe"=>[""]})
|
|
28
|
+
@opt_h.merge!({"-shared" => [""], "-pipe" => [""]})
|
|
27
29
|
assert_equal(" --compiler-options -pipe", compiler_option(@opt_h))
|
|
28
30
|
end
|
|
29
31
|
|
|
@@ -37,7 +39,7 @@ class TestMkmfCuOpt < Test::Unit::TestCase
|
|
|
37
39
|
end
|
|
38
40
|
|
|
39
41
|
def test_linker_option
|
|
40
|
-
@opt_h.merge!({"-Wl"=>["-a", "-b"]})
|
|
42
|
+
@opt_h.merge!({"-Wl" => ["-a", "-b"]})
|
|
41
43
|
assert_equal(" --linker-options -a --linker-options -b",
|
|
42
44
|
linker_option(@opt_h))
|
|
43
45
|
end
|
|
@@ -50,7 +52,7 @@ class TestMkmfCuOpt < Test::Unit::TestCase
|
|
|
50
52
|
end
|
|
51
53
|
|
|
52
54
|
def test_compiler_bin
|
|
53
|
-
h = Hash.new{|h, k| h[k] = [] }.merge({"-shared"=>[""], "-pipe"=>[""], "--mkmf-cu-ext"=>["c"]})
|
|
55
|
+
h = Hash.new { |h, k| h[k] = [] }.merge({"-shared" => [""], "-pipe" => [""], "--mkmf-cu-ext" => ["c"]})
|
|
54
56
|
assert_equal(" --compiler-bindir " + RbConfig::CONFIG["CC"],
|
|
55
57
|
compiler_bin(h))
|
|
56
58
|
end
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,72 @@
|
|
|
1
|
+
# 0.5.0 (2025/11/01)
|
|
2
|
+
|
|
3
|
+
Fixes:
|
|
4
|
+
|
|
5
|
+
* Remove unnecessary numo-narray dependency
|
|
6
|
+
* Fix Errno::EXDEV for Invalid cross-device link
|
|
7
|
+
* Remove clobber from default task
|
|
8
|
+
* Enable parallel build by default
|
|
9
|
+
* Add magic comment for frozen_string_literal
|
|
10
|
+
* Backport: fix na_flatten_dim(): SEGV when flattening an empty narray view
|
|
11
|
+
* Backport: bug in reshape!: stridx in NArrayView should be reconstructed
|
|
12
|
+
* Backport: mask and masked arrays must have the same shape
|
|
13
|
+
* Backport: fix na_parse_range() to suppress warnings
|
|
14
|
+
* Backport: FIXNUM length is based on LONG, not VALUE
|
|
15
|
+
* Backport: fix bug in NArray#sort: qsort() does not support strided loop
|
|
16
|
+
* Backport: fix na_aref_md_protected(): na2->stridx should be zero-initialized
|
|
17
|
+
* Backport: q[i].idx should be freed when i != ndim-1
|
|
18
|
+
* Backport: fix variable type
|
|
19
|
+
* Backport: add tests for Bit view arrays
|
|
20
|
+
* Backport: fix macro: STORE_BIT STORE_BIT_STEP: requires mask to leave the lowest bit
|
|
21
|
+
* Backport: fix NArray::Bit#any?,all?: empty array should return false
|
|
22
|
+
* Backport: fix NArray::Bit#count_true/false: empty array should return zero
|
|
23
|
+
* Backport: bug in NArray::Bit; fix bit operation in tmpl_bit/{store_bit,unary,binary}.c
|
|
24
|
+
* Fix typo
|
|
25
|
+
* Backport 135: Make all empty arrays equal
|
|
26
|
+
* Backport: minor fixes in na_get_result_dimension(), check_index_count()
|
|
27
|
+
* Backport 116: new method: NArray#fortran_contiguous?
|
|
28
|
+
* Backport 186: Fix NMath.sinc(0)
|
|
29
|
+
* Backport 188: Fix a typo
|
|
30
|
+
* Fix FrozenError
|
|
31
|
+
* Use add_dependency instead of add_runtime_dependency
|
|
32
|
+
* Remove unused variable
|
|
33
|
+
* Remove unused .travis.yml
|
|
34
|
+
* Remove unnecessary require to fix warnings of "loading in progress, circular require considered harmful"
|
|
35
|
+
* Remove unused variable
|
|
36
|
+
* Fix numo-narray library path
|
|
37
|
+
* Add extconf_compile_commands_json as development dependency
|
|
38
|
+
* Add extconf_compile_commands_json for clangd LSP
|
|
39
|
+
* Remove unnecessary loop when assert() is disabled
|
|
40
|
+
* Fix cross-platform negative value conversion for unsigned integer types
|
|
41
|
+
* Revert "Fix cross-platform negative value conversion for unsigned integer types"
|
|
42
|
+
* Remove unnecessary require
|
|
43
|
+
* Add Ractor support
|
|
44
|
+
* Update minimum CUDA version
|
|
45
|
+
* Update minimum ruby supported version
|
|
46
|
+
* Use rake-compiler
|
|
47
|
+
* Use absolute file path
|
|
48
|
+
* Allow converting nil to NaN in Numo::DFloat.cast
|
|
49
|
+
* Fix cross-platform negative value conversion for unsigned integer types
|
|
50
|
+
* Fix old-style function definitions
|
|
51
|
+
* Fix old-style function definition in qsort.c
|
|
52
|
+
* Add required_ruby_version in gemspec
|
|
53
|
+
* Use released version of power_assert gem
|
|
54
|
+
* Fix LoadError
|
|
55
|
+
* Quoted file path
|
|
56
|
+
* Add CUDA compute capability (#151)
|
|
57
|
+
* extconf.rb: Use File::PATH_SEPARATOR
|
|
58
|
+
* Fix build error with cuDNN features
|
|
59
|
+
* Link c++ library
|
|
60
|
+
* Fix link error with "multiple definition of `cumo_cuda_eCudnnError'"
|
|
61
|
+
* Fix failure with Ruby 3.3
|
|
62
|
+
* Fix keyword argument expansion
|
|
63
|
+
* Remove compute_35 because it was removed at CUDA 12
|
|
64
|
+
* Use NVCC_CCBIN env var to detect compiler for cuda code on GCC 15 environment
|
|
65
|
+
* Fix build error with GCC 15
|
|
66
|
+
* Use rb_cObject instead of rb_cData
|
|
67
|
+
* Remove unnecessary dependency
|
|
68
|
+
* at() method was rewritten in C.
|
|
69
|
+
|
|
1
70
|
# 0.4.3 (2019-06-11)
|
|
2
71
|
|
|
3
72
|
Fixes:
|
data/Gemfile
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
source "https://rubygems.org"
|
|
2
4
|
|
|
3
5
|
gemspec
|
|
4
6
|
|
|
7
|
+
gem 'extconf_compile_commands_json'
|
|
8
|
+
gem 'rake-compiler'
|
|
5
9
|
gem 'test-unit'
|
|
6
10
|
gem 'yard'
|
|
7
11
|
gem 'pry-byebug'
|
|
8
|
-
gem 'power_assert'
|
|
12
|
+
gem 'power_assert'
|
|
13
|
+
gem 'rubocop'
|
data/README.md
CHANGED
|
@@ -6,9 +6,9 @@ Cumo (pronounced "koomo") is a CUDA-aware, GPU-optimized numerical library that
|
|
|
6
6
|
|
|
7
7
|
## Requirements
|
|
8
8
|
|
|
9
|
-
* Ruby
|
|
9
|
+
* Ruby 3.0 or later
|
|
10
10
|
* NVIDIA GPU Compute Capability 3.5 (Kepler) or later
|
|
11
|
-
* CUDA
|
|
11
|
+
* CUDA 11.0 or later
|
|
12
12
|
|
|
13
13
|
## Preparation
|
|
14
14
|
|
|
@@ -195,14 +195,6 @@ ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/g++"
|
|
|
195
195
|
ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/nvcc"
|
|
196
196
|
```
|
|
197
197
|
|
|
198
|
-
### Build in parallel
|
|
199
|
-
|
|
200
|
-
Set `MAKEFLAGS` to specify `make` command options. You can build in parallel as:
|
|
201
|
-
|
|
202
|
-
```
|
|
203
|
-
bundle exec env MAKEFLAG=-j8 rake compile
|
|
204
|
-
```
|
|
205
|
-
|
|
206
198
|
### Specify nvcc --generate-code options
|
|
207
199
|
|
|
208
200
|
```
|
data/Rakefile
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require "bundler/gem_tasks"
|
|
2
4
|
require "rake/testtask"
|
|
3
5
|
|
|
@@ -7,22 +9,17 @@ Rake::TestTask.new(:test) do |t|
|
|
|
7
9
|
t.test_files = FileList["test/**/*_test.rb"]
|
|
8
10
|
end
|
|
9
11
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
end
|
|
12
|
+
require "rake/extensiontask"
|
|
13
|
+
Rake::ExtensionTask.new("cumo")
|
|
13
14
|
|
|
14
15
|
task :ctest do
|
|
15
|
-
sh 'cd ext/cumo && ruby extconf.rb && make run-ctest'
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
task :clean do
|
|
19
|
-
sh 'cd ext/cumo && make clean'
|
|
16
|
+
sh 'cd ext/cumo && ruby extconf.rb && make && make build-ctest && make run-ctest'
|
|
20
17
|
end
|
|
21
18
|
|
|
22
19
|
task :docs do
|
|
23
20
|
dir = "ext/cumo"
|
|
24
|
-
srcs = %w[array.c data.c index.c math.c narray.c rand.c struct.c].map{|s| File.join(dir, "narray", s)}
|
|
25
|
-
srcs += %w[cublas.c driver.c nvrtc.c runtime.c memory_pool.cpp].map{|s| File.join(dir, "cuda", s) }
|
|
21
|
+
srcs = %w[array.c data.c index.c math.c narray.c rand.c struct.c].map { |s| File.join(dir, "narray", s) }
|
|
22
|
+
srcs += %w[cublas.c driver.c nvrtc.c runtime.c memory_pool.cpp].map { |s| File.join(dir, "cuda", s) }
|
|
26
23
|
srcs << File.join(dir, "narray", "types/*.c")
|
|
27
24
|
srcs << "lib/cumo/narray/extra.rb"
|
|
28
25
|
sh "cd ext/cumo; ruby extconf.rb; make src"
|
|
@@ -34,7 +31,7 @@ task :gdb do
|
|
|
34
31
|
sh "gdb -x run.gdb --args ruby -I. ./test.rb"
|
|
35
32
|
end
|
|
36
33
|
|
|
37
|
-
task :default => [:
|
|
34
|
+
task :default => [:compile, :test]
|
|
38
35
|
|
|
39
36
|
desc 'Open an irb session preloaded with the gem library'
|
|
40
37
|
task :console do
|
data/bench/broadcast_fp32.rb
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'benchmark'
|
|
2
4
|
require 'cumo/narray'
|
|
3
5
|
|
|
4
6
|
num_iteration = 1000
|
|
5
7
|
|
|
6
8
|
Benchmark.bm 20 do |r|
|
|
7
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
8
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
9
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
10
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
9
11
|
r.report "x.inplace + y" do
|
|
10
12
|
num_iteration.times do
|
|
11
13
|
x.inplace + y
|
|
@@ -13,8 +15,8 @@ Benchmark.bm 20 do |r|
|
|
|
13
15
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
14
16
|
end
|
|
15
17
|
|
|
16
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
17
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
18
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
19
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
18
20
|
r.report "x + y" do
|
|
19
21
|
num_iteration.times do
|
|
20
22
|
(x + y).free
|
|
@@ -22,8 +24,8 @@ Benchmark.bm 20 do |r|
|
|
|
22
24
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
23
25
|
end
|
|
24
26
|
|
|
25
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
26
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
27
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
28
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
27
29
|
r.report "x.inplace + 1.0" do
|
|
28
30
|
num_iteration.times do
|
|
29
31
|
x.inplace + 1.0
|
|
@@ -31,8 +33,8 @@ Benchmark.bm 20 do |r|
|
|
|
31
33
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
32
34
|
end
|
|
33
35
|
|
|
34
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
35
|
-
z = Cumo::SFloat.ones([1000,1])
|
|
36
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
37
|
+
z = Cumo::SFloat.ones([1000, 1])
|
|
36
38
|
r.report "x.inplace + z" do
|
|
37
39
|
num_iteration.times do
|
|
38
40
|
x.inplace + z
|
|
@@ -40,8 +42,8 @@ Benchmark.bm 20 do |r|
|
|
|
40
42
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
41
43
|
end
|
|
42
44
|
|
|
43
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
44
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
45
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
46
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
45
47
|
r.report "x.inplace - y" do
|
|
46
48
|
num_iteration.times do
|
|
47
49
|
x.inplace - y
|
|
@@ -49,8 +51,8 @@ Benchmark.bm 20 do |r|
|
|
|
49
51
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
50
52
|
end
|
|
51
53
|
|
|
52
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
53
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
54
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
55
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
54
56
|
r.report "x.inplace - 1.0" do
|
|
55
57
|
num_iteration.times do
|
|
56
58
|
x.inplace - 1.0
|
|
@@ -58,8 +60,8 @@ Benchmark.bm 20 do |r|
|
|
|
58
60
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
59
61
|
end
|
|
60
62
|
|
|
61
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
62
|
-
z = Cumo::SFloat.ones([1000,1])
|
|
63
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
64
|
+
z = Cumo::SFloat.ones([1000, 1])
|
|
63
65
|
r.report "x.inplace - z" do
|
|
64
66
|
num_iteration.times do
|
|
65
67
|
x.inplace - z
|
|
@@ -67,8 +69,8 @@ Benchmark.bm 20 do |r|
|
|
|
67
69
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
68
70
|
end
|
|
69
71
|
|
|
70
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
71
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
72
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
73
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
72
74
|
r.report "x.inplace * y" do
|
|
73
75
|
num_iteration.times do
|
|
74
76
|
x.inplace * y
|
|
@@ -76,8 +78,8 @@ Benchmark.bm 20 do |r|
|
|
|
76
78
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
77
79
|
end
|
|
78
80
|
|
|
79
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
80
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
81
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
82
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
81
83
|
r.report "x.inplace * 1.0" do
|
|
82
84
|
num_iteration.times do
|
|
83
85
|
x.inplace * 1.0
|
|
@@ -85,8 +87,8 @@ Benchmark.bm 20 do |r|
|
|
|
85
87
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
86
88
|
end
|
|
87
89
|
|
|
88
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
89
|
-
z = Cumo::SFloat.ones([1000,1])
|
|
90
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
91
|
+
z = Cumo::SFloat.ones([1000, 1])
|
|
90
92
|
r.report "x.inplace * z" do
|
|
91
93
|
num_iteration.times do
|
|
92
94
|
x.inplace * z
|
|
@@ -94,8 +96,8 @@ Benchmark.bm 20 do |r|
|
|
|
94
96
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
95
97
|
end
|
|
96
98
|
|
|
97
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
98
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
99
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
100
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
99
101
|
r.report "x.inplace / y" do
|
|
100
102
|
num_iteration.times do
|
|
101
103
|
x.inplace / y
|
|
@@ -103,8 +105,8 @@ Benchmark.bm 20 do |r|
|
|
|
103
105
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
104
106
|
end
|
|
105
107
|
|
|
106
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
107
|
-
y = Cumo::SFloat.ones([1000,784])
|
|
108
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
109
|
+
y = Cumo::SFloat.ones([1000, 784])
|
|
108
110
|
r.report "x.inplace / 1.0" do
|
|
109
111
|
num_iteration.times do
|
|
110
112
|
x.inplace / 1.0
|
|
@@ -112,8 +114,8 @@ Benchmark.bm 20 do |r|
|
|
|
112
114
|
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
113
115
|
end
|
|
114
116
|
|
|
115
|
-
x = Cumo::SFloat.ones([1000,784])
|
|
116
|
-
z = Cumo::SFloat.ones([1000,1])
|
|
117
|
+
x = Cumo::SFloat.ones([1000, 784])
|
|
118
|
+
z = Cumo::SFloat.ones([1000, 1])
|
|
117
119
|
r.report "x.inplace / z" do
|
|
118
120
|
num_iteration.times do
|
|
119
121
|
x.inplace / z
|