cumo 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/3rd_party/LICENSE.txt +60 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
- data/LICENSE.txt +1 -62
- data/README.md +33 -29
- data/bench/cumo_bench.rb +47 -25
- data/bench/numo_bench.rb +27 -25
- data/docs/src-tree.md +16 -0
- data/ext/cumo/cuda/cublas.c +69 -219
- data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
- data/ext/cumo/cuda/runtime.c +2 -14
- data/ext/cumo/cumo.c +16 -16
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
- data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
- data/ext/cumo/include/cumo/indexer.h +46 -63
- data/ext/cumo/include/cumo/intern.h +58 -112
- data/ext/cumo/include/cumo/narray.h +214 -185
- data/ext/cumo/include/cumo/narray_kernel.h +66 -37
- data/ext/cumo/include/cumo/ndloop.h +42 -42
- data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
- data/ext/cumo/include/cumo/template.h +56 -51
- data/ext/cumo/include/cumo/template_kernel.h +31 -31
- data/ext/cumo/include/cumo/types/bit.h +3 -3
- data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
- data/ext/cumo/include/cumo/types/complex.h +126 -126
- data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
- data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
- data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
- data/ext/cumo/include/cumo/types/int_macro.h +1 -1
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
- data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
- data/ext/cumo/include/cumo/types/scomplex.h +5 -5
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
- data/ext/cumo/narray/array.c +143 -143
- data/ext/cumo/narray/data.c +184 -184
- data/ext/cumo/narray/gen/cogen.rb +5 -2
- data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
- data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
- data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
- data/ext/cumo/narray/gen/erbln.rb +132 -0
- data/ext/cumo/narray/gen/erbpp2.rb +18 -13
- data/ext/cumo/narray/gen/narray_def.rb +3 -3
- data/ext/cumo/narray/gen/spec.rb +2 -2
- data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
- data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
- data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
- data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
- data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
- data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
- data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
- data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
- data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
- data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
- data/ext/cumo/narray/gen/tmpl/each.c +9 -9
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
- data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
- data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
- data/ext/cumo/narray/gen/tmpl/format.c +11 -11
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
- data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
- data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
- data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
- data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
- data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
- data/ext/cumo/narray/gen/tmpl/median.c +10 -10
- data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
- data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
- data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
- data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
- data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
- data/ext/cumo/narray/gen/tmpl/store.c +6 -6
- data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
- data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
- data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
- data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
- data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
- data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
- data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
- data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
- data/ext/cumo/narray/index.c +213 -213
- data/ext/cumo/narray/math.c +27 -27
- data/ext/cumo/narray/narray.c +484 -484
- data/ext/cumo/narray/ndloop.c +259 -258
- data/ext/cumo/narray/rand.c +3 -3
- data/ext/cumo/narray/step.c +70 -70
- data/ext/cumo/narray/struct.c +139 -139
- metadata +6 -7
- data/ext/cumo/include/cumo/intern_fwd.h +0 -38
- data/lib/erbpp.rb +0 -294
- data/lib/erbpp/line_number.rb +0 -137
- data/lib/erbpp/narray_def.rb +0 -381
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 621f319a4a92862c60267ace14b39968fb7e5c203da94bb9d1f01b86412b77b3
|
4
|
+
data.tar.gz: db4964d257fcbe79c6a6f4b8e84ac71f7657c87168d43b7c205d9de5059b02e3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a703d9f146c556af7ef60c869dd44989fdd67d8c2b45f99d337b1cc5dd606a1eace434b46d0b6a7e4a404930351565f2ca3e93f116886a5b915f285a8b1b85d0
|
7
|
+
data.tar.gz: 9fc0c1c794aeb12f9f5a3cf7866bec32439b09083c3b34869301ac5dc3c602845ec3c51e7caa7bf5022d375e448db5c4c6d3f16f77660645b07fdd5dc2bf29c1
|
data/.gitignore
CHANGED
@@ -0,0 +1,60 @@
|
|
1
|
+
######################################################################
|
2
|
+
# The Cumo is a fork of Numo NArray v0.9.0.9.
|
3
|
+
# Cumo's source code and documents contain the original Ruby/Numo ones.
|
4
|
+
######################################################################
|
5
|
+
BSD 3-Clause License
|
6
|
+
|
7
|
+
Copyright (c) 1999-2017, Masahiro TANAKA
|
8
|
+
All rights reserved.
|
9
|
+
|
10
|
+
Redistribution and use in source and binary forms, with or without
|
11
|
+
modification, are permitted provided that the following conditions are met:
|
12
|
+
|
13
|
+
* Redistributions of source code must retain the above copyright notice, this
|
14
|
+
list of conditions and the following disclaimer.
|
15
|
+
|
16
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
17
|
+
this list of conditions and the following disclaimer in the documentation
|
18
|
+
and/or other materials provided with the distribution.
|
19
|
+
|
20
|
+
* Neither the name of the copyright holder nor the names of its
|
21
|
+
contributors may be used to endorse or promote products derived from
|
22
|
+
this software without specific prior written permission.
|
23
|
+
|
24
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
25
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
26
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
27
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
28
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
29
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
30
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
31
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
32
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
33
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
34
|
+
|
35
|
+
######################################################################
|
36
|
+
# The Cumo refers some implementations of CuPy such as memory pool.
|
37
|
+
# Cumo's source code and documents contain the original CuPy ones.
|
38
|
+
######################################################################
|
39
|
+
MIT License
|
40
|
+
|
41
|
+
Copyright (c) 2015 Preferred Infrastructure, Inc.
|
42
|
+
Copyright (c) 2015 Preferred Networks, Inc.
|
43
|
+
|
44
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
45
|
+
of this software and associated documentation files (the "Software"), to deal
|
46
|
+
in the Software without restriction, including without limitation the rights
|
47
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
48
|
+
copies of the Software, and to permit persons to whom the Software is
|
49
|
+
furnished to do so, subject to the following conditions:
|
50
|
+
|
51
|
+
The above copyright notice and this permission notice shall be included in
|
52
|
+
all copies or substantial portions of the Software.
|
53
|
+
|
54
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
55
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
56
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
57
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
58
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
59
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
60
|
+
THE SOFTWARE.
|
@@ -32,7 +32,19 @@ module MakeMakefileCuda
|
|
32
32
|
# TODO(sonots): Make it possible to configure "nvcc" and additional arguments
|
33
33
|
def nvcc_command
|
34
34
|
s = MakeMakefileCuda::Nvcc.generate(argv)
|
35
|
-
|
35
|
+
cmd = "nvcc " << s
|
36
|
+
if ENV['CUMO_NVCC_GENERATE_CODE']
|
37
|
+
cmd << " --generate-code=#{ENV['CUMO_NVCC_GENERATE_CODE']}"
|
38
|
+
elsif ENV['DEBUG']
|
39
|
+
cmd << " -arch=sm_35"
|
40
|
+
else
|
41
|
+
cmd << " --generate-code=arch=compute_35,code=sm_35"
|
42
|
+
cmd << " --generate-code=arch=compute_50,code=sm_50"
|
43
|
+
cmd << " --generate-code=arch=compute_60,code=sm_60"
|
44
|
+
cmd << " --generate-code=arch=compute_70,code=sm_70"
|
45
|
+
cmd << " --generate-code=arch=compute_70,code=compute_70"
|
46
|
+
end
|
47
|
+
cmd
|
36
48
|
end
|
37
49
|
|
38
50
|
def c_command
|
data/LICENSE.txt
CHANGED
@@ -1,67 +1,6 @@
|
|
1
|
-
BSD 3-Clause License
|
2
|
-
|
3
|
-
Copyright (c) 2017 Naotoshi Seo
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
7
|
-
in the Software without restriction, including without limitation the rights
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
10
|
-
furnished to do so, subject to the following conditions:
|
11
|
-
|
12
|
-
The above copyright notice and this permission notice shall be included in
|
13
|
-
all copies or substantial portions of the Software.
|
14
|
-
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
-
THE SOFTWARE.
|
22
|
-
|
23
|
-
######################################################################
|
24
|
-
# The Cumo is a fork of Numo NArray v0.9.0.9.
|
25
|
-
# Cumo's source code and documents contain the original Ruby/Numo ones.
|
26
|
-
######################################################################
|
27
|
-
BSD 3-Clause License
|
28
|
-
|
29
|
-
Copyright (c) 1999-2017, Masahiro TANAKA
|
30
|
-
All rights reserved.
|
31
|
-
|
32
|
-
Redistribution and use in source and binary forms, with or without
|
33
|
-
modification, are permitted provided that the following conditions are met:
|
34
|
-
|
35
|
-
* Redistributions of source code must retain the above copyright notice, this
|
36
|
-
list of conditions and the following disclaimer.
|
37
|
-
|
38
|
-
* Redistributions in binary form must reproduce the above copyright notice,
|
39
|
-
this list of conditions and the following disclaimer in the documentation
|
40
|
-
and/or other materials provided with the distribution.
|
41
|
-
|
42
|
-
* Neither the name of the copyright holder nor the names of its
|
43
|
-
contributors may be used to endorse or promote products derived from
|
44
|
-
this software without specific prior written permission.
|
45
|
-
|
46
|
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
47
|
-
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
48
|
-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
49
|
-
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
50
|
-
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
51
|
-
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
52
|
-
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
53
|
-
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
54
|
-
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
55
|
-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
56
|
-
|
57
|
-
######################################################################
|
58
|
-
# The Cumo refers some implementations of CuPy such as memory pool.
|
59
|
-
# Cumo's source code and documents contain the original CuPy ones.
|
60
|
-
######################################################################
|
61
1
|
MIT License
|
62
2
|
|
63
|
-
Copyright (c)
|
64
|
-
Copyright (c) 2015 Preferred Networks, Inc.
|
3
|
+
Copyright (c) 2017 Naotoshi Seo
|
65
4
|
|
66
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
67
6
|
of this software and associated documentation files (the "Software"), to deal
|
data/README.md
CHANGED
@@ -5,10 +5,11 @@
|
|
5
5
|
Cumo (pronounced like "koomo") is a CUDA-aware numerical library whose interface is highly compatible with [Ruby Numo](https://github.com/ruby-numo).
|
6
6
|
This library provides the benefit of GPU speedup by replacing Numo with Cumo, which requires only small changes to your code.
|
7
7
|
|
8
|
+
<img src="https://raw.githubusercontent.com/sonots/cumo-logo/master/logo_transparent.png" alt="cumo logo" title="cumo logo" width="50%">
|
8
9
|
|
9
10
|
## Requirements
|
10
11
|
|
11
|
-
* Ruby 2.
|
12
|
+
* Ruby 2.5 or later
|
12
13
|
* NVIDIA GPU Compute Capability 6.0 (Pascal) or later
|
13
14
|
* CUDA 9.0 or later
|
14
15
|
|
@@ -169,23 +170,6 @@ Generate docs:
|
|
169
170
|
bundle exec rake docs
|
170
171
|
```
|
171
172
|
|
172
|
-
## Source code organizations
|
173
|
-
|
174
|
-
* `*_kernel.{h,cuh,cu}` files are for device (CUDA kernels).
|
175
|
-
* .cu files are compiled via nvcc.
|
176
|
-
* .cu files define C wrapper functions to launch CUDA kernels to enable to be called from .c files.
|
177
|
-
* Technically, it is not possible to use CRuby API such as `VALUE` in .cu files.
|
178
|
-
* CRuby API is not callable from CUDA kernel because they do not have `__device__` modifier.
|
179
|
-
* nvcc does not support `#include RUBY_EXTCONF_H`, so can not include `ruby.h`.
|
180
|
-
* (RULE) It is allowed to use C++14 codes in .cu files.
|
181
|
-
* Rest of `*.{h,c}` files are for host (CPU).
|
182
|
-
* Call C wrapper functions defined in .cu files.
|
183
|
-
* It can use CRuby API.
|
184
|
-
* (RULE) It is not allowed to use C++ codes in host files.
|
185
|
-
|
186
|
-
Ruby's `mkmf` (or `extconf.rb`) does not support to specify 3rd compiler such as NVCC for another files of extensions `.cu`.
|
187
|
-
Therefore, cumo specify a wrapper command `bin/mkmf-cu-nvcc` as a compiler and changes its behavor depending on extensions of files to compile.
|
188
|
-
|
189
173
|
## Advanced Tips on Development
|
190
174
|
|
191
175
|
### ccache
|
@@ -201,25 +185,21 @@ ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/g++"
|
|
201
185
|
ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/nvcc"
|
202
186
|
```
|
203
187
|
|
204
|
-
|
188
|
+
### Build in parallel
|
205
189
|
|
206
|
-
|
190
|
+
Use the `MAKEFLAGS` environment variable to specify `make` command options. For example, you can build in parallel as follows:
|
207
191
|
|
208
192
|
```
|
209
|
-
bundle exec
|
193
|
+
bundle exec env MAKEFLAGS=-j8 rake compile
|
210
194
|
```
|
211
195
|
|
212
|
-
###
|
213
|
-
|
214
|
-
`DTYPE` environment variable is available as:
|
196
|
+
### Specify nvcc --generate-code options
|
215
197
|
|
216
198
|
```
|
217
|
-
bundle exec
|
199
|
+
bundle exec env CUMO_NVCC_GENERATE_CODE=arch=compute_60,code=sm_60 rake compile
|
218
200
|
```
|
219
201
|
|
220
|
-
|
221
|
-
bundle exec DTYPE=dfloat ruby test/narray_test.rb
|
222
|
-
```
|
202
|
+
This is useful even during development because it makes it possible to skip the JIT compilation of PTX to cubin that otherwise occurs at runtime.
|
223
203
|
|
224
204
|
### Run tests with gdb
|
225
205
|
|
@@ -237,6 +217,25 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb
|
|
237
217
|
|
238
218
|
You may put a breakpoint by calling `cumo_debug_breakpoint()` at C source codes.
|
239
219
|
|
220
|
+
### Run tests only for a specific line
|
221
|
+
`--location` option is available as:
|
222
|
+
|
223
|
+
```
|
224
|
+
bundle exec ruby test/narray_test.rb --location 121
|
225
|
+
```
|
226
|
+
|
227
|
+
### Compile and run tests only for a specific type
|
228
|
+
|
229
|
+
`DTYPE` environment variable is available as:
|
230
|
+
|
231
|
+
```
|
232
|
+
bundle exec env DTYPE=dfloat rake compile
|
233
|
+
```
|
234
|
+
|
235
|
+
```
|
236
|
+
bundle exec env DTYPE=dfloat ruby test/narray_test.rb
|
237
|
+
```
|
238
|
+
|
240
239
|
### Run program always synchronizing CPU and GPU
|
241
240
|
|
242
241
|
```
|
@@ -249,4 +248,9 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/sonots
|
|
249
248
|
|
250
249
|
## License
|
251
250
|
|
252
|
-
[LICENSE.txt](./LICENSE.txt)
|
251
|
+
* [LICENSE.txt](./LICENSE.txt)
|
252
|
+
* [3rd_party/LICENSE.txt](./3rd_party/LICENSE.txt)
|
253
|
+
|
254
|
+
## Related Materials
|
255
|
+
|
256
|
+
* [Fast Numerical Computing and Deep Learning in Ruby with Cumo](https://speakerdeck.com/sonots/fast-numerical-computing-and-deep-learning-in-ruby-with-cumo) - Presentation Slide at [RubyKaigi 2018](https://rubykaigi.org/2018/presentations/sonots.html#may31)
|
data/bench/cumo_bench.rb
CHANGED
@@ -8,12 +8,14 @@ b = Cumo::Float32.new(10).seq(10,10)
|
|
8
8
|
c = a + b
|
9
9
|
c.free
|
10
10
|
|
11
|
-
def elementwise
|
12
|
-
|
11
|
+
def elementwise(num = nil)
|
12
|
+
num ||= NUM
|
13
|
+
puts "elementwise(#{num})"
|
13
14
|
Benchmark.bm do |r|
|
14
15
|
a = Cumo::Float32.new(10000).seq(1)
|
15
16
|
b = Cumo::Float32.new(10000).seq(10,10)
|
16
17
|
(a + b).free # warm up
|
18
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
17
19
|
r.report('10**4') do
|
18
20
|
NUM.times do
|
19
21
|
(a + b).free
|
@@ -24,6 +26,7 @@ def elementwise
|
|
24
26
|
a = Cumo::Float32.new(100000).seq(1)
|
25
27
|
b = Cumo::Float32.new(100000).seq(10,10)
|
26
28
|
(a + b).free # warm up
|
29
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
27
30
|
r.report('10**5') do
|
28
31
|
NUM.times do
|
29
32
|
(a + b).free
|
@@ -34,6 +37,7 @@ def elementwise
|
|
34
37
|
a = Cumo::Float32.new(1000000).seq(1)
|
35
38
|
b = Cumo::Float32.new(1000000).seq(10,10)
|
36
39
|
(a + b).free # warm up
|
40
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
37
41
|
r.report('10**6') do
|
38
42
|
NUM.times do
|
39
43
|
(a + b).free
|
@@ -44,6 +48,7 @@ def elementwise
|
|
44
48
|
a = Cumo::Float32.new(10000000).seq(1)
|
45
49
|
b = Cumo::Float32.new(10000000).seq(10,10)
|
46
50
|
(a + b).free # warm up
|
51
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
47
52
|
r.report('10**7') do
|
48
53
|
NUM.times do
|
49
54
|
(a + b).free
|
@@ -54,6 +59,7 @@ def elementwise
|
|
54
59
|
a = Cumo::Float32.new(100000000).seq(1)
|
55
60
|
b = Cumo::Float32.new(100000000).seq(10,10)
|
56
61
|
(a + b).free # warm up
|
62
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
57
63
|
r.report('10**8') do
|
58
64
|
NUM.times do
|
59
65
|
(a + b).free
|
@@ -63,10 +69,13 @@ def elementwise
|
|
63
69
|
end
|
64
70
|
end
|
65
71
|
|
66
|
-
def reduction
|
67
|
-
|
72
|
+
def reduction(num = nil)
|
73
|
+
num ||= NUM
|
74
|
+
puts "reduction(#{num})"
|
68
75
|
Benchmark.bm do |r|
|
69
76
|
a = Cumo::Float32.new(10000).seq(1)
|
77
|
+
(a.sum).free # warm up
|
78
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
70
79
|
r.report('10**4') do
|
71
80
|
NUM.times do
|
72
81
|
(a.sum).free
|
@@ -75,6 +84,8 @@ def reduction
|
|
75
84
|
end
|
76
85
|
|
77
86
|
a = Cumo::Float32.new(100000).seq(1)
|
87
|
+
(a.sum).free # warm up
|
88
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
78
89
|
r.report('10**5') do
|
79
90
|
NUM.times do
|
80
91
|
(a.sum).free
|
@@ -83,6 +94,8 @@ def reduction
|
|
83
94
|
end
|
84
95
|
|
85
96
|
a = Cumo::Float32.new(1000000).seq(1)
|
97
|
+
(a.sum).free # warm up
|
98
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
86
99
|
r.report('10**6') do
|
87
100
|
NUM.times do
|
88
101
|
(a.sum).free
|
@@ -91,6 +104,8 @@ def reduction
|
|
91
104
|
end
|
92
105
|
|
93
106
|
a = Cumo::Float32.new(10000000).seq(1)
|
107
|
+
(a.sum).free # warm up
|
108
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
94
109
|
r.report('10**7') do
|
95
110
|
NUM.times do
|
96
111
|
(a.sum).free
|
@@ -99,6 +114,8 @@ def reduction
|
|
99
114
|
end
|
100
115
|
|
101
116
|
a = Cumo::Float32.new(100000000).seq(1)
|
117
|
+
(a.sum).free # warm up
|
118
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
102
119
|
r.report('10**8') do
|
103
120
|
NUM.times do
|
104
121
|
(a.sum).free
|
@@ -108,13 +125,14 @@ def reduction
|
|
108
125
|
end
|
109
126
|
end
|
110
127
|
|
111
|
-
def dot
|
112
|
-
num
|
113
|
-
puts
|
128
|
+
def dot(num = nil)
|
129
|
+
num ||= 1
|
130
|
+
puts "dot(#{num})"
|
114
131
|
Benchmark.bm do |r|
|
115
132
|
a = Cumo::Float32.new(100,100).seq(1)
|
116
133
|
b = Cumo::Float32.new(100,100).seq(10,10)
|
117
134
|
a.dot(b).free # warm up
|
135
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
118
136
|
r.report('10**4') do
|
119
137
|
num.times do
|
120
138
|
a.dot(b).free
|
@@ -125,6 +143,7 @@ def dot
|
|
125
143
|
a = Cumo::Float32.new(100,1000).seq(1)
|
126
144
|
b = Cumo::Float32.new(1000,100).seq(10,10)
|
127
145
|
a.dot(b).free # warm up
|
146
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
128
147
|
r.report('10**5') do
|
129
148
|
num.times do
|
130
149
|
a.dot(b).free
|
@@ -135,6 +154,7 @@ def dot
|
|
135
154
|
a = Cumo::Float32.new(100,10000).seq(1)
|
136
155
|
b = Cumo::Float32.new(10000,100).seq(10,10)
|
137
156
|
a.dot(b).free # warm up
|
157
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
138
158
|
r.report('10**6') do
|
139
159
|
num.times do
|
140
160
|
a.dot(b).free
|
@@ -145,6 +165,7 @@ def dot
|
|
145
165
|
a = Cumo::Float32.new(100,100000).seq(1)
|
146
166
|
b = Cumo::Float32.new(100000,100).seq(10,10)
|
147
167
|
a.dot(b).free # warm up
|
168
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
148
169
|
r.report('10**7') do
|
149
170
|
num.times do
|
150
171
|
a.dot(b).free
|
@@ -155,6 +176,7 @@ def dot
|
|
155
176
|
a = Cumo::Float32.new(100,1000000).seq(1)
|
156
177
|
b = Cumo::Float32.new(1000000,100).seq(10,10)
|
157
178
|
a.dot(b).free # warm up
|
179
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
158
180
|
r.report('10**8') do
|
159
181
|
num.times do
|
160
182
|
a.dot(b).free
|
@@ -170,24 +192,24 @@ dot
|
|
170
192
|
|
171
193
|
# Tesla V100-SXM2...
|
172
194
|
#
|
173
|
-
# element-wise
|
195
|
+
# element-wise(100)
|
174
196
|
# user system total real
|
175
|
-
# 10**4 0.000000 0.000000 0.000000 ( 0.
|
176
|
-
# 10**5 0.
|
177
|
-
# 10**6 0.
|
178
|
-
# 10**7 0.
|
179
|
-
# 10**8 0.
|
180
|
-
# reduction
|
197
|
+
# 10**4 0.000000 0.000000 0.000000 ( 0.006332)
|
198
|
+
# 10**5 0.000000 0.000000 0.000000 ( 0.006280)
|
199
|
+
# 10**6 0.010000 0.000000 0.010000 ( 0.008123)
|
200
|
+
# 10**7 0.000000 0.010000 0.010000 ( 0.022176)
|
201
|
+
# 10**8 0.100000 0.050000 0.150000 ( 0.151999)
|
202
|
+
# reduction(100)
|
181
203
|
# user system total real
|
182
|
-
# 10**4 0.010000 0.000000 0.010000 ( 0.
|
183
|
-
# 10**5 0.
|
184
|
-
# 10**6 0.
|
185
|
-
# 10**7 1.
|
186
|
-
# 10**8 11.
|
187
|
-
# dot
|
204
|
+
# 10**4 0.010000 0.000000 0.010000 ( 0.009735)
|
205
|
+
# 10**5 0.010000 0.010000 0.020000 ( 0.022882)
|
206
|
+
# 10**6 0.110000 0.050000 0.160000 ( 0.154641)
|
207
|
+
# 10**7 1.220000 0.590000 1.810000 ( 1.805643)
|
208
|
+
# 10**8 11.840000 6.110000 17.950000 ( 17.946511)
|
209
|
+
# dot(1)
|
188
210
|
# user system total real
|
189
|
-
# 10**4 0.000000 0.000000 0.000000 ( 0.
|
190
|
-
# 10**5 0.000000 0.000000 0.000000 ( 0.
|
191
|
-
# 10**6 0.000000 0.000000 0.000000 ( 0.
|
192
|
-
# 10**7 0.
|
193
|
-
# 10**8 0.
|
211
|
+
# 10**4 0.000000 0.000000 0.000000 ( 0.000206)
|
212
|
+
# 10**5 0.000000 0.000000 0.000000 ( 0.000195)
|
213
|
+
# 10**6 0.000000 0.000000 0.000000 ( 0.000239)
|
214
|
+
# 10**7 0.000000 0.000000 0.000000 ( 0.000719)
|
215
|
+
# 10**8 0.010000 0.000000 0.010000 ( 0.004636)
|