cumo 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/3rd_party/LICENSE.txt +60 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
- data/LICENSE.txt +1 -62
- data/README.md +33 -29
- data/bench/cumo_bench.rb +47 -25
- data/bench/numo_bench.rb +27 -25
- data/docs/src-tree.md +16 -0
- data/ext/cumo/cuda/cublas.c +69 -219
- data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
- data/ext/cumo/cuda/runtime.c +2 -14
- data/ext/cumo/cumo.c +16 -16
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
- data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
- data/ext/cumo/include/cumo/indexer.h +46 -63
- data/ext/cumo/include/cumo/intern.h +58 -112
- data/ext/cumo/include/cumo/narray.h +214 -185
- data/ext/cumo/include/cumo/narray_kernel.h +66 -37
- data/ext/cumo/include/cumo/ndloop.h +42 -42
- data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
- data/ext/cumo/include/cumo/template.h +56 -51
- data/ext/cumo/include/cumo/template_kernel.h +31 -31
- data/ext/cumo/include/cumo/types/bit.h +3 -3
- data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
- data/ext/cumo/include/cumo/types/complex.h +126 -126
- data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
- data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
- data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
- data/ext/cumo/include/cumo/types/int_macro.h +1 -1
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
- data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
- data/ext/cumo/include/cumo/types/scomplex.h +5 -5
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
- data/ext/cumo/narray/array.c +143 -143
- data/ext/cumo/narray/data.c +184 -184
- data/ext/cumo/narray/gen/cogen.rb +5 -2
- data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
- data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
- data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
- data/ext/cumo/narray/gen/erbln.rb +132 -0
- data/ext/cumo/narray/gen/erbpp2.rb +18 -13
- data/ext/cumo/narray/gen/narray_def.rb +3 -3
- data/ext/cumo/narray/gen/spec.rb +2 -2
- data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
- data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
- data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
- data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
- data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
- data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
- data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
- data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
- data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
- data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
- data/ext/cumo/narray/gen/tmpl/each.c +9 -9
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
- data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
- data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
- data/ext/cumo/narray/gen/tmpl/format.c +11 -11
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
- data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
- data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
- data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
- data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
- data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
- data/ext/cumo/narray/gen/tmpl/median.c +10 -10
- data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
- data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
- data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
- data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
- data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
- data/ext/cumo/narray/gen/tmpl/store.c +6 -6
- data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
- data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
- data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
- data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
- data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
- data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
- data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
- data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
- data/ext/cumo/narray/index.c +213 -213
- data/ext/cumo/narray/math.c +27 -27
- data/ext/cumo/narray/narray.c +484 -484
- data/ext/cumo/narray/ndloop.c +259 -258
- data/ext/cumo/narray/rand.c +3 -3
- data/ext/cumo/narray/step.c +70 -70
- data/ext/cumo/narray/struct.c +139 -139
- metadata +6 -7
- data/ext/cumo/include/cumo/intern_fwd.h +0 -38
- data/lib/erbpp.rb +0 -294
- data/lib/erbpp/line_number.rb +0 -137
- data/lib/erbpp/narray_def.rb +0 -381
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 621f319a4a92862c60267ace14b39968fb7e5c203da94bb9d1f01b86412b77b3
|
|
4
|
+
data.tar.gz: db4964d257fcbe79c6a6f4b8e84ac71f7657c87168d43b7c205d9de5059b02e3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a703d9f146c556af7ef60c869dd44989fdd67d8c2b45f99d337b1cc5dd606a1eace434b46d0b6a7e4a404930351565f2ca3e93f116886a5b915f285a8b1b85d0
|
|
7
|
+
data.tar.gz: 9fc0c1c794aeb12f9f5a3cf7866bec32439b09083c3b34869301ac5dc3c602845ec3c51e7caa7bf5022d375e448db5c4c6d3f16f77660645b07fdd5dc2bf29c1
|
data/.gitignore
CHANGED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
######################################################################
|
|
2
|
+
# The Cumo is a fork of Numo NArray v0.9.0.9.
|
|
3
|
+
# Cumo's source code and documents contain the original Ruby/Numo ones.
|
|
4
|
+
######################################################################
|
|
5
|
+
BSD 3-Clause License
|
|
6
|
+
|
|
7
|
+
Copyright (c) 1999-2017, Masahiro TANAKA
|
|
8
|
+
All rights reserved.
|
|
9
|
+
|
|
10
|
+
Redistribution and use in source and binary forms, with or without
|
|
11
|
+
modification, are permitted provided that the following conditions are met:
|
|
12
|
+
|
|
13
|
+
* Redistributions of source code must retain the above copyright notice, this
|
|
14
|
+
list of conditions and the following disclaimer.
|
|
15
|
+
|
|
16
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
|
17
|
+
this list of conditions and the following disclaimer in the documentation
|
|
18
|
+
and/or other materials provided with the distribution.
|
|
19
|
+
|
|
20
|
+
* Neither the name of the copyright holder nor the names of its
|
|
21
|
+
contributors may be used to endorse or promote products derived from
|
|
22
|
+
this software without specific prior written permission.
|
|
23
|
+
|
|
24
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
25
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
26
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
27
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
28
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
29
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
30
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
31
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
32
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
33
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
34
|
+
|
|
35
|
+
######################################################################
|
|
36
|
+
# The Cumo refers some implementations of CuPy such as memory pool.
|
|
37
|
+
# Cumo's source code and documents contain the original CuPy ones.
|
|
38
|
+
######################################################################
|
|
39
|
+
MIT License
|
|
40
|
+
|
|
41
|
+
Copyright (c) 2015 Preferred Infrastructure, Inc.
|
|
42
|
+
Copyright (c) 2015 Preferred Networks, Inc.
|
|
43
|
+
|
|
44
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
45
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
46
|
+
in the Software without restriction, including without limitation the rights
|
|
47
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
48
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
49
|
+
furnished to do so, subject to the following conditions:
|
|
50
|
+
|
|
51
|
+
The above copyright notice and this permission notice shall be included in
|
|
52
|
+
all copies or substantial portions of the Software.
|
|
53
|
+
|
|
54
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
55
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
56
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
57
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
58
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
59
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
60
|
+
THE SOFTWARE.
|
|
@@ -32,7 +32,19 @@ module MakeMakefileCuda
|
|
|
32
32
|
# TODO(sonots): Make it possible to configure "nvcc" and additional arguments
|
|
33
33
|
def nvcc_command
|
|
34
34
|
s = MakeMakefileCuda::Nvcc.generate(argv)
|
|
35
|
-
|
|
35
|
+
cmd = "nvcc " << s
|
|
36
|
+
if ENV['CUMO_NVCC_GENERATE_CODE']
|
|
37
|
+
cmd << " --generate-code=#{ENV['CUMO_NVCC_GENERATE_CODE']}"
|
|
38
|
+
elsif ENV['DEBUG']
|
|
39
|
+
cmd << " -arch=sm_35"
|
|
40
|
+
else
|
|
41
|
+
cmd << " --generate-code=arch=compute_35,code=sm_35"
|
|
42
|
+
cmd << " --generate-code=arch=compute_50,code=sm_50"
|
|
43
|
+
cmd << " --generate-code=arch=compute_60,code=sm_60"
|
|
44
|
+
cmd << " --generate-code=arch=compute_70,code=sm_70"
|
|
45
|
+
cmd << " --generate-code=arch=compute_70,code=compute_70"
|
|
46
|
+
end
|
|
47
|
+
cmd
|
|
36
48
|
end
|
|
37
49
|
|
|
38
50
|
def c_command
|
data/LICENSE.txt
CHANGED
|
@@ -1,67 +1,6 @@
|
|
|
1
|
-
BSD 3-Clause License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2017 Naotoshi Seo
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in
|
|
13
|
-
all copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
-
THE SOFTWARE.
|
|
22
|
-
|
|
23
|
-
######################################################################
|
|
24
|
-
# The Cumo is a fork of Numo NArray v0.9.0.9.
|
|
25
|
-
# Cumo's source code and documents contain the original Ruby/Numo ones.
|
|
26
|
-
######################################################################
|
|
27
|
-
BSD 3-Clause License
|
|
28
|
-
|
|
29
|
-
Copyright (c) 1999-2017, Masahiro TANAKA
|
|
30
|
-
All rights reserved.
|
|
31
|
-
|
|
32
|
-
Redistribution and use in source and binary forms, with or without
|
|
33
|
-
modification, are permitted provided that the following conditions are met:
|
|
34
|
-
|
|
35
|
-
* Redistributions of source code must retain the above copyright notice, this
|
|
36
|
-
list of conditions and the following disclaimer.
|
|
37
|
-
|
|
38
|
-
* Redistributions in binary form must reproduce the above copyright notice,
|
|
39
|
-
this list of conditions and the following disclaimer in the documentation
|
|
40
|
-
and/or other materials provided with the distribution.
|
|
41
|
-
|
|
42
|
-
* Neither the name of the copyright holder nor the names of its
|
|
43
|
-
contributors may be used to endorse or promote products derived from
|
|
44
|
-
this software without specific prior written permission.
|
|
45
|
-
|
|
46
|
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
47
|
-
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
48
|
-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
49
|
-
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
50
|
-
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
51
|
-
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
52
|
-
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
53
|
-
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
54
|
-
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
55
|
-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
56
|
-
|
|
57
|
-
######################################################################
|
|
58
|
-
# The Cumo refers some implementations of CuPy such as memory pool.
|
|
59
|
-
# Cumo's source code and documents contain the original CuPy ones.
|
|
60
|
-
######################################################################
|
|
61
1
|
MIT License
|
|
62
2
|
|
|
63
|
-
Copyright (c)
|
|
64
|
-
Copyright (c) 2015 Preferred Networks, Inc.
|
|
3
|
+
Copyright (c) 2017 Naotoshi Seo
|
|
65
4
|
|
|
66
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
67
6
|
of this software and associated documentation files (the "Software"), to deal
|
data/README.md
CHANGED
|
@@ -5,10 +5,11 @@
|
|
|
5
5
|
Cumo (pronounced like "koomo") is a CUDA-aware numerical library whose interface is highly compatible with [Ruby Numo](https://github.com/ruby-numo).
|
|
6
6
|
This library lets you benefit from GPU speedups by replacing Numo with Cumo, changing only a small amount of code.
|
|
7
7
|
|
|
8
|
+
<img src="https://raw.githubusercontent.com/sonots/cumo-logo/master/logo_transparent.png" alt="cumo logo" title="cumo logo" width="50%">
|
|
8
9
|
|
|
9
10
|
## Requirements
|
|
10
11
|
|
|
11
|
-
* Ruby 2.
|
|
12
|
+
* Ruby 2.5 or later
|
|
12
13
|
* NVIDIA GPU Compute Capability 6.0 (Pascal) or later
|
|
13
14
|
* CUDA 9.0 or later
|
|
14
15
|
|
|
@@ -169,23 +170,6 @@ Generate docs:
|
|
|
169
170
|
bundle exec rake docs
|
|
170
171
|
```
|
|
171
172
|
|
|
172
|
-
## Source code organizations
|
|
173
|
-
|
|
174
|
-
* `*_kernel.{h,cuh,cu}` files are for device (CUDA kernels).
|
|
175
|
-
* .cu files are compiled via nvcc.
|
|
176
|
-
* .cu files define C wrapper functions to launch CUDA kernels to enable to be called from .c files.
|
|
177
|
-
* Technically, it is not possible to use CRuby API such as `VALUE` in .cu files.
|
|
178
|
-
* CRuby API is not callable from CUDA kernel because they do not have `__device__` modifier.
|
|
179
|
-
* nvcc does not support `#include RUBY_EXTCONF_H`, so can not include `ruby.h`.
|
|
180
|
-
* (RULE) It is allowed to use C++14 codes in .cu files.
|
|
181
|
-
* Rest of `*.{h,c}` files are for host (CPU).
|
|
182
|
-
* Call C wrapper functions defined in .cu files.
|
|
183
|
-
* It can use CRuby API.
|
|
184
|
-
* (RULE) It is not allowed to use C++ codes in host files.
|
|
185
|
-
|
|
186
|
-
Ruby's `mkmf` (or `extconf.rb`) does not support to specify 3rd compiler such as NVCC for another files of extensions `.cu`.
|
|
187
|
-
Therefore, cumo specify a wrapper command `bin/mkmf-cu-nvcc` as a compiler and changes its behavor depending on extensions of files to compile.
|
|
188
|
-
|
|
189
173
|
## Advanced Tips on Development
|
|
190
174
|
|
|
191
175
|
### ccache
|
|
@@ -201,25 +185,21 @@ ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/g++"
|
|
|
201
185
|
ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/nvcc"
|
|
202
186
|
```
|
|
203
187
|
|
|
204
|
-
|
|
188
|
+
### Build in parallel
|
|
205
189
|
|
|
206
|
-
|
|
190
|
+
Use the `MAKEFLAGS` environment variable to pass options to the `make` command. For example, you can build in parallel as:
|
|
207
191
|
|
|
208
192
|
```
|
|
209
|
-
bundle exec
|
|
193
|
+
bundle exec env MAKEFLAGS=-j8 rake compile
|
|
210
194
|
```
|
|
211
195
|
|
|
212
|
-
###
|
|
213
|
-
|
|
214
|
-
`DTYPE` environment variable is available as:
|
|
196
|
+
### Specify nvcc --generate-code options
|
|
215
197
|
|
|
216
198
|
```
|
|
217
|
-
bundle exec
|
|
199
|
+
bundle exec env CUMO_NVCC_GENERATE_CODE=arch=compute_60,code=sm_60 rake compile
|
|
218
200
|
```
|
|
219
201
|
|
|
220
|
-
|
|
221
|
-
bundle exec DTYPE=dfloat ruby test/narray_test.rb
|
|
222
|
-
```
|
|
202
|
+
This is useful even during development because it makes it possible to skip the JIT compilation of PTX to cubin that otherwise occurs at runtime.
|
|
223
203
|
|
|
224
204
|
### Run tests with gdb
|
|
225
205
|
|
|
@@ -237,6 +217,25 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb
|
|
|
237
217
|
|
|
238
218
|
You may put a breakpoint by calling `cumo_debug_breakpoint()` at C source codes.
|
|
239
219
|
|
|
220
|
+
### Run only the test at a specific line
|
|
221
|
+
`--location` option is available as:
|
|
222
|
+
|
|
223
|
+
```
|
|
224
|
+
bundle exec ruby test/narray_test.rb --location 121
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Compile and run tests for only a specific type
|
|
228
|
+
|
|
229
|
+
`DTYPE` environment variable is available as:
|
|
230
|
+
|
|
231
|
+
```
|
|
232
|
+
bundle exec env DTYPE=dfloat rake compile
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
```
|
|
236
|
+
bundle exec env DTYPE=dfloat ruby test/narray_test.rb
|
|
237
|
+
```
|
|
238
|
+
|
|
240
239
|
### Run program always synchronizing CPU and GPU
|
|
241
240
|
|
|
242
241
|
```
|
|
@@ -249,4 +248,9 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/sonots
|
|
|
249
248
|
|
|
250
249
|
## License
|
|
251
250
|
|
|
252
|
-
[LICENSE.txt](./LICENSE.txt)
|
|
251
|
+
* [LICENSE.txt](./LICENSE.txt)
|
|
252
|
+
* [3rd_party/LICENSE.txt](./3rd_party/LICENSE.txt)
|
|
253
|
+
|
|
254
|
+
## Related Materials
|
|
255
|
+
|
|
256
|
+
* [Fast Numerical Computing and Deep Learning in Ruby with Cumo](https://speakerdeck.com/sonots/fast-numerical-computing-and-deep-learning-in-ruby-with-cumo) - Presentation Slide at [RubyKaigi 2018](https://rubykaigi.org/2018/presentations/sonots.html#may31)
|
data/bench/cumo_bench.rb
CHANGED
|
@@ -8,12 +8,14 @@ b = Cumo::Float32.new(10).seq(10,10)
|
|
|
8
8
|
c = a + b
|
|
9
9
|
c.free
|
|
10
10
|
|
|
11
|
-
def elementwise
|
|
12
|
-
|
|
11
|
+
def elementwise(num = nil)
|
|
12
|
+
num ||= NUM
|
|
13
|
+
puts "elementwise(#{num})"
|
|
13
14
|
Benchmark.bm do |r|
|
|
14
15
|
a = Cumo::Float32.new(10000).seq(1)
|
|
15
16
|
b = Cumo::Float32.new(10000).seq(10,10)
|
|
16
17
|
(a + b).free # warm up
|
|
18
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
17
19
|
r.report('10**4') do
|
|
18
20
|
NUM.times do
|
|
19
21
|
(a + b).free
|
|
@@ -24,6 +26,7 @@ def elementwise
|
|
|
24
26
|
a = Cumo::Float32.new(100000).seq(1)
|
|
25
27
|
b = Cumo::Float32.new(100000).seq(10,10)
|
|
26
28
|
(a + b).free # warm up
|
|
29
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
27
30
|
r.report('10**5') do
|
|
28
31
|
NUM.times do
|
|
29
32
|
(a + b).free
|
|
@@ -34,6 +37,7 @@ def elementwise
|
|
|
34
37
|
a = Cumo::Float32.new(1000000).seq(1)
|
|
35
38
|
b = Cumo::Float32.new(1000000).seq(10,10)
|
|
36
39
|
(a + b).free # warm up
|
|
40
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
37
41
|
r.report('10**6') do
|
|
38
42
|
NUM.times do
|
|
39
43
|
(a + b).free
|
|
@@ -44,6 +48,7 @@ def elementwise
|
|
|
44
48
|
a = Cumo::Float32.new(10000000).seq(1)
|
|
45
49
|
b = Cumo::Float32.new(10000000).seq(10,10)
|
|
46
50
|
(a + b).free # warm up
|
|
51
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
47
52
|
r.report('10**7') do
|
|
48
53
|
NUM.times do
|
|
49
54
|
(a + b).free
|
|
@@ -54,6 +59,7 @@ def elementwise
|
|
|
54
59
|
a = Cumo::Float32.new(100000000).seq(1)
|
|
55
60
|
b = Cumo::Float32.new(100000000).seq(10,10)
|
|
56
61
|
(a + b).free # warm up
|
|
62
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
57
63
|
r.report('10**8') do
|
|
58
64
|
NUM.times do
|
|
59
65
|
(a + b).free
|
|
@@ -63,10 +69,13 @@ def elementwise
|
|
|
63
69
|
end
|
|
64
70
|
end
|
|
65
71
|
|
|
66
|
-
def reduction
|
|
67
|
-
|
|
72
|
+
def reduction(num = nil)
|
|
73
|
+
num ||= NUM
|
|
74
|
+
puts "reduction(#{num})"
|
|
68
75
|
Benchmark.bm do |r|
|
|
69
76
|
a = Cumo::Float32.new(10000).seq(1)
|
|
77
|
+
(a.sum).free # warm up
|
|
78
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
70
79
|
r.report('10**4') do
|
|
71
80
|
NUM.times do
|
|
72
81
|
(a.sum).free
|
|
@@ -75,6 +84,8 @@ def reduction
|
|
|
75
84
|
end
|
|
76
85
|
|
|
77
86
|
a = Cumo::Float32.new(100000).seq(1)
|
|
87
|
+
(a.sum).free # warm up
|
|
88
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
78
89
|
r.report('10**5') do
|
|
79
90
|
NUM.times do
|
|
80
91
|
(a.sum).free
|
|
@@ -83,6 +94,8 @@ def reduction
|
|
|
83
94
|
end
|
|
84
95
|
|
|
85
96
|
a = Cumo::Float32.new(1000000).seq(1)
|
|
97
|
+
(a.sum).free # warm up
|
|
98
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
86
99
|
r.report('10**6') do
|
|
87
100
|
NUM.times do
|
|
88
101
|
(a.sum).free
|
|
@@ -91,6 +104,8 @@ def reduction
|
|
|
91
104
|
end
|
|
92
105
|
|
|
93
106
|
a = Cumo::Float32.new(10000000).seq(1)
|
|
107
|
+
(a.sum).free # warm up
|
|
108
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
94
109
|
r.report('10**7') do
|
|
95
110
|
NUM.times do
|
|
96
111
|
(a.sum).free
|
|
@@ -99,6 +114,8 @@ def reduction
|
|
|
99
114
|
end
|
|
100
115
|
|
|
101
116
|
a = Cumo::Float32.new(100000000).seq(1)
|
|
117
|
+
(a.sum).free # warm up
|
|
118
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
102
119
|
r.report('10**8') do
|
|
103
120
|
NUM.times do
|
|
104
121
|
(a.sum).free
|
|
@@ -108,13 +125,14 @@ def reduction
|
|
|
108
125
|
end
|
|
109
126
|
end
|
|
110
127
|
|
|
111
|
-
def dot
|
|
112
|
-
num
|
|
113
|
-
puts
|
|
128
|
+
def dot(num = nil)
|
|
129
|
+
num ||= 1
|
|
130
|
+
puts "dot(#{num})"
|
|
114
131
|
Benchmark.bm do |r|
|
|
115
132
|
a = Cumo::Float32.new(100,100).seq(1)
|
|
116
133
|
b = Cumo::Float32.new(100,100).seq(10,10)
|
|
117
134
|
a.dot(b).free # warm up
|
|
135
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
118
136
|
r.report('10**4') do
|
|
119
137
|
num.times do
|
|
120
138
|
a.dot(b).free
|
|
@@ -125,6 +143,7 @@ def dot
|
|
|
125
143
|
a = Cumo::Float32.new(100,1000).seq(1)
|
|
126
144
|
b = Cumo::Float32.new(1000,100).seq(10,10)
|
|
127
145
|
a.dot(b).free # warm up
|
|
146
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
128
147
|
r.report('10**5') do
|
|
129
148
|
num.times do
|
|
130
149
|
a.dot(b).free
|
|
@@ -135,6 +154,7 @@ def dot
|
|
|
135
154
|
a = Cumo::Float32.new(100,10000).seq(1)
|
|
136
155
|
b = Cumo::Float32.new(10000,100).seq(10,10)
|
|
137
156
|
a.dot(b).free # warm up
|
|
157
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
138
158
|
r.report('10**6') do
|
|
139
159
|
num.times do
|
|
140
160
|
a.dot(b).free
|
|
@@ -145,6 +165,7 @@ def dot
|
|
|
145
165
|
a = Cumo::Float32.new(100,100000).seq(1)
|
|
146
166
|
b = Cumo::Float32.new(100000,100).seq(10,10)
|
|
147
167
|
a.dot(b).free # warm up
|
|
168
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
148
169
|
r.report('10**7') do
|
|
149
170
|
num.times do
|
|
150
171
|
a.dot(b).free
|
|
@@ -155,6 +176,7 @@ def dot
|
|
|
155
176
|
a = Cumo::Float32.new(100,1000000).seq(1)
|
|
156
177
|
b = Cumo::Float32.new(1000000,100).seq(10,10)
|
|
157
178
|
a.dot(b).free # warm up
|
|
179
|
+
Cumo::CUDA::Runtime.cudaDeviceSynchronize
|
|
158
180
|
r.report('10**8') do
|
|
159
181
|
num.times do
|
|
160
182
|
a.dot(b).free
|
|
@@ -170,24 +192,24 @@ dot
|
|
|
170
192
|
|
|
171
193
|
# Tesla V100-SXM2...
|
|
172
194
|
#
|
|
173
|
-
# element-wise
|
|
195
|
+
# element-wise(100)
|
|
174
196
|
# user system total real
|
|
175
|
-
# 10**4 0.000000 0.000000 0.000000 ( 0.
|
|
176
|
-
# 10**5 0.
|
|
177
|
-
# 10**6 0.
|
|
178
|
-
# 10**7 0.
|
|
179
|
-
# 10**8 0.
|
|
180
|
-
# reduction
|
|
197
|
+
# 10**4 0.000000 0.000000 0.000000 ( 0.006332)
|
|
198
|
+
# 10**5 0.000000 0.000000 0.000000 ( 0.006280)
|
|
199
|
+
# 10**6 0.010000 0.000000 0.010000 ( 0.008123)
|
|
200
|
+
# 10**7 0.000000 0.010000 0.010000 ( 0.022176)
|
|
201
|
+
# 10**8 0.100000 0.050000 0.150000 ( 0.151999)
|
|
202
|
+
# reduction(100)
|
|
181
203
|
# user system total real
|
|
182
|
-
# 10**4 0.010000 0.000000 0.010000 ( 0.
|
|
183
|
-
# 10**5 0.
|
|
184
|
-
# 10**6 0.
|
|
185
|
-
# 10**7 1.
|
|
186
|
-
# 10**8 11.
|
|
187
|
-
# dot
|
|
204
|
+
# 10**4 0.010000 0.000000 0.010000 ( 0.009735)
|
|
205
|
+
# 10**5 0.010000 0.010000 0.020000 ( 0.022882)
|
|
206
|
+
# 10**6 0.110000 0.050000 0.160000 ( 0.154641)
|
|
207
|
+
# 10**7 1.220000 0.590000 1.810000 ( 1.805643)
|
|
208
|
+
# 10**8 11.840000 6.110000 17.950000 ( 17.946511)
|
|
209
|
+
# dot(1)
|
|
188
210
|
# user system total real
|
|
189
|
-
# 10**4 0.000000 0.000000 0.000000 ( 0.
|
|
190
|
-
# 10**5 0.000000 0.000000 0.000000 ( 0.
|
|
191
|
-
# 10**6 0.000000 0.000000 0.000000 ( 0.
|
|
192
|
-
# 10**7 0.
|
|
193
|
-
# 10**8 0.
|
|
211
|
+
# 10**4 0.000000 0.000000 0.000000 ( 0.000206)
|
|
212
|
+
# 10**5 0.000000 0.000000 0.000000 ( 0.000195)
|
|
213
|
+
# 10**6 0.000000 0.000000 0.000000 ( 0.000239)
|
|
214
|
+
# 10**7 0.000000 0.000000 0.000000 ( 0.000719)
|
|
215
|
+
# 10**8 0.010000 0.000000 0.010000 ( 0.004636)
|