cumo 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (158) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/3rd_party/LICENSE.txt +60 -0
  4. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
  5. data/LICENSE.txt +1 -62
  6. data/README.md +33 -29
  7. data/bench/cumo_bench.rb +47 -25
  8. data/bench/numo_bench.rb +27 -25
  9. data/docs/src-tree.md +16 -0
  10. data/ext/cumo/cuda/cublas.c +69 -219
  11. data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
  12. data/ext/cumo/cuda/runtime.c +2 -14
  13. data/ext/cumo/cumo.c +16 -16
  14. data/ext/cumo/include/cumo.h +2 -2
  15. data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
  16. data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
  17. data/ext/cumo/include/cumo/indexer.h +46 -63
  18. data/ext/cumo/include/cumo/intern.h +58 -112
  19. data/ext/cumo/include/cumo/narray.h +214 -185
  20. data/ext/cumo/include/cumo/narray_kernel.h +66 -37
  21. data/ext/cumo/include/cumo/ndloop.h +42 -42
  22. data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
  23. data/ext/cumo/include/cumo/template.h +56 -51
  24. data/ext/cumo/include/cumo/template_kernel.h +31 -31
  25. data/ext/cumo/include/cumo/types/bit.h +3 -3
  26. data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
  27. data/ext/cumo/include/cumo/types/complex.h +126 -126
  28. data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
  29. data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
  30. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
  31. data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
  32. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
  33. data/ext/cumo/include/cumo/types/int_macro.h +1 -1
  34. data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
  35. data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
  36. data/ext/cumo/include/cumo/types/scomplex.h +5 -5
  37. data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
  38. data/ext/cumo/narray/array.c +143 -143
  39. data/ext/cumo/narray/data.c +184 -184
  40. data/ext/cumo/narray/gen/cogen.rb +5 -2
  41. data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
  42. data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
  43. data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
  44. data/ext/cumo/narray/gen/erbln.rb +132 -0
  45. data/ext/cumo/narray/gen/erbpp2.rb +18 -13
  46. data/ext/cumo/narray/gen/narray_def.rb +3 -3
  47. data/ext/cumo/narray/gen/spec.rb +2 -2
  48. data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
  49. data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
  50. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
  51. data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
  52. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
  53. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
  54. data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
  55. data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
  56. data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
  57. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
  58. data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
  59. data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
  60. data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
  61. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
  62. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
  63. data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
  64. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
  65. data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
  66. data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
  67. data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
  68. data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
  69. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
  70. data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
  71. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
  72. data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
  73. data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
  74. data/ext/cumo/narray/gen/tmpl/each.c +9 -9
  75. data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
  76. data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
  77. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
  78. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
  79. data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
  80. data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
  81. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
  82. data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
  83. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
  84. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
  85. data/ext/cumo/narray/gen/tmpl/format.c +11 -11
  86. data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
  87. data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
  88. data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
  89. data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
  90. data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
  91. data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
  92. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
  93. data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
  94. data/ext/cumo/narray/gen/tmpl/median.c +10 -10
  95. data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
  96. data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
  97. data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
  98. data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
  99. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
  100. data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
  101. data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
  102. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
  103. data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
  104. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
  105. data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
  106. data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
  107. data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
  108. data/ext/cumo/narray/gen/tmpl/store.c +6 -6
  109. data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
  110. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
  111. data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
  112. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
  113. data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
  114. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
  115. data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
  116. data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
  117. data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
  118. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
  119. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
  120. data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
  121. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
  122. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
  123. data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
  124. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
  125. data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
  126. data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
  127. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
  128. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
  129. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
  130. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
  131. data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
  132. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
  133. data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
  134. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
  135. data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
  136. data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
  137. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
  138. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
  139. data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
  140. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
  141. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
  142. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
  143. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
  144. data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
  145. data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
  146. data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
  147. data/ext/cumo/narray/index.c +213 -213
  148. data/ext/cumo/narray/math.c +27 -27
  149. data/ext/cumo/narray/narray.c +484 -484
  150. data/ext/cumo/narray/ndloop.c +259 -258
  151. data/ext/cumo/narray/rand.c +3 -3
  152. data/ext/cumo/narray/step.c +70 -70
  153. data/ext/cumo/narray/struct.c +139 -139
  154. metadata +6 -7
  155. data/ext/cumo/include/cumo/intern_fwd.h +0 -38
  156. data/lib/erbpp.rb +0 -294
  157. data/lib/erbpp/line_number.rb +0 -137
  158. data/lib/erbpp/narray_def.rb +0 -381
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 6898598e4614bd912dfa4e83e31695d7fc5eb256
4
- data.tar.gz: 752d37294a51016c0256bcdab6d324ebc975bdac
2
+ SHA256:
3
+ metadata.gz: 621f319a4a92862c60267ace14b39968fb7e5c203da94bb9d1f01b86412b77b3
4
+ data.tar.gz: db4964d257fcbe79c6a6f4b8e84ac71f7657c87168d43b7c205d9de5059b02e3
5
5
  SHA512:
6
- metadata.gz: 81279774f60ac5f38a23751f52a42ae01022d9a5c33cf24d905302def0bc75e20237ad76bb986f4bdd43a184eca14ff3d27da46095189c29217d8fd894dad9fc
7
- data.tar.gz: 609ff0cbf6c3a28fc4fafde14f90adfc82ff7efcf268071a73605d61615feb21770e5bddfc8252a042657cbb2bc179f839030e4b85b1f4c11475ed9b27e92ef5
6
+ metadata.gz: a703d9f146c556af7ef60c869dd44989fdd67d8c2b45f99d337b1cc5dd606a1eace434b46d0b6a7e4a404930351565f2ca3e93f116886a5b915f285a8b1b85d0
7
+ data.tar.gz: 9fc0c1c794aeb12f9f5a3cf7866bec32439b09083c3b34869301ac5dc3c602845ec3c51e7caa7bf5022d375e448db5c4c6d3f16f77660645b07fdd5dc2bf29c1
data/.gitignore CHANGED
@@ -25,3 +25,4 @@ t.rb
25
25
  *.exe
26
26
  .DS_Store
27
27
  /vendor/
28
+ .kernel_cache/
@@ -0,0 +1,60 @@
1
+ ######################################################################
2
+ # The Cumo is a fork of Numo NArray v0.9.0.9.
3
+ # Cumo's source code and documents contain the original Ruby/Numo ones.
4
+ ######################################################################
5
+ BSD 3-Clause License
6
+
7
+ Copyright (c) 1999-2017, Masahiro TANAKA
8
+ All rights reserved.
9
+
10
+ Redistribution and use in source and binary forms, with or without
11
+ modification, are permitted provided that the following conditions are met:
12
+
13
+ * Redistributions of source code must retain the above copyright notice, this
14
+ list of conditions and the following disclaimer.
15
+
16
+ * Redistributions in binary form must reproduce the above copyright notice,
17
+ this list of conditions and the following disclaimer in the documentation
18
+ and/or other materials provided with the distribution.
19
+
20
+ * Neither the name of the copyright holder nor the names of its
21
+ contributors may be used to endorse or promote products derived from
22
+ this software without specific prior written permission.
23
+
24
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
28
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
+
35
+ ######################################################################
36
+ # The Cumo refers some implementations of CuPy such as memory pool.
37
+ # Cumo's source code and documents contain the original CuPy ones.
38
+ ######################################################################
39
+ MIT License
40
+
41
+ Copyright (c) 2015 Preferred Infrastructure, Inc.
42
+ Copyright (c) 2015 Preferred Networks, Inc.
43
+
44
+ Permission is hereby granted, free of charge, to any person obtaining a copy
45
+ of this software and associated documentation files (the "Software"), to deal
46
+ in the Software without restriction, including without limitation the rights
47
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
48
+ copies of the Software, and to permit persons to whom the Software is
49
+ furnished to do so, subject to the following conditions:
50
+
51
+ The above copyright notice and this permission notice shall be included in
52
+ all copies or substantial portions of the Software.
53
+
54
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
55
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
56
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
59
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
60
+ THE SOFTWARE.
@@ -32,7 +32,19 @@ module MakeMakefileCuda
32
32
  # TODO(sonots): Make it possible to configure "nvcc" and additional arguments
33
33
  def nvcc_command
34
34
  s = MakeMakefileCuda::Nvcc.generate(argv)
35
- ["nvcc " << s << " -arch=sm_35"]
35
+ cmd = "nvcc " << s
36
+ if ENV['CUMO_NVCC_GENERATE_CODE']
37
+ cmd << " --generate-code=#{ENV['CUMO_NVCC_GENERATE_CODE']}"
38
+ elsif ENV['DEBUG']
39
+ cmd << " -arch=sm_35"
40
+ else
41
+ cmd << " --generate-code=arch=compute_35,code=sm_35"
42
+ cmd << " --generate-code=arch=compute_50,code=sm_50"
43
+ cmd << " --generate-code=arch=compute_60,code=sm_60"
44
+ cmd << " --generate-code=arch=compute_70,code=sm_70"
45
+ cmd << " --generate-code=arch=compute_70,code=compute_70"
46
+ end
47
+ cmd
36
48
  end
37
49
 
38
50
  def c_command
@@ -1,67 +1,6 @@
1
- BSD 3-Clause License
2
-
3
- Copyright (c) 2017 Naotoshi Seo
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in
13
- all copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
- THE SOFTWARE.
22
-
23
- ######################################################################
24
- # The Cumo is a fork of Numo NArray v0.9.0.9.
25
- # Cumo's source code and documents contain the original Ruby/Numo ones.
26
- ######################################################################
27
- BSD 3-Clause License
28
-
29
- Copyright (c) 1999-2017, Masahiro TANAKA
30
- All rights reserved.
31
-
32
- Redistribution and use in source and binary forms, with or without
33
- modification, are permitted provided that the following conditions are met:
34
-
35
- * Redistributions of source code must retain the above copyright notice, this
36
- list of conditions and the following disclaimer.
37
-
38
- * Redistributions in binary form must reproduce the above copyright notice,
39
- this list of conditions and the following disclaimer in the documentation
40
- and/or other materials provided with the distribution.
41
-
42
- * Neither the name of the copyright holder nor the names of its
43
- contributors may be used to endorse or promote products derived from
44
- this software without specific prior written permission.
45
-
46
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
47
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
49
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
50
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
52
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
53
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
54
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
55
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56
-
57
- ######################################################################
58
- # The Cumo refers some implementations of CuPy such as memory pool.
59
- # Cumo's source code and documents contain the original CuPy ones.
60
- ######################################################################
61
1
  MIT License
62
2
 
63
- Copyright (c) 2015 Preferred Infrastructure, Inc.
64
- Copyright (c) 2015 Preferred Networks, Inc.
3
+ Copyright (c) 2017 Naotoshi Seo
65
4
 
66
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
67
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -5,10 +5,11 @@
5
5
  Cumo (pronounced like "koomo") is a CUDA-aware numerical library whose interface is highly compatible with [Ruby Numo](https://github.com/ruby-numo).
6
6
  This library lets you benefit from GPU speedups by replacing Numo with Cumo, changing only a small amount of code.
7
7
 
8
+ <img src="https://raw.githubusercontent.com/sonots/cumo-logo/master/logo_transparent.png" alt="cumo logo" title="cumo logo" width="50%">
8
9
 
9
10
  ## Requirements
10
11
 
11
- * Ruby 2.0 or later
12
+ * Ruby 2.5 or later
12
13
  * NVIDIA GPU Compute Capability 6.0 (Pascal) or later
13
14
  * CUDA 9.0 or later
14
15
 
@@ -169,23 +170,6 @@ Generate docs:
169
170
  bundle exec rake docs
170
171
  ```
171
172
 
172
- ## Source code organizations
173
-
174
- * `*_kernel.{h,cuh,cu}` files are for device (CUDA kernels).
175
- * .cu files are compiled via nvcc.
176
- * .cu files define C wrapper functions to launch CUDA kernels to enable to be called from .c files.
177
- * Technically, it is not possible to use CRuby API such as `VALUE` in .cu files.
178
- * CRuby API is not callable from CUDA kernel because they do not have `__device__` modifier.
179
- * nvcc does not support `#include RUBY_EXTCONF_H`, so can not include `ruby.h`.
180
- * (RULE) It is allowed to use C++14 codes in .cu files.
181
- * Rest of `*.{h,c}` files are for host (CPU).
182
- * Call C wrapper functions defined in .cu files.
183
- * It can use CRuby API.
184
- * (RULE) It is not allowed to use C++ codes in host files.
185
-
186
- Ruby's `mkmf` (or `extconf.rb`) does not support to specify 3rd compiler such as NVCC for another files of extensions `.cu`.
187
- Therefore, cumo specify a wrapper command `bin/mkmf-cu-nvcc` as a compiler and changes its behavor depending on extensions of files to compile.
188
-
189
173
  ## Advanced Tips on Development
190
174
 
191
175
  ### ccache
@@ -201,25 +185,21 @@ ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/g++"
201
185
  ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/nvcc"
202
186
  ```
203
187
 
204
- ## Run tests only a specific line
188
+ ### Build in parallel
205
189
 
206
- `--location` option is available as:
190
+ Use `MAKEFLAGS` environment variable to specify `make` command options. You can build in parallel as:
207
191
 
208
192
  ```
209
- bundle exec ruby test/narray_test.rb --location 121
193
+ bundle exec env MAKEFLAGS=-j8 rake compile
210
194
  ```
211
195
 
212
- ### Compile and run tests only a specific type
213
-
214
- `DTYPE` environment variable is available as:
196
+ ### Specify nvcc --generate-code options
215
197
 
216
198
  ```
217
- bundle exec DTYPE=dfloat rake compile
199
+ bundle exec env CUMO_NVCC_GENERATE_CODE=arch=compute_60,code=sm_60 rake compile
218
200
  ```
219
201
 
220
- ```
221
- bundle exec DTYPE=dfloat ruby test/narray_test.rb
222
- ```
202
+ This is useful even during development because it makes it possible to skip the JIT compilation of PTX to cubin that would otherwise occur at runtime.
223
203
 
224
204
  ### Run tests with gdb
225
205
 
@@ -237,6 +217,25 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb
237
217
 
238
218
  You may set a breakpoint by calling `cumo_debug_breakpoint()` in the C source code.
239
219
 
220
+ ### Run tests only a specific line
221
+ `--location` option is available as:
222
+
223
+ ```
224
+ bundle exec ruby test/narray_test.rb --location 121
225
+ ```
226
+
227
+ ### Compile and run tests only a specific type
228
+
229
+ `DTYPE` environment variable is available as:
230
+
231
+ ```
232
+ bundle exec DTYPE=dfloat rake compile
233
+ ```
234
+
235
+ ```
236
+ bundle exec DTYPE=dfloat ruby test/narray_test.rb
237
+ ```
238
+
240
239
  ### Run program always synchronizing CPU and GPU
241
240
 
242
241
  ```
@@ -249,4 +248,9 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/sonots
249
248
 
250
249
  ## License
251
250
 
252
- [LICENSE.txt](./LICENSE.txt)
251
+ * [LICENSE.txt](./LICENSE.txt)
252
+ * [3rd_party/LICENSE.txt](./3rd_party/LICENSE.txt)
253
+
254
+ ## Related Materials
255
+
256
+ * [Fast Numerical Computing and Deep Learning in Ruby with Cumo](https://speakerdeck.com/sonots/fast-numerical-computing-and-deep-learning-in-ruby-with-cumo) - Presentation Slide at [RubyKaigi 2018](https://rubykaigi.org/2018/presentations/sonots.html#may31)
@@ -8,12 +8,14 @@ b = Cumo::Float32.new(10).seq(10,10)
8
8
  c = a + b
9
9
  c.free
10
10
 
11
- def elementwise
12
- puts 'element-wise'
11
+ def elementwise(num = nil)
12
+ num ||= NUM
13
+ puts "elementwise(#{num})"
13
14
  Benchmark.bm do |r|
14
15
  a = Cumo::Float32.new(10000).seq(1)
15
16
  b = Cumo::Float32.new(10000).seq(10,10)
16
17
  (a + b).free # warm up
18
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
17
19
  r.report('10**4') do
18
20
  NUM.times do
19
21
  (a + b).free
@@ -24,6 +26,7 @@ def elementwise
24
26
  a = Cumo::Float32.new(100000).seq(1)
25
27
  b = Cumo::Float32.new(100000).seq(10,10)
26
28
  (a + b).free # warm up
29
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
27
30
  r.report('10**5') do
28
31
  NUM.times do
29
32
  (a + b).free
@@ -34,6 +37,7 @@ def elementwise
34
37
  a = Cumo::Float32.new(1000000).seq(1)
35
38
  b = Cumo::Float32.new(1000000).seq(10,10)
36
39
  (a + b).free # warm up
40
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
37
41
  r.report('10**6') do
38
42
  NUM.times do
39
43
  (a + b).free
@@ -44,6 +48,7 @@ def elementwise
44
48
  a = Cumo::Float32.new(10000000).seq(1)
45
49
  b = Cumo::Float32.new(10000000).seq(10,10)
46
50
  (a + b).free # warm up
51
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
47
52
  r.report('10**7') do
48
53
  NUM.times do
49
54
  (a + b).free
@@ -54,6 +59,7 @@ def elementwise
54
59
  a = Cumo::Float32.new(100000000).seq(1)
55
60
  b = Cumo::Float32.new(100000000).seq(10,10)
56
61
  (a + b).free # warm up
62
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
57
63
  r.report('10**8') do
58
64
  NUM.times do
59
65
  (a + b).free
@@ -63,10 +69,13 @@ def elementwise
63
69
  end
64
70
  end
65
71
 
66
- def reduction
67
- puts 'reduction'
72
+ def reduction(num = nil)
73
+ num ||= NUM
74
+ puts "reduction(#{num})"
68
75
  Benchmark.bm do |r|
69
76
  a = Cumo::Float32.new(10000).seq(1)
77
+ (a.sum).free # warm up
78
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
70
79
  r.report('10**4') do
71
80
  NUM.times do
72
81
  (a.sum).free
@@ -75,6 +84,8 @@ def reduction
75
84
  end
76
85
 
77
86
  a = Cumo::Float32.new(100000).seq(1)
87
+ (a.sum).free # warm up
88
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
78
89
  r.report('10**5') do
79
90
  NUM.times do
80
91
  (a.sum).free
@@ -83,6 +94,8 @@ def reduction
83
94
  end
84
95
 
85
96
  a = Cumo::Float32.new(1000000).seq(1)
97
+ (a.sum).free # warm up
98
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
86
99
  r.report('10**6') do
87
100
  NUM.times do
88
101
  (a.sum).free
@@ -91,6 +104,8 @@ def reduction
91
104
  end
92
105
 
93
106
  a = Cumo::Float32.new(10000000).seq(1)
107
+ (a.sum).free # warm up
108
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
94
109
  r.report('10**7') do
95
110
  NUM.times do
96
111
  (a.sum).free
@@ -99,6 +114,8 @@ def reduction
99
114
  end
100
115
 
101
116
  a = Cumo::Float32.new(100000000).seq(1)
117
+ (a.sum).free # warm up
118
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
102
119
  r.report('10**8') do
103
120
  NUM.times do
104
121
  (a.sum).free
@@ -108,13 +125,14 @@ def reduction
108
125
  end
109
126
  end
110
127
 
111
- def dot
112
- num = 3
113
- puts 'dot'
128
+ def dot(num = nil)
129
+ num ||= 1
130
+ puts "dot(#{num})"
114
131
  Benchmark.bm do |r|
115
132
  a = Cumo::Float32.new(100,100).seq(1)
116
133
  b = Cumo::Float32.new(100,100).seq(10,10)
117
134
  a.dot(b).free # warm up
135
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
118
136
  r.report('10**4') do
119
137
  num.times do
120
138
  a.dot(b).free
@@ -125,6 +143,7 @@ def dot
125
143
  a = Cumo::Float32.new(100,1000).seq(1)
126
144
  b = Cumo::Float32.new(1000,100).seq(10,10)
127
145
  a.dot(b).free # warm up
146
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
128
147
  r.report('10**5') do
129
148
  num.times do
130
149
  a.dot(b).free
@@ -135,6 +154,7 @@ def dot
135
154
  a = Cumo::Float32.new(100,10000).seq(1)
136
155
  b = Cumo::Float32.new(10000,100).seq(10,10)
137
156
  a.dot(b).free # warm up
157
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
138
158
  r.report('10**6') do
139
159
  num.times do
140
160
  a.dot(b).free
@@ -145,6 +165,7 @@ def dot
145
165
  a = Cumo::Float32.new(100,100000).seq(1)
146
166
  b = Cumo::Float32.new(100000,100).seq(10,10)
147
167
  a.dot(b).free # warm up
168
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
148
169
  r.report('10**7') do
149
170
  num.times do
150
171
  a.dot(b).free
@@ -155,6 +176,7 @@ def dot
155
176
  a = Cumo::Float32.new(100,1000000).seq(1)
156
177
  b = Cumo::Float32.new(1000000,100).seq(10,10)
157
178
  a.dot(b).free # warm up
179
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
158
180
  r.report('10**8') do
159
181
  num.times do
160
182
  a.dot(b).free
@@ -170,24 +192,24 @@ dot
170
192
 
171
193
  # Tesla V100-SXM2...
172
194
  #
173
- # element-wise
195
+ # elementwise(100)
174
196
  # user system total real
175
- # 10**4 0.000000 0.000000 0.000000 ( 0.005769)
176
- # 10**5 0.010000 0.000000 0.010000 ( 0.006609)
177
- # 10**6 0.000000 0.010000 0.010000 ( 0.010313)
178
- # 10**7 0.040000 0.010000 0.050000 ( 0.050986)
179
- # 10**8 0.310000 0.130000 0.440000 ( 0.449699)
180
- # reduction
197
+ # 10**4 0.000000 0.000000 0.000000 ( 0.006332)
198
+ # 10**5 0.000000 0.000000 0.000000 ( 0.006280)
199
+ # 10**6 0.010000 0.000000 0.010000 ( 0.008123)
200
+ # 10**7 0.000000 0.010000 0.010000 ( 0.022176)
201
+ # 10**8 0.100000 0.050000 0.150000 ( 0.151999)
202
+ # reduction(100)
181
203
  # user system total real
182
- # 10**4 0.010000 0.000000 0.010000 ( 0.009484)
183
- # 10**5 0.020000 0.010000 0.030000 ( 0.022071)
184
- # 10**6 0.100000 0.050000 0.150000 ( 0.152070)
185
- # 10**7 1.150000 0.600000 1.750000 ( 1.754977)
186
- # 10**8 11.720000 5.750000 17.470000 ( 17.470990)
187
- # dot
204
+ # 10**4 0.010000 0.000000 0.010000 ( 0.009735)
205
+ # 10**5 0.010000 0.010000 0.020000 ( 0.022882)
206
+ # 10**6 0.110000 0.050000 0.160000 ( 0.154641)
207
+ # 10**7 1.220000 0.590000 1.810000 ( 1.805643)
208
+ # 10**8 11.840000 6.110000 17.950000 ( 17.946511)
209
+ # dot(1)
188
210
  # user system total real
189
- # 10**4 0.000000 0.000000 0.000000 ( 0.000351)
190
- # 10**5 0.000000 0.000000 0.000000 ( 0.000838)
191
- # 10**6 0.000000 0.000000 0.000000 ( 0.002702)
192
- # 10**7 0.020000 0.010000 0.030000 ( 0.024650)
193
- # 10**8 0.180000 0.060000 0.240000 ( 0.245101)
211
+ # 10**4 0.000000 0.000000 0.000000 ( 0.000206)
212
+ # 10**5 0.000000 0.000000 0.000000 ( 0.000195)
213
+ # 10**6 0.000000 0.000000 0.000000 ( 0.000239)
214
+ # 10**7 0.000000 0.000000 0.000000 ( 0.000719)
215
+ # 10**8 0.010000 0.000000 0.010000 ( 0.004636)