cumo 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/3rd_party/LICENSE.txt +60 -0
  4. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
  5. data/LICENSE.txt +1 -62
  6. data/README.md +33 -29
  7. data/bench/cumo_bench.rb +47 -25
  8. data/bench/numo_bench.rb +27 -25
  9. data/docs/src-tree.md +16 -0
  10. data/ext/cumo/cuda/cublas.c +69 -219
  11. data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
  12. data/ext/cumo/cuda/runtime.c +2 -14
  13. data/ext/cumo/cumo.c +16 -16
  14. data/ext/cumo/include/cumo.h +2 -2
  15. data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
  16. data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
  17. data/ext/cumo/include/cumo/indexer.h +46 -63
  18. data/ext/cumo/include/cumo/intern.h +58 -112
  19. data/ext/cumo/include/cumo/narray.h +214 -185
  20. data/ext/cumo/include/cumo/narray_kernel.h +66 -37
  21. data/ext/cumo/include/cumo/ndloop.h +42 -42
  22. data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
  23. data/ext/cumo/include/cumo/template.h +56 -51
  24. data/ext/cumo/include/cumo/template_kernel.h +31 -31
  25. data/ext/cumo/include/cumo/types/bit.h +3 -3
  26. data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
  27. data/ext/cumo/include/cumo/types/complex.h +126 -126
  28. data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
  29. data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
  30. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
  31. data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
  32. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
  33. data/ext/cumo/include/cumo/types/int_macro.h +1 -1
  34. data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
  35. data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
  36. data/ext/cumo/include/cumo/types/scomplex.h +5 -5
  37. data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
  38. data/ext/cumo/narray/array.c +143 -143
  39. data/ext/cumo/narray/data.c +184 -184
  40. data/ext/cumo/narray/gen/cogen.rb +5 -2
  41. data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
  42. data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
  43. data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
  44. data/ext/cumo/narray/gen/erbln.rb +132 -0
  45. data/ext/cumo/narray/gen/erbpp2.rb +18 -13
  46. data/ext/cumo/narray/gen/narray_def.rb +3 -3
  47. data/ext/cumo/narray/gen/spec.rb +2 -2
  48. data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
  49. data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
  50. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
  51. data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
  52. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
  53. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
  54. data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
  55. data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
  56. data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
  57. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
  58. data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
  59. data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
  60. data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
  61. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
  62. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
  63. data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
  64. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
  65. data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
  66. data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
  67. data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
  68. data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
  69. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
  70. data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
  71. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
  72. data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
  73. data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
  74. data/ext/cumo/narray/gen/tmpl/each.c +9 -9
  75. data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
  76. data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
  77. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
  78. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
  79. data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
  80. data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
  81. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
  82. data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
  83. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
  84. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
  85. data/ext/cumo/narray/gen/tmpl/format.c +11 -11
  86. data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
  87. data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
  88. data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
  89. data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
  90. data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
  91. data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
  92. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
  93. data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
  94. data/ext/cumo/narray/gen/tmpl/median.c +10 -10
  95. data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
  96. data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
  97. data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
  98. data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
  99. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
  100. data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
  101. data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
  102. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
  103. data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
  104. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
  105. data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
  106. data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
  107. data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
  108. data/ext/cumo/narray/gen/tmpl/store.c +6 -6
  109. data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
  110. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
  111. data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
  112. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
  113. data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
  114. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
  115. data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
  116. data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
  117. data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
  118. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
  119. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
  120. data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
  121. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
  122. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
  123. data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
  124. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
  125. data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
  126. data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
  127. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
  128. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
  129. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
  130. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
  131. data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
  132. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
  133. data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
  134. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
  135. data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
  136. data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
  137. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
  138. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
  139. data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
  140. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
  141. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
  142. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
  143. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
  144. data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
  145. data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
  146. data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
  147. data/ext/cumo/narray/index.c +213 -213
  148. data/ext/cumo/narray/math.c +27 -27
  149. data/ext/cumo/narray/narray.c +484 -484
  150. data/ext/cumo/narray/ndloop.c +259 -258
  151. data/ext/cumo/narray/rand.c +3 -3
  152. data/ext/cumo/narray/step.c +70 -70
  153. data/ext/cumo/narray/struct.c +139 -139
  154. metadata +6 -7
  155. data/ext/cumo/include/cumo/intern_fwd.h +0 -38
  156. data/lib/erbpp.rb +0 -294
  157. data/lib/erbpp/line_number.rb +0 -137
  158. data/lib/erbpp/narray_def.rb +0 -381
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 6898598e4614bd912dfa4e83e31695d7fc5eb256
4
- data.tar.gz: 752d37294a51016c0256bcdab6d324ebc975bdac
2
+ SHA256:
3
+ metadata.gz: 621f319a4a92862c60267ace14b39968fb7e5c203da94bb9d1f01b86412b77b3
4
+ data.tar.gz: db4964d257fcbe79c6a6f4b8e84ac71f7657c87168d43b7c205d9de5059b02e3
5
5
  SHA512:
6
- metadata.gz: 81279774f60ac5f38a23751f52a42ae01022d9a5c33cf24d905302def0bc75e20237ad76bb986f4bdd43a184eca14ff3d27da46095189c29217d8fd894dad9fc
7
- data.tar.gz: 609ff0cbf6c3a28fc4fafde14f90adfc82ff7efcf268071a73605d61615feb21770e5bddfc8252a042657cbb2bc179f839030e4b85b1f4c11475ed9b27e92ef5
6
+ metadata.gz: a703d9f146c556af7ef60c869dd44989fdd67d8c2b45f99d337b1cc5dd606a1eace434b46d0b6a7e4a404930351565f2ca3e93f116886a5b915f285a8b1b85d0
7
+ data.tar.gz: 9fc0c1c794aeb12f9f5a3cf7866bec32439b09083c3b34869301ac5dc3c602845ec3c51e7caa7bf5022d375e448db5c4c6d3f16f77660645b07fdd5dc2bf29c1
data/.gitignore CHANGED
@@ -25,3 +25,4 @@ t.rb
25
25
  *.exe
26
26
  .DS_Store
27
27
  /vendor/
28
+ .kernel_cache/
@@ -0,0 +1,60 @@
1
+ ######################################################################
2
+ # The Cumo is a fork of Numo NArray v0.9.0.9.
3
+ # Cumo's source code and documents contain the original Ruby/Numo ones.
4
+ ######################################################################
5
+ BSD 3-Clause License
6
+
7
+ Copyright (c) 1999-2017, Masahiro TANAKA
8
+ All rights reserved.
9
+
10
+ Redistribution and use in source and binary forms, with or without
11
+ modification, are permitted provided that the following conditions are met:
12
+
13
+ * Redistributions of source code must retain the above copyright notice, this
14
+ list of conditions and the following disclaimer.
15
+
16
+ * Redistributions in binary form must reproduce the above copyright notice,
17
+ this list of conditions and the following disclaimer in the documentation
18
+ and/or other materials provided with the distribution.
19
+
20
+ * Neither the name of the copyright holder nor the names of its
21
+ contributors may be used to endorse or promote products derived from
22
+ this software without specific prior written permission.
23
+
24
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
28
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
+
35
+ ######################################################################
36
+ # The Cumo refers some implementations of CuPy such as memory pool.
37
+ # Cumo's source code and documents contain the original CuPy ones.
38
+ ######################################################################
39
+ MIT License
40
+
41
+ Copyright (c) 2015 Preferred Infrastructure, Inc.
42
+ Copyright (c) 2015 Preferred Networks, Inc.
43
+
44
+ Permission is hereby granted, free of charge, to any person obtaining a copy
45
+ of this software and associated documentation files (the "Software"), to deal
46
+ in the Software without restriction, including without limitation the rights
47
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
48
+ copies of the Software, and to permit persons to whom the Software is
49
+ furnished to do so, subject to the following conditions:
50
+
51
+ The above copyright notice and this permission notice shall be included in
52
+ all copies or substantial portions of the Software.
53
+
54
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
55
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
56
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
57
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
59
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
60
+ THE SOFTWARE.
@@ -32,7 +32,19 @@ module MakeMakefileCuda
32
32
  # TODO(sonots): Make it possible to configure "nvcc" and additional arguments
33
33
  def nvcc_command
34
34
  s = MakeMakefileCuda::Nvcc.generate(argv)
35
- ["nvcc " << s << " -arch=sm_35"]
35
+ cmd = "nvcc " << s
36
+ if ENV['CUMO_NVCC_GENERATE_CODE']
37
+ cmd << " --generate-code=#{ENV['CUMO_NVCC_GENERATE_CODE']}"
38
+ elsif ENV['DEBUG']
39
+ cmd << " -arch=sm_35"
40
+ else
41
+ cmd << " --generate-code=arch=compute_35,code=sm_35"
42
+ cmd << " --generate-code=arch=compute_50,code=sm_50"
43
+ cmd << " --generate-code=arch=compute_60,code=sm_60"
44
+ cmd << " --generate-code=arch=compute_70,code=sm_70"
45
+ cmd << " --generate-code=arch=compute_70,code=compute_70"
46
+ end
47
+ cmd
36
48
  end
37
49
 
38
50
  def c_command
@@ -1,67 +1,6 @@
1
- BSD 3-Clause License
2
-
3
- Copyright (c) 2017 Naotoshi Seo
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in
13
- all copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
- THE SOFTWARE.
22
-
23
- ######################################################################
24
- # The Cumo is a fork of Numo NArray v0.9.0.9.
25
- # Cumo's source code and documents contain the original Ruby/Numo ones.
26
- ######################################################################
27
- BSD 3-Clause License
28
-
29
- Copyright (c) 1999-2017, Masahiro TANAKA
30
- All rights reserved.
31
-
32
- Redistribution and use in source and binary forms, with or without
33
- modification, are permitted provided that the following conditions are met:
34
-
35
- * Redistributions of source code must retain the above copyright notice, this
36
- list of conditions and the following disclaimer.
37
-
38
- * Redistributions in binary form must reproduce the above copyright notice,
39
- this list of conditions and the following disclaimer in the documentation
40
- and/or other materials provided with the distribution.
41
-
42
- * Neither the name of the copyright holder nor the names of its
43
- contributors may be used to endorse or promote products derived from
44
- this software without specific prior written permission.
45
-
46
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
47
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
49
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
50
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
52
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
53
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
54
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
55
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56
-
57
- ######################################################################
58
- # The Cumo refers some implementations of CuPy such as memory pool.
59
- # Cumo's source code and documents contain the original CuPy ones.
60
- ######################################################################
61
1
  MIT License
62
2
 
63
- Copyright (c) 2015 Preferred Infrastructure, Inc.
64
- Copyright (c) 2015 Preferred Networks, Inc.
3
+ Copyright (c) 2017 Naotoshi Seo
65
4
 
66
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
67
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -5,10 +5,11 @@
5
5
  Cumo (pronounced like "koomo") is a CUDA-aware numerical library whose interface is highly compatible with [Ruby Numo](https://github.com/ruby-numo).
6
6
  This library lets you speed up programs with a GPU by replacing Numo with Cumo, with only small code changes.
7
7
 
8
+ <img src="https://raw.githubusercontent.com/sonots/cumo-logo/master/logo_transparent.png" alt="cumo logo" title="cumo logo" width="50%">
8
9
 
9
10
  ## Requirements
10
11
 
11
- * Ruby 2.0 or later
12
+ * Ruby 2.5 or later
12
13
  * NVIDIA GPU Compute Capability 6.0 (Pascal) or later
13
14
  * CUDA 9.0 or later
14
15
 
@@ -169,23 +170,6 @@ Generate docs:
169
170
  bundle exec rake docs
170
171
  ```
171
172
 
172
- ## Source code organizations
173
-
174
- * `*_kernel.{h,cuh,cu}` files are for device (CUDA kernels).
175
- * .cu files are compiled via nvcc.
176
- * .cu files define C wrapper functions to launch CUDA kernels to enable to be called from .c files.
177
- * Technically, it is not possible to use CRuby API such as `VALUE` in .cu files.
178
- * CRuby API is not callable from CUDA kernel because they do not have `__device__` modifier.
179
- * nvcc does not support `#include RUBY_EXTCONF_H`, so can not include `ruby.h`.
180
- * (RULE) It is allowed to use C++14 codes in .cu files.
181
- * Rest of `*.{h,c}` files are for host (CPU).
182
- * Call C wrapper functions defined in .cu files.
183
- * It can use CRuby API.
184
- * (RULE) It is not allowed to use C++ codes in host files.
185
-
186
- Ruby's `mkmf` (or `extconf.rb`) does not support to specify 3rd compiler such as NVCC for another files of extensions `.cu`.
187
- Therefore, cumo specify a wrapper command `bin/mkmf-cu-nvcc` as a compiler and changes its behavor depending on extensions of files to compile.
188
-
189
173
  ## Advanced Tips on Development
190
174
 
191
175
  ### ccache
@@ -201,25 +185,21 @@ ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/g++"
201
185
  ln -sf "$HOME/opt/ccache/bin/ccache" "$HOME/opt/ccache/bin/nvcc"
202
186
  ```
203
187
 
204
- ## Run tests only a specific line
188
+ ### Build in parallel
205
189
 
206
- `--location` option is available as:
190
+ Use `MAKEFLAGS` environment variable to specify `make` command options. You can build in parallel as:
207
191
 
208
192
  ```
209
- bundle exec ruby test/narray_test.rb --location 121
193
+ bundle exec env MAKEFLAGS=-j8 rake compile
210
194
  ```
211
195
 
212
- ### Compile and run tests only a specific type
213
-
214
- `DTYPE` environment variable is available as:
196
+ ### Specify nvcc --generate-code options
215
197
 
216
198
  ```
217
- bundle exec DTYPE=dfloat rake compile
199
+ bundle exec env CUMO_NVCC_GENERATE_CODE=arch=compute_60,code=sm_60 rake compile
218
200
  ```
219
201
 
220
- ```
221
- bundle exec DTYPE=dfloat ruby test/narray_test.rb
222
- ```
202
+ This is useful even during development because it makes it possible to skip the JIT compilation of PTX to cubin that would otherwise occur at runtime.
223
203
 
224
204
  ### Run tests with gdb
225
205
 
@@ -237,6 +217,25 @@ bundle exec gdb -x run.gdb --args ruby test/narray_test.rb
237
217
 
238
218
  You may put a breakpoint by calling `cumo_debug_breakpoint()` at C source codes.
239
219
 
220
+ ### Run tests only a specific line
221
+ `--location` option is available as:
222
+
223
+ ```
224
+ bundle exec ruby test/narray_test.rb --location 121
225
+ ```
226
+
227
+ ### Compile and run tests only a specific type
228
+
229
+ `DTYPE` environment variable is available as:
230
+
231
+ ```
232
+ bundle exec DTYPE=dfloat rake compile
233
+ ```
234
+
235
+ ```
236
+ bundle exec DTYPE=dfloat ruby test/narray_test.rb
237
+ ```
238
+
240
239
  ### Run program always synchronizing CPU and GPU
241
240
 
242
241
  ```
@@ -249,4 +248,9 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/sonots
249
248
 
250
249
  ## License
251
250
 
252
- [LICENSE.txt](./LICENSE.txt)
251
+ * [LICENSE.txt](./LICENSE.txt)
252
+ * [3rd_party/LICENSE.txt](./3rd_party/LICENSE.txt)
253
+
254
+ ## Related Materials
255
+
256
+ * [Fast Numerical Computing and Deep Learning in Ruby with Cumo](https://speakerdeck.com/sonots/fast-numerical-computing-and-deep-learning-in-ruby-with-cumo) - Presentation Slide at [RubyKaigi 2018](https://rubykaigi.org/2018/presentations/sonots.html#may31)
@@ -8,12 +8,14 @@ b = Cumo::Float32.new(10).seq(10,10)
8
8
  c = a + b
9
9
  c.free
10
10
 
11
- def elementwise
12
- puts 'element-wise'
11
+ def elementwise(num = nil)
12
+ num ||= NUM
13
+ puts "elementwise(#{num})"
13
14
  Benchmark.bm do |r|
14
15
  a = Cumo::Float32.new(10000).seq(1)
15
16
  b = Cumo::Float32.new(10000).seq(10,10)
16
17
  (a + b).free # warm up
18
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
17
19
  r.report('10**4') do
18
20
  NUM.times do
19
21
  (a + b).free
@@ -24,6 +26,7 @@ def elementwise
24
26
  a = Cumo::Float32.new(100000).seq(1)
25
27
  b = Cumo::Float32.new(100000).seq(10,10)
26
28
  (a + b).free # warm up
29
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
27
30
  r.report('10**5') do
28
31
  NUM.times do
29
32
  (a + b).free
@@ -34,6 +37,7 @@ def elementwise
34
37
  a = Cumo::Float32.new(1000000).seq(1)
35
38
  b = Cumo::Float32.new(1000000).seq(10,10)
36
39
  (a + b).free # warm up
40
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
37
41
  r.report('10**6') do
38
42
  NUM.times do
39
43
  (a + b).free
@@ -44,6 +48,7 @@ def elementwise
44
48
  a = Cumo::Float32.new(10000000).seq(1)
45
49
  b = Cumo::Float32.new(10000000).seq(10,10)
46
50
  (a + b).free # warm up
51
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
47
52
  r.report('10**7') do
48
53
  NUM.times do
49
54
  (a + b).free
@@ -54,6 +59,7 @@ def elementwise
54
59
  a = Cumo::Float32.new(100000000).seq(1)
55
60
  b = Cumo::Float32.new(100000000).seq(10,10)
56
61
  (a + b).free # warm up
62
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
57
63
  r.report('10**8') do
58
64
  NUM.times do
59
65
  (a + b).free
@@ -63,10 +69,13 @@ def elementwise
63
69
  end
64
70
  end
65
71
 
66
- def reduction
67
- puts 'reduction'
72
+ def reduction(num = nil)
73
+ num ||= NUM
74
+ puts "reduction(#{num})"
68
75
  Benchmark.bm do |r|
69
76
  a = Cumo::Float32.new(10000).seq(1)
77
+ (a.sum).free # warm up
78
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
70
79
  r.report('10**4') do
71
80
  NUM.times do
72
81
  (a.sum).free
@@ -75,6 +84,8 @@ def reduction
75
84
  end
76
85
 
77
86
  a = Cumo::Float32.new(100000).seq(1)
87
+ (a.sum).free # warm up
88
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
78
89
  r.report('10**5') do
79
90
  NUM.times do
80
91
  (a.sum).free
@@ -83,6 +94,8 @@ def reduction
83
94
  end
84
95
 
85
96
  a = Cumo::Float32.new(1000000).seq(1)
97
+ (a.sum).free # warm up
98
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
86
99
  r.report('10**6') do
87
100
  NUM.times do
88
101
  (a.sum).free
@@ -91,6 +104,8 @@ def reduction
91
104
  end
92
105
 
93
106
  a = Cumo::Float32.new(10000000).seq(1)
107
+ (a.sum).free # warm up
108
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
94
109
  r.report('10**7') do
95
110
  NUM.times do
96
111
  (a.sum).free
@@ -99,6 +114,8 @@ def reduction
99
114
  end
100
115
 
101
116
  a = Cumo::Float32.new(100000000).seq(1)
117
+ (a.sum).free # warm up
118
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
102
119
  r.report('10**8') do
103
120
  NUM.times do
104
121
  (a.sum).free
@@ -108,13 +125,14 @@ def reduction
108
125
  end
109
126
  end
110
127
 
111
- def dot
112
- num = 3
113
- puts 'dot'
128
+ def dot(num = nil)
129
+ num ||= 1
130
+ puts "dot(#{num})"
114
131
  Benchmark.bm do |r|
115
132
  a = Cumo::Float32.new(100,100).seq(1)
116
133
  b = Cumo::Float32.new(100,100).seq(10,10)
117
134
  a.dot(b).free # warm up
135
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
118
136
  r.report('10**4') do
119
137
  num.times do
120
138
  a.dot(b).free
@@ -125,6 +143,7 @@ def dot
125
143
  a = Cumo::Float32.new(100,1000).seq(1)
126
144
  b = Cumo::Float32.new(1000,100).seq(10,10)
127
145
  a.dot(b).free # warm up
146
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
128
147
  r.report('10**5') do
129
148
  num.times do
130
149
  a.dot(b).free
@@ -135,6 +154,7 @@ def dot
135
154
  a = Cumo::Float32.new(100,10000).seq(1)
136
155
  b = Cumo::Float32.new(10000,100).seq(10,10)
137
156
  a.dot(b).free # warm up
157
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
138
158
  r.report('10**6') do
139
159
  num.times do
140
160
  a.dot(b).free
@@ -145,6 +165,7 @@ def dot
145
165
  a = Cumo::Float32.new(100,100000).seq(1)
146
166
  b = Cumo::Float32.new(100000,100).seq(10,10)
147
167
  a.dot(b).free # warm up
168
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
148
169
  r.report('10**7') do
149
170
  num.times do
150
171
  a.dot(b).free
@@ -155,6 +176,7 @@ def dot
155
176
  a = Cumo::Float32.new(100,1000000).seq(1)
156
177
  b = Cumo::Float32.new(1000000,100).seq(10,10)
157
178
  a.dot(b).free # warm up
179
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
158
180
  r.report('10**8') do
159
181
  num.times do
160
182
  a.dot(b).free
@@ -170,24 +192,24 @@ dot
170
192
 
171
193
  # Tesla V100-SXM2...
172
194
  #
173
- # element-wise
195
+ # elementwise(100)
174
196
  # user system total real
175
- # 10**4 0.000000 0.000000 0.000000 ( 0.005769)
176
- # 10**5 0.010000 0.000000 0.010000 ( 0.006609)
177
- # 10**6 0.000000 0.010000 0.010000 ( 0.010313)
178
- # 10**7 0.040000 0.010000 0.050000 ( 0.050986)
179
- # 10**8 0.310000 0.130000 0.440000 ( 0.449699)
180
- # reduction
197
+ # 10**4 0.000000 0.000000 0.000000 ( 0.006332)
198
+ # 10**5 0.000000 0.000000 0.000000 ( 0.006280)
199
+ # 10**6 0.010000 0.000000 0.010000 ( 0.008123)
200
+ # 10**7 0.000000 0.010000 0.010000 ( 0.022176)
201
+ # 10**8 0.100000 0.050000 0.150000 ( 0.151999)
202
+ # reduction(100)
181
203
  # user system total real
182
- # 10**4 0.010000 0.000000 0.010000 ( 0.009484)
183
- # 10**5 0.020000 0.010000 0.030000 ( 0.022071)
184
- # 10**6 0.100000 0.050000 0.150000 ( 0.152070)
185
- # 10**7 1.150000 0.600000 1.750000 ( 1.754977)
186
- # 10**8 11.720000 5.750000 17.470000 ( 17.470990)
187
- # dot
204
+ # 10**4 0.010000 0.000000 0.010000 ( 0.009735)
205
+ # 10**5 0.010000 0.010000 0.020000 ( 0.022882)
206
+ # 10**6 0.110000 0.050000 0.160000 ( 0.154641)
207
+ # 10**7 1.220000 0.590000 1.810000 ( 1.805643)
208
+ # 10**8 11.840000 6.110000 17.950000 ( 17.946511)
209
+ # dot(1)
188
210
  # user system total real
189
- # 10**4 0.000000 0.000000 0.000000 ( 0.000351)
190
- # 10**5 0.000000 0.000000 0.000000 ( 0.000838)
191
- # 10**6 0.000000 0.000000 0.000000 ( 0.002702)
192
- # 10**7 0.020000 0.010000 0.030000 ( 0.024650)
193
- # 10**8 0.180000 0.060000 0.240000 ( 0.245101)
211
+ # 10**4 0.000000 0.000000 0.000000 ( 0.000206)
212
+ # 10**5 0.000000 0.000000 0.000000 ( 0.000195)
213
+ # 10**6 0.000000 0.000000 0.000000 ( 0.000239)
214
+ # 10**7 0.000000 0.000000 0.000000 ( 0.000719)
215
+ # 10**8 0.010000 0.000000 0.010000 ( 0.004636)