cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,43 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new(:test) do |t| # Defines the :test task.
5
+ t.libs << "test" # test/ and lib/ are both added to the load path of the test run
6
+ t.libs << "lib"
7
+ t.test_files = FileList["test/**/*_test.rb"] # every *_test.rb file anywhere under test/
8
+ end
9
+
10
+ task :compile do # Configures and builds the extension in ext/cumo, then builds its C tests.
11
+ sh 'cd ext/cumo && ruby extconf.rb && make && make build-ctest' # chained with && so the first failing step aborts the rest
12
+ end
13
+
14
+ task :ctest do # Re-runs extconf.rb in ext/cumo and invokes the `run-ctest` make target.
15
+ sh 'cd ext/cumo && ruby extconf.rb && make run-ctest'
16
+ end
17
+
18
+ task :clean do
19
+ sh 'cd ext/cumo && make clean'
20
+ end
21
+
22
+ task :docs do
23
+ dir = "ext/cumo"
24
+ srcs = %w[array.c data.c index.c math.c narray.c rand.c struct.c].map{|s| File.join(dir, "narray", s)}
25
+ srcs += %w[cublas.c driver.c nvrtc.c runtime.c memory_pool.cpp].map{|s| File.join(dir, "cuda", s) }
26
+ srcs << File.join(dir, "narray", "types/*.c")
27
+ srcs << "lib/cumo/narray/extra.rb"
28
+ sh "cd ext/cumo; ruby extconf.rb; make src"
29
+ sh "rm -rf docs .yardoc; yard doc -o docs -m markdown -r README.md #{srcs.join(' ')}"
30
+ end
31
+ task :doc => :docs
32
+
33
+ task :gdb do # Starts ruby on ./test.rb under gdb, with gdb reading its commands from run.gdb.
35
+ sh "gdb -x run.gdb --args ruby -I. ./test.rb"
36
+ end
36
+
37
+ task :default => [:clobber, :compile, :test] # plain `rake` runs these prerequisites in order; :clobber is not defined in this file - presumably supplied by bundler/gem_tasks, verify
38
+
39
+ desc 'Open an irb session preloaded with the gem library'
40
+ task :console do
41
+ sh 'irb -rubygems -I lib'
42
+ end
43
+ task :c => :console
@@ -0,0 +1,138 @@
1
+ require 'benchmark'
2
+ require 'cumo/narray'
3
+
4
+ num_iteration = 1000
5
+
6
+ Benchmark.bm 20 do |r|
7
+ x = Cumo::SFloat.ones([1000,784])
8
+ y = Cumo::SFloat.ones([1000,784])
9
+ r.report "x.inplace + y" do
10
+ num_iteration.times do
11
+ x.inplace + y
12
+ end
13
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
14
+ end
15
+
16
+ x = Cumo::SFloat.ones([1000,784])
17
+ y = Cumo::SFloat.ones([1000,784])
18
+ r.report "x + y" do
19
+ num_iteration.times do
20
+ (x + y).free
21
+ end
22
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
23
+ end
24
+
25
+ x = Cumo::SFloat.ones([1000,784])
26
+ y = Cumo::SFloat.ones([1000,784])
27
+ r.report "x.inplace + 1.0" do
28
+ num_iteration.times do
29
+ x.inplace + 1.0
30
+ end
31
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
32
+ end
33
+
34
+ x = Cumo::SFloat.ones([1000,784])
35
+ z = Cumo::SFloat.ones([1000,1])
36
+ r.report "x.inplace + z" do
37
+ num_iteration.times do
38
+ x.inplace + z
39
+ end
40
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
41
+ end
42
+
43
+ x = Cumo::SFloat.ones([1000,784])
44
+ y = Cumo::SFloat.ones([1000,784])
45
+ r.report "x.inplace - y" do
46
+ num_iteration.times do
47
+ x.inplace - y
48
+ end
49
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
50
+ end
51
+
52
+ x = Cumo::SFloat.ones([1000,784])
53
+ y = Cumo::SFloat.ones([1000,784])
54
+ r.report "x.inplace - 1.0" do
55
+ num_iteration.times do
56
+ x.inplace - 1.0
57
+ end
58
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
59
+ end
60
+
61
+ x = Cumo::SFloat.ones([1000,784])
62
+ z = Cumo::SFloat.ones([1000,1])
63
+ r.report "x.inplace - z" do
64
+ num_iteration.times do
65
+ x.inplace - z
66
+ end
67
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
68
+ end
69
+
70
+ x = Cumo::SFloat.ones([1000,784])
71
+ y = Cumo::SFloat.ones([1000,784])
72
+ r.report "x.inplace * y" do
73
+ num_iteration.times do
74
+ x.inplace * y
75
+ end
76
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
77
+ end
78
+
79
+ x = Cumo::SFloat.ones([1000,784])
80
+ y = Cumo::SFloat.ones([1000,784])
81
+ r.report "x.inplace * 1.0" do
82
+ num_iteration.times do
83
+ x.inplace * 1.0
84
+ end
85
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
86
+ end
87
+
88
+ x = Cumo::SFloat.ones([1000,784])
89
+ z = Cumo::SFloat.ones([1000,1])
90
+ r.report "x.inplace * z" do
91
+ num_iteration.times do
92
+ x.inplace * z
93
+ end
94
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
95
+ end
96
+
97
+ x = Cumo::SFloat.ones([1000,784])
98
+ y = Cumo::SFloat.ones([1000,784])
99
+ r.report "x.inplace / y" do
100
+ num_iteration.times do
101
+ x.inplace / y
102
+ end
103
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
104
+ end
105
+
106
+ x = Cumo::SFloat.ones([1000,784])
107
+ y = Cumo::SFloat.ones([1000,784])
108
+ r.report "x.inplace / 1.0" do
109
+ num_iteration.times do
110
+ x.inplace / 1.0
111
+ end
112
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
113
+ end
114
+
115
+ x = Cumo::SFloat.ones([1000,784])
116
+ z = Cumo::SFloat.ones([1000,1])
117
+ r.report "x.inplace / z" do
118
+ num_iteration.times do
119
+ x.inplace / z
120
+ end
121
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
122
+ end
123
+ end
124
+
125
+ # user system total real
126
+ # x.inplace + y 0.040000 0.000000 0.040000 ( 0.039255)
127
+ # x + y 0.140000 0.520000 0.660000 ( 0.689061)
128
+ # x.inplace + 1.0 0.040000 0.000000 0.040000 ( 0.040255)
129
+ # x.inplace + z 0.090000 0.000000 0.090000 ( 0.083663)
130
+ # x.inplace - y 0.030000 0.010000 0.040000 ( 0.039017)
131
+ # x.inplace - 1.0 0.050000 0.000000 0.050000 ( 0.053187)
132
+ # x.inplace - z 0.080000 0.010000 0.090000 ( 0.085909)
133
+ # x.inplace * y 0.030000 0.010000 0.040000 ( 0.038583)
134
+ # x.inplace * 1.0 0.050000 0.010000 0.060000 ( 0.051933)
135
+ # x.inplace * z 0.080000 0.010000 0.090000 ( 0.084889)
136
+ # x.inplace / y 0.040000 0.000000 0.040000 ( 0.038958)
137
+ # x.inplace / 1.0 0.040000 0.010000 0.050000 ( 0.050266)
138
+ # x.inplace / z 0.080000 0.010000 0.090000 ( 0.086685)
@@ -0,0 +1,193 @@
1
+ require 'cumo/narray'
2
+ require 'benchmark'
3
+
4
+ NUM = (ARGV.first || 100).to_i # iteration count: first command-line argument, or 100 when none is given
5
+
6
+ a = Cumo::Float32.new(10).seq(1) # small throw-away operands; presumably a global warm-up before any timing - confirm
7
+ b = Cumo::Float32.new(10).seq(10,10)
8
+ c = a + b
9
+ c.free # explicitly release the warm-up result
10
+
11
+ def elementwise # Times `a + b` for 1-D operands of 10**4 .. 10**8 elements, NUM iterations per size.
12
+ puts 'element-wise'
13
+ Benchmark.bm do |r|
14
+ a = Cumo::Float32.new(10000).seq(1) # operands are built outside the timed block
15
+ b = Cumo::Float32.new(10000).seq(10,10)
16
+ (a + b).free # warm up
17
+ r.report('10**4') do # label = number of elements per operand
18
+ NUM.times do
19
+ (a + b).free # the result is freed right away on every iteration
20
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize # called on every iteration, inside the timed loop
21
+ end
22
+ end
23
+
24
+ a = Cumo::Float32.new(100000).seq(1)
25
+ b = Cumo::Float32.new(100000).seq(10,10)
26
+ (a + b).free # warm up
27
+ r.report('10**5') do
28
+ NUM.times do
29
+ (a + b).free
30
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
31
+ end
32
+ end
33
+
34
+ a = Cumo::Float32.new(1000000).seq(1)
35
+ b = Cumo::Float32.new(1000000).seq(10,10)
36
+ (a + b).free # warm up
37
+ r.report('10**6') do
38
+ NUM.times do
39
+ (a + b).free
40
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
41
+ end
42
+ end
43
+
44
+ a = Cumo::Float32.new(10000000).seq(1)
45
+ b = Cumo::Float32.new(10000000).seq(10,10)
46
+ (a + b).free # warm up
47
+ r.report('10**7') do
48
+ NUM.times do
49
+ (a + b).free
50
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
51
+ end
52
+ end
53
+
54
+ a = Cumo::Float32.new(100000000).seq(1)
55
+ b = Cumo::Float32.new(100000000).seq(10,10)
56
+ (a + b).free # warm up
57
+ r.report('10**8') do
58
+ NUM.times do
59
+ (a + b).free
60
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
61
+ end
62
+ end
63
+ end
64
+ end
65
+
66
+ def reduction # Times `a.sum` for 1-D operands of 10**4 .. 10**8 elements, NUM iterations per size.
67
+ puts 'reduction'
68
+ Benchmark.bm do |r|
69
+ a = Cumo::Float32.new(10000).seq(1) # NOTE(review): no warm-up call precedes the timed block here, unlike elementwise and dot - confirm this is intended
70
+ r.report('10**4') do # label = number of elements in the operand
71
+ NUM.times do
72
+ (a.sum).free # the reduction result is freed right away on every iteration
73
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize # called on every iteration, inside the timed loop
74
+ end
75
+ end
76
+
77
+ a = Cumo::Float32.new(100000).seq(1)
78
+ r.report('10**5') do
79
+ NUM.times do
80
+ (a.sum).free
81
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
82
+ end
83
+ end
84
+
85
+ a = Cumo::Float32.new(1000000).seq(1)
86
+ r.report('10**6') do
87
+ NUM.times do
88
+ (a.sum).free
89
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
90
+ end
91
+ end
92
+
93
+ a = Cumo::Float32.new(10000000).seq(1)
94
+ r.report('10**7') do
95
+ NUM.times do
96
+ (a.sum).free
97
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
98
+ end
99
+ end
100
+
101
+ a = Cumo::Float32.new(100000000).seq(1)
102
+ r.report('10**8') do
103
+ NUM.times do
104
+ (a.sum).free
105
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
106
+ end
107
+ end
108
+ end
109
+ end
110
+
111
+ def dot # Times `a.dot(b)` for operand pairs of 10**4 .. 10**8 elements each.
112
+ num = 3 # fixed repeat count; this benchmark does not use NUM
113
+ puts 'dot'
114
+ Benchmark.bm do |r|
115
+ a = Cumo::Float32.new(100,100).seq(1) # 100 * 100 = 10**4 elements per operand, matching the label
116
+ b = Cumo::Float32.new(100,100).seq(10,10)
117
+ a.dot(b).free # warm up
118
+ r.report('10**4') do
119
+ num.times do
120
+ a.dot(b).free # the product is freed right away on every iteration
121
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize # called on every iteration, inside the timed loop
122
+ end
123
+ end
124
+
125
+ a = Cumo::Float32.new(100,1000).seq(1) # from here on only the second argument of a (first of b) grows by 10x per case
126
+ b = Cumo::Float32.new(1000,100).seq(10,10)
127
+ a.dot(b).free # warm up
128
+ r.report('10**5') do
129
+ num.times do
130
+ a.dot(b).free
131
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
132
+ end
133
+ end
134
+
135
+ a = Cumo::Float32.new(100,10000).seq(1)
136
+ b = Cumo::Float32.new(10000,100).seq(10,10)
137
+ a.dot(b).free # warm up
138
+ r.report('10**6') do
139
+ num.times do
140
+ a.dot(b).free
141
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
142
+ end
143
+ end
144
+
145
+ a = Cumo::Float32.new(100,100000).seq(1)
146
+ b = Cumo::Float32.new(100000,100).seq(10,10)
147
+ a.dot(b).free # warm up
148
+ r.report('10**7') do
149
+ num.times do
150
+ a.dot(b).free
151
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
152
+ end
153
+ end
154
+
155
+ a = Cumo::Float32.new(100,1000000).seq(1)
156
+ b = Cumo::Float32.new(1000000,100).seq(10,10)
157
+ a.dot(b).free # warm up
158
+ r.report('10**8') do
159
+ num.times do
160
+ a.dot(b).free
161
+ Cumo::CUDA::Runtime.cudaDeviceSynchronize
162
+ end
163
+ end
164
+ end
165
+ end
166
+
167
+ elementwise # run the three benchmark groups in this order; each prints its own heading
168
+ reduction
169
+ dot
170
+
171
+ # Tesla V100-SXM2...
172
+ #
173
+ # element-wise
174
+ # user system total real
175
+ # 10**4 0.000000 0.000000 0.000000 ( 0.005769)
176
+ # 10**5 0.010000 0.000000 0.010000 ( 0.006609)
177
+ # 10**6 0.000000 0.010000 0.010000 ( 0.010313)
178
+ # 10**7 0.040000 0.010000 0.050000 ( 0.050986)
179
+ # 10**8 0.310000 0.130000 0.440000 ( 0.449699)
180
+ # reduction
181
+ # user system total real
182
+ # 10**4 0.010000 0.000000 0.010000 ( 0.009484)
183
+ # 10**5 0.020000 0.010000 0.030000 ( 0.022071)
184
+ # 10**6 0.100000 0.050000 0.150000 ( 0.152070)
185
+ # 10**7 1.150000 0.600000 1.750000 ( 1.754977)
186
+ # 10**8 11.720000 5.750000 17.470000 ( 17.470990)
187
+ # dot
188
+ # user system total real
189
+ # 10**4 0.000000 0.000000 0.000000 ( 0.000351)
190
+ # 10**5 0.000000 0.000000 0.000000 ( 0.000838)
191
+ # 10**6 0.000000 0.000000 0.000000 ( 0.002702)
192
+ # 10**7 0.020000 0.010000 0.030000 ( 0.024650)
193
+ # 10**8 0.180000 0.060000 0.240000 ( 0.245101)
@@ -0,0 +1,138 @@
1
+ require 'numo/narray'
2
+ require 'benchmark'
3
+
4
+ NUM = (ARGV.first || 100).to_i # iteration count: first command-line argument, or 100 when none is given
5
+
6
+ # warm up
7
+ a = Numo::Float32.new(10).seq(1)
8
+ b = Numo::Float32.new(10).seq(10,10)
9
+ c = a + b # the result is not used afterwards; only the computation itself matters here
10
+
11
+ def elementwise # Times `a + b` for 1-D operands of 10**4 .. 10**8 elements, NUM iterations per size.
12
+ puts 'element-wise'
13
+ Benchmark.bm do |r|
14
+ a = Numo::Float32.new(10000).seq(1) # operands are built outside the timed block
15
+ b = Numo::Float32.new(10000).seq(10,10)
16
+ r.report('10**4') do # label = number of elements per operand
17
+ NUM.times { (a + b) } # the result is discarded
18
+ end
19
+
20
+ a = Numo::Float32.new(100000).seq(1)
21
+ b = Numo::Float32.new(100000).seq(10,10)
22
+ r.report('10**5') do
23
+ NUM.times { (a + b) }
24
+ end
25
+
26
+ a = Numo::Float32.new(1000000).seq(1)
27
+ b = Numo::Float32.new(1000000).seq(10,10)
28
+ r.report('10**6') do
29
+ NUM.times { (a + b) }
30
+ end
31
+
32
+ a = Numo::Float32.new(10000000).seq(1)
33
+ b = Numo::Float32.new(10000000).seq(10,10)
34
+ r.report('10**7') do
35
+ NUM.times { (a + b) }
36
+ end
37
+
38
+ a = Numo::Float32.new(100000000).seq(1)
39
+ b = Numo::Float32.new(100000000).seq(10,10)
40
+ r.report('10**8') do
41
+ NUM.times { (a + b) }
42
+ end
43
+ end
44
+ end
45
+
46
+ def reduction # Times `a.sum` for 1-D operands of 10**4 .. 10**8 elements, NUM iterations per size.
47
+ puts 'reduction'
48
+ Benchmark.bm do |r|
49
+ a = Numo::Float32.new(10000).seq(1) # operand is built outside the timed block
50
+ r.report('10**4') do # label = number of elements in the operand
51
+ NUM.times { (a.sum) } # the result is discarded
52
+ end
53
+
54
+ a = Numo::Float32.new(100000).seq(1)
55
+ r.report('10**5') do
56
+ NUM.times { (a.sum) }
57
+ end
58
+
59
+ a = Numo::Float32.new(1000000).seq(1)
60
+ r.report('10**6') do
61
+ NUM.times { (a.sum) }
62
+ end
63
+
64
+ a = Numo::Float32.new(10000000).seq(1)
65
+ r.report('10**7') do
66
+ NUM.times { (a.sum) }
67
+ end
68
+
69
+ a = Numo::Float32.new(100000000).seq(1)
70
+ r.report('10**8') do
71
+ NUM.times { (a.sum) }
72
+ end
73
+ end
74
+ end
75
+
76
+ def dot # Times `a.dot(b)` for operand pairs of 10**4 .. 10**8 elements each.
77
+ num = 3 # fixed repeat count; this benchmark does not use NUM
78
+ puts 'dot'
79
+ Benchmark.bm do |r|
80
+ a = Numo::Float32.new(100,100).seq(1) # 100 * 100 = 10**4 elements per operand, matching the label
81
+ b = Numo::Float32.new(100,100).seq(10,10)
82
+ r.report('10**4') do
83
+ num.times { a.dot(b) } # the product is discarded
84
+ end
85
+
86
+ a = Numo::Float32.new(100,1000).seq(1) # from here on only the second argument of a (first of b) grows by 10x per case
87
+ b = Numo::Float32.new(1000,100).seq(10,10)
88
+ r.report('10**5') do
89
+ num.times { a.dot(b) }
90
+ end
91
+
92
+ a = Numo::Float32.new(100,10000).seq(1)
93
+ b = Numo::Float32.new(10000,100).seq(10,10)
94
+ r.report('10**6') do
95
+ num.times { a.dot(b) }
96
+ end
97
+
98
+ a = Numo::Float32.new(100,100000).seq(1)
99
+ b = Numo::Float32.new(100000,100).seq(10,10)
100
+ r.report('10**7') do
101
+ num.times { a.dot(b) }
102
+ end
103
+
104
+ a = Numo::Float32.new(100,1000000).seq(1)
105
+ b = Numo::Float32.new(1000000,100).seq(10,10)
106
+ r.report('10**8') do
107
+ num.times { a.dot(b) }
108
+ end
109
+ end
110
+ end
111
+
112
+ elementwise # run the three benchmark groups in this order; each prints its own heading
113
+ reduction
114
+ dot
115
+
116
+ # Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
117
+ #
118
+ # element-wise
119
+ # user system total real
120
+ # 10**4 0.010000 0.000000 0.010000 ( 0.002212)
121
+ # 10**5 0.000000 0.020000 0.020000 ( 0.021604)
122
+ # 10**6 0.060000 0.060000 0.120000 ( 0.120241)
123
+ # 10**7 0.980000 0.890000 1.870000 ( 1.874592)
124
+ # 10**8 9.530000 8.520000 18.050000 ( 18.054087)
125
+ # reduction
126
+ # user system total real
127
+ # 10**4 0.000000 0.000000 0.000000 ( 0.001313)
128
+ # 10**5 0.010000 0.000000 0.010000 ( 0.011400)
129
+ # 10**6 0.110000 0.000000 0.110000 ( 0.111674)
130
+ # 10**7 1.120000 0.000000 1.120000 ( 1.127018)
131
+ # 10**8 11.770000 0.010000 11.780000 ( 11.770858)
132
+ # dot
133
+ # user system total real
134
+ # 10**4 0.000000 0.000000 0.000000 ( 0.003935)
135
+ # 10**5 0.040000 0.000000 0.040000 ( 0.037682)
136
+ # 10**6 0.380000 0.000000 0.380000 ( 0.377312)
137
+ # 10**7 3.790000 0.000000 3.790000 ( 3.792297)
138
+ # 10**8 38.820000 0.000000 38.820000 ( 38.816987)