cumo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +27 -0
  3. data/.travis.yml +5 -0
  4. data/3rd_party/mkmf-cu/.gitignore +36 -0
  5. data/3rd_party/mkmf-cu/Gemfile +3 -0
  6. data/3rd_party/mkmf-cu/LICENSE +21 -0
  7. data/3rd_party/mkmf-cu/README.md +36 -0
  8. data/3rd_party/mkmf-cu/Rakefile +11 -0
  9. data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
  10. data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
  11. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
  12. data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
  13. data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
  14. data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
  15. data/CODE_OF_CONDUCT.md +46 -0
  16. data/Gemfile +8 -0
  17. data/LICENSE.txt +82 -0
  18. data/README.md +252 -0
  19. data/Rakefile +43 -0
  20. data/bench/broadcast_fp32.rb +138 -0
  21. data/bench/cumo_bench.rb +193 -0
  22. data/bench/numo_bench.rb +138 -0
  23. data/bench/reduction_fp32.rb +117 -0
  24. data/bin/console +14 -0
  25. data/bin/setup +8 -0
  26. data/cumo.gemspec +32 -0
  27. data/ext/cumo/cuda/cublas.c +278 -0
  28. data/ext/cumo/cuda/driver.c +421 -0
  29. data/ext/cumo/cuda/memory_pool.cpp +185 -0
  30. data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
  31. data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
  32. data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
  33. data/ext/cumo/cuda/nvrtc.c +207 -0
  34. data/ext/cumo/cuda/runtime.c +167 -0
  35. data/ext/cumo/cumo.c +148 -0
  36. data/ext/cumo/depend.erb +58 -0
  37. data/ext/cumo/extconf.rb +179 -0
  38. data/ext/cumo/include/cumo.h +25 -0
  39. data/ext/cumo/include/cumo/compat.h +23 -0
  40. data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
  41. data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
  42. data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
  43. data/ext/cumo/include/cumo/cuda/driver.h +22 -0
  44. data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
  45. data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
  46. data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
  47. data/ext/cumo/include/cumo/indexer.h +238 -0
  48. data/ext/cumo/include/cumo/intern.h +142 -0
  49. data/ext/cumo/include/cumo/intern_fwd.h +38 -0
  50. data/ext/cumo/include/cumo/intern_kernel.h +6 -0
  51. data/ext/cumo/include/cumo/narray.h +429 -0
  52. data/ext/cumo/include/cumo/narray_kernel.h +149 -0
  53. data/ext/cumo/include/cumo/ndloop.h +95 -0
  54. data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
  55. data/ext/cumo/include/cumo/template.h +158 -0
  56. data/ext/cumo/include/cumo/template_kernel.h +77 -0
  57. data/ext/cumo/include/cumo/types/bit.h +40 -0
  58. data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
  59. data/ext/cumo/include/cumo/types/complex.h +402 -0
  60. data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
  61. data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
  62. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
  63. data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
  64. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
  65. data/ext/cumo/include/cumo/types/dfloat.h +47 -0
  66. data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
  67. data/ext/cumo/include/cumo/types/float_def.h +34 -0
  68. data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
  69. data/ext/cumo/include/cumo/types/float_macro.h +191 -0
  70. data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
  71. data/ext/cumo/include/cumo/types/int16.h +24 -0
  72. data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
  73. data/ext/cumo/include/cumo/types/int32.h +24 -0
  74. data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
  75. data/ext/cumo/include/cumo/types/int64.h +24 -0
  76. data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
  77. data/ext/cumo/include/cumo/types/int8.h +24 -0
  78. data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
  79. data/ext/cumo/include/cumo/types/int_macro.h +67 -0
  80. data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
  81. data/ext/cumo/include/cumo/types/real_accum.h +486 -0
  82. data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
  83. data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
  84. data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
  85. data/ext/cumo/include/cumo/types/robject.h +27 -0
  86. data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
  87. data/ext/cumo/include/cumo/types/scomplex.h +46 -0
  88. data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
  89. data/ext/cumo/include/cumo/types/sfloat.h +48 -0
  90. data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
  91. data/ext/cumo/include/cumo/types/uint16.h +25 -0
  92. data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
  93. data/ext/cumo/include/cumo/types/uint32.h +25 -0
  94. data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
  95. data/ext/cumo/include/cumo/types/uint64.h +25 -0
  96. data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
  97. data/ext/cumo/include/cumo/types/uint8.h +25 -0
  98. data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
  99. data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
  100. data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
  101. data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
  102. data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
  103. data/ext/cumo/narray/SFMT-params.h +97 -0
  104. data/ext/cumo/narray/SFMT-params19937.h +46 -0
  105. data/ext/cumo/narray/SFMT.c +620 -0
  106. data/ext/cumo/narray/SFMT.h +167 -0
  107. data/ext/cumo/narray/array.c +638 -0
  108. data/ext/cumo/narray/data.c +961 -0
  109. data/ext/cumo/narray/gen/cogen.rb +56 -0
  110. data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
  111. data/ext/cumo/narray/gen/def/bit.rb +37 -0
  112. data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
  113. data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
  114. data/ext/cumo/narray/gen/def/int16.rb +36 -0
  115. data/ext/cumo/narray/gen/def/int32.rb +36 -0
  116. data/ext/cumo/narray/gen/def/int64.rb +36 -0
  117. data/ext/cumo/narray/gen/def/int8.rb +36 -0
  118. data/ext/cumo/narray/gen/def/robject.rb +37 -0
  119. data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
  120. data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
  121. data/ext/cumo/narray/gen/def/uint16.rb +36 -0
  122. data/ext/cumo/narray/gen/def/uint32.rb +36 -0
  123. data/ext/cumo/narray/gen/def/uint64.rb +36 -0
  124. data/ext/cumo/narray/gen/def/uint8.rb +36 -0
  125. data/ext/cumo/narray/gen/erbpp2.rb +346 -0
  126. data/ext/cumo/narray/gen/narray_def.rb +268 -0
  127. data/ext/cumo/narray/gen/spec.rb +425 -0
  128. data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
  129. data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
  130. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
  131. data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
  132. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
  133. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
  134. data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
  135. data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
  136. data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
  137. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
  138. data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
  139. data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
  140. data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
  141. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
  142. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
  143. data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
  144. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
  145. data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
  146. data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
  147. data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
  148. data/ext/cumo/narray/gen/tmpl/class.c +9 -0
  149. data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
  150. data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
  151. data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
  152. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
  153. data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
  154. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
  155. data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
  156. data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
  157. data/ext/cumo/narray/gen/tmpl/each.c +47 -0
  158. data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
  159. data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
  160. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
  161. data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
  162. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
  163. data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
  164. data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
  165. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
  166. data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
  167. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
  168. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
  169. data/ext/cumo/narray/gen/tmpl/format.c +62 -0
  170. data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
  171. data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
  172. data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
  173. data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
  174. data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
  175. data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
  176. data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
  177. data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
  178. data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
  179. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
  180. data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
  181. data/ext/cumo/narray/gen/tmpl/median.c +66 -0
  182. data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
  183. data/ext/cumo/narray/gen/tmpl/module.c +9 -0
  184. data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
  185. data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
  186. data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
  187. data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
  188. data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
  189. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
  190. data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
  191. data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
  192. data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
  193. data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
  194. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
  195. data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
  196. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
  197. data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
  198. data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
  199. data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
  200. data/ext/cumo/narray/gen/tmpl/store.c +41 -0
  201. data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
  202. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
  203. data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
  204. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
  205. data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
  206. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
  207. data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
  208. data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
  209. data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
  210. data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
  211. data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
  212. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
  213. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
  214. data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
  215. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
  216. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
  217. data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
  218. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
  219. data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
  220. data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
  221. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
  222. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
  223. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
  224. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
  225. data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
  226. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
  227. data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
  228. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
  229. data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
  230. data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
  231. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
  232. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
  233. data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
  234. data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
  235. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
  236. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
  237. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
  238. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
  239. data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
  240. data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
  241. data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
  242. data/ext/cumo/narray/index.c +880 -0
  243. data/ext/cumo/narray/kwargs.c +153 -0
  244. data/ext/cumo/narray/math.c +142 -0
  245. data/ext/cumo/narray/narray.c +1948 -0
  246. data/ext/cumo/narray/ndloop.c +2105 -0
  247. data/ext/cumo/narray/rand.c +45 -0
  248. data/ext/cumo/narray/step.c +474 -0
  249. data/ext/cumo/narray/struct.c +886 -0
  250. data/lib/cumo.rb +3 -0
  251. data/lib/cumo/cuda.rb +11 -0
  252. data/lib/cumo/cuda/compile_error.rb +36 -0
  253. data/lib/cumo/cuda/compiler.rb +161 -0
  254. data/lib/cumo/cuda/device.rb +47 -0
  255. data/lib/cumo/cuda/link_state.rb +31 -0
  256. data/lib/cumo/cuda/module.rb +40 -0
  257. data/lib/cumo/cuda/nvrtc_program.rb +27 -0
  258. data/lib/cumo/linalg.rb +12 -0
  259. data/lib/cumo/narray.rb +2 -0
  260. data/lib/cumo/narray/extra.rb +1278 -0
  261. data/lib/erbpp.rb +294 -0
  262. data/lib/erbpp/line_number.rb +137 -0
  263. data/lib/erbpp/narray_def.rb +381 -0
  264. data/numo-narray-version +1 -0
  265. data/run.gdb +7 -0
  266. metadata +353 -0
@@ -0,0 +1,117 @@
# Micro-benchmark of single-precision reductions (sum / max) on a 500x500
# Cumo::SFloat array, with and without the axis: and keepdims: options.
require 'benchmark'
require 'cumo/narray'

repeat_count = 100

# Each entry pairs the label printed by Benchmark with the reduction to time.
reductions = [
  ["x.sum",                          ->(x) { x.sum }],
  ["x.sum(axis: 0)",                 ->(x) { x.sum(axis: 0) }],
  ["x.sum(axis: 1)",                 ->(x) { x.sum(axis: 1) }],
  ["x.sum(keepdims: true)",          ->(x) { x.sum(keepdims: true) }],
  ["x.sum(axis: 0, keepdims: true)", ->(x) { x.sum(axis: 0, keepdims: true) }],
  ["x.sum(axis: 1, keepdims: true)", ->(x) { x.sum(axis: 1, keepdims: true) }],
  ["x.max",                          ->(x) { x.max }],
  ["x.max(axis: 0)",                 ->(x) { x.max(axis: 0) }],
  ["x.max(axis: 1)",                 ->(x) { x.max(axis: 1) }],
  ["x.max(keepdims: true)",          ->(x) { x.max(keepdims: true) }],
  ["x.max(axis: 0, keepdims: true)", ->(x) { x.max(axis: 0, keepdims: true) }],
  ["x.max(axis: 1, keepdims: true)", ->(x) { x.max(axis: 1, keepdims: true) }],
]

# Synchronize once before any timing starts.
Cumo::CUDA::Runtime.cudaDeviceSynchronize

Benchmark.bm(30) do |bm|
  reductions.each do |label, reduction|
    # A fresh input array is allocated for every case, outside the timed block.
    x = Cumo::SFloat.ones([500, 500])
    bm.report(label) do
      repeat_count.times { reduction.call(x) }
      # Synchronize inside the timed block so the measurement includes it.
      Cumo::CUDA::Runtime.cudaDeviceSynchronize
    end
  end
end

# Sample output:
#
#                                      user     system      total        real
# x.sum                            0.170000   0.020000   0.190000 (  0.188164)
# x.sum(axis: 0)                   0.070000   0.010000   0.080000 (  0.081719)
# x.sum(axis: 1)                   0.060000   0.020000   0.080000 (  0.080435)
# x.sum(keepdims: true)            0.120000   0.040000   0.160000 (  0.153970)
# x.sum(axis: 0, keepdims: true)   0.070000   0.010000   0.080000 (  0.083349)
# x.sum(axis: 1, keepdims: true)   0.080000   0.010000   0.090000 (  0.083299)
# x.max                            0.140000   0.020000   0.160000 (  0.158882)
# x.max(axis: 0)                   0.080000   0.000000   0.080000 (  0.081502)
# x.max(axis: 1)                   0.080000   0.000000   0.080000 (  0.080473)
# x.max(keepdims: true)            0.140000   0.020000   0.160000 (  0.159530)
# x.max(axis: 0, keepdims: true)   0.070000   0.020000   0.090000 (  0.083434)
# x.max(axis: 1, keepdims: true)   0.080000   0.000000   0.080000 (  0.083299)
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

# Opens an IRB session with the cumo gem already loaded.
require "bundler/setup"
require "cumo"
require "irb"

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console may be used instead, for example:
#
#   require "pry"   # (remember to add pry to the Gemfile)
#   Pry.start

IRB.start(__FILE__)
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Development bootstrap script: installs the gem's dependencies with Bundler.

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -euo pipefail
# Split words on newline and tab only, not on spaces.
IFS=$'\n\t'
# From here on, echo each input line and each expanded command as it runs.
set -vx

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,32 @@
# coding: utf-8
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

# The gem version is read from the CUMO_VERSION string in the C header, and
# the numo-narray requirement from the "numo-narray-version" file, both
# located relative to this gemspec.
version_header      = File.join(__dir__, "ext/cumo/include/cumo.h")
cumo_version        = File.read(version_header).match(/CUMO_VERSION "([\d.]+)"/)[1]
numo_narray_version = File.read(File.join(__dir__, "numo-narray-version")).strip

Gem::Specification.new do |spec|
  spec.name    = "cumo"
  spec.version = cumo_version
  spec.authors = ["Naotoshi Seo"]
  spec.email   = ["sonots@gmail.com"]

  spec.summary     = %q{Cumo is CUDA aware numerical library whose interface is highly compatible with Ruby Numo}
  spec.description = %q{Cumo is CUDA aware numerical library whose interface is highly compatible with Ruby Numo.}
  spec.homepage    = "https://github.com/sonots/cumo"
  spec.license     = "BSD-3-Clause"

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject do |path|
    path.match(%r{^(test|spec|features)/})
  end
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
  spec.extensions    = ["ext/cumo/extconf.rb"]

  spec.add_runtime_dependency "numo-narray", numo_narray_version

  spec.add_development_dependency "bundler", "~> 1.15"
  spec.add_development_dependency "rake", "~> 10.0"
end
@@ -0,0 +1,278 @@
1
+ #include "cumo/cuda/cublas.h"
2
+
3
+ #include <assert.h>
4
+ #include <ruby.h>
5
+ #include "cumo/narray.h"
6
+ #include "cumo/template.h"
7
+
8
+ //static void *blas_handle = 0;
9
+ //static char *blas_prefix = 0;
10
+
11
+ VALUE
12
+ cumo_cublas_option_value(VALUE value, VALUE default_value)
13
+ {
14
+ switch(TYPE(value)) {
15
+ case T_NIL:
16
+ case T_UNDEF:
17
+ return default_value;
18
+ }
19
+ return value;
20
+ }
21
+
22
+ //enum CBLAS_ORDER
23
+ //cumo_cublas_option_order(VALUE order)
24
+ //{
25
+ // int opt;
26
+ // char *ptr;
27
+ //
28
+ // switch(TYPE(order)) {
29
+ // case T_NIL:
30
+ // case T_UNDEF:
31
+ // case T_FALSE:
32
+ // return CblasRowMajor;
33
+ // case T_TRUE:
34
+ // return CblasColMajor;
35
+ // case T_FIXNUM:
36
+ // opt = FIX2INT(order);
37
+ // if (opt >= CblasRowMajor && opt <= CblasColMajor) {
38
+ // return opt;
39
+ // }
40
+ // break;
41
+ // case T_SYMBOL:
42
+ // order = rb_sym2str(order);
43
+ // case T_STRING:
44
+ // ptr = RSTRING_PTR(order);
45
+ // if (RSTRING_LEN(order) > 0) {
46
+ // switch(ptr[0]){
47
+ // case 'R': case 'r':
48
+ // return CblasRowMajor;
49
+ // case 'C': case 'c':
50
+ // return CblasColMajor;
51
+ // }
52
+ // }
53
+ // break;
54
+ // }
55
+ // rb_raise(rb_eArgError,"invalid value for CBLAS_ORDER");
56
+ // return 0;
57
+ //}
58
+
59
+ cublasOperation_t
60
+ cumo_cublas_option_trans(VALUE trans)
61
+ {
62
+ int opt;
63
+ char *ptr;
64
+
65
+ switch(TYPE(trans)) {
66
+ case T_NIL:
67
+ case T_UNDEF:
68
+ case T_FALSE:
69
+ return CUBLAS_OP_N;
70
+ case T_TRUE:
71
+ return CUBLAS_OP_T;
72
+ case T_FIXNUM:
73
+ opt = FIX2INT(trans);
74
+ if (opt >= CUBLAS_OP_N && opt <= CUBLAS_OP_C) {
75
+ return opt;
76
+ }
77
+ break;
78
+ case T_SYMBOL:
79
+ trans = rb_sym2str(trans);
80
+ case T_STRING:
81
+ ptr = RSTRING_PTR(trans);
82
+ if (RSTRING_LEN(trans) > 0) {
83
+ switch(ptr[0]){
84
+ case 'N': case 'n':
85
+ return CUBLAS_OP_N;
86
+ case 'T': case 't':
87
+ return CUBLAS_OP_T;
88
+ case 'C': case 'c':
89
+ return CUBLAS_OP_C;
90
+ }
91
+ }
92
+ break;
93
+ }
94
+ rb_raise(rb_eArgError, "invalid value for cublasOperation_t");
95
+ return 0;
96
+ }
97
+
98
+ cublasFillMode_t
99
+ cumo_cublas_option_uplo(VALUE uplo)
100
+ {
101
+ int opt;
102
+ char *ptr;
103
+
104
+ switch(TYPE(uplo)) {
105
+ case T_NIL:
106
+ case T_UNDEF:
107
+ case T_FALSE:
108
+ return CUBLAS_FILL_MODE_UPPER;
109
+ case T_TRUE:
110
+ return CUBLAS_FILL_MODE_LOWER;
111
+ case T_FIXNUM:
112
+ opt = FIX2INT(uplo);
113
+ switch(opt){
114
+ case CUBLAS_FILL_MODE_UPPER:
115
+ case CUBLAS_FILL_MODE_LOWER:
116
+ return opt;
117
+ }
118
+ break;
119
+ case T_SYMBOL:
120
+ uplo = rb_sym2str(uplo);
121
+ case T_STRING:
122
+ ptr = RSTRING_PTR(uplo);
123
+ if (RSTRING_LEN(uplo) > 0) {
124
+ switch(ptr[0]){
125
+ case 'U': case 'u':
126
+ return CUBLAS_FILL_MODE_UPPER;
127
+ case 'L': case 'l':
128
+ return CUBLAS_FILL_MODE_LOWER;
129
+ }
130
+ }
131
+ break;
132
+ }
133
+ rb_raise(rb_eArgError, "invalid value for cublasFillMode_t");
134
+ return 0;
135
+ }
136
+
137
+ cublasDiagType_t
138
+ cumo_cublas_option_diag(VALUE diag)
139
+ {
140
+ int opt;
141
+ char *ptr;
142
+
143
+ switch(TYPE(diag)) {
144
+ case T_NIL:
145
+ case T_UNDEF:
146
+ case T_FALSE:
147
+ return CUBLAS_DIAG_NON_UNIT;
148
+ case T_TRUE:
149
+ return CUBLAS_DIAG_UNIT;
150
+ case T_FIXNUM:
151
+ opt = FIX2INT(diag);
152
+ switch(opt){
153
+ case CUBLAS_DIAG_NON_UNIT:
154
+ case CUBLAS_DIAG_UNIT:
155
+ return opt;
156
+ }
157
+ break;
158
+ case T_SYMBOL:
159
+ diag = rb_sym2str(diag);
160
+ case T_STRING:
161
+ ptr = RSTRING_PTR(diag);
162
+ if (RSTRING_LEN(diag) > 0) {
163
+ switch(ptr[0]){
164
+ case 'N': case 'n':
165
+ return CUBLAS_DIAG_NON_UNIT;
166
+ case 'U': case 'u':
167
+ return CUBLAS_DIAG_UNIT;
168
+ }
169
+ }
170
+ break;
171
+ }
172
+ rb_raise(rb_eArgError, "invalid value for cublasDiagType_t");
173
+ return 0;
174
+ }
175
+
176
+ cublasSideMode_t
177
+ cumo_cublas_option_side(VALUE side)
178
+ {
179
+ int opt;
180
+ char *ptr;
181
+
182
+ switch(TYPE(side)) {
183
+ case T_NIL:
184
+ case T_UNDEF:
185
+ case T_FALSE:
186
+ return CUBLAS_SIDE_LEFT;
187
+ case T_TRUE:
188
+ return CUBLAS_SIDE_RIGHT;
189
+ case T_FIXNUM:
190
+ opt = FIX2INT(side);
191
+ switch(opt){
192
+ case CUBLAS_SIDE_LEFT:
193
+ case CUBLAS_SIDE_RIGHT:
194
+ return opt;
195
+ }
196
+ break;
197
+ case T_SYMBOL:
198
+ side = rb_sym2str(side);
199
+ case T_STRING:
200
+ ptr = RSTRING_PTR(side);
201
+ if (RSTRING_LEN(side) > 0) {
202
+ switch(ptr[0]){
203
+ case 'L': case 'l':
204
+ return CUBLAS_SIDE_LEFT;
205
+ case 'R': case 'r':
206
+ return CUBLAS_SIDE_RIGHT;
207
+ }
208
+ }
209
+ break;
210
+ }
211
+ rb_raise(rb_eArgError, "invalid value for cublasSideMode_t");
212
+ return 0;
213
+ }
214
+
215
+ //void
216
+ //cumo_cublas_check_func(void **func, const char *name)
217
+ //{
218
+ // char *s, *error;
219
+ //
220
+ // if (*func==0) {
221
+ // if (blas_handle==0) {
222
+ // rb_raise(rb_eRuntimeError,"BLAS library is not loaded");
223
+ // }
224
+ // if (blas_prefix==0) {
225
+ // rb_raise(rb_eRuntimeError,"CBLAS prefix is not set");
226
+ // }
227
+ // s = alloca(strlen(blas_prefix)+strlen(name)+1);
228
+ // strcpy(s,blas_prefix);
229
+ // strcat(s,name);
230
+ // dlerror();
231
+ // *func = dlsym(blas_handle, s);
232
+ // error = dlerror();
233
+ // if (error != NULL) {
234
+ // rb_raise(rb_eRuntimeError, "%s", error);
235
+ // }
236
+ // }
237
+ //}
238
+
239
+ //static VALUE
240
+ //blas_s_prefix_set(VALUE mod, VALUE prefix)
241
+ //{
242
+ // long len;
243
+ //
244
+ // if (TYPE(prefix) != T_STRING) {
245
+ // rb_raise(rb_eTypeError,"argument must be string");
246
+ // }
247
+ // if (blas_prefix) {
248
+ // free(blas_prefix);
249
+ // }
250
+ // len = RSTRING_LEN(prefix);
251
+ // blas_prefix = malloc(len+1);
252
+ // strcpy(blas_prefix, StringValueCStr(prefix));
253
+ // return prefix;
254
+ //}
255
+
256
+ //void
257
+ //Init_blas(void)
258
+ //{
259
+ // VALUE mN;
260
+ //
261
+ // mN = rb_define_module("Numo");
262
+ // /*
263
+ // Document-module: Numo::Linalg
264
+ // */
265
+ // mLinalg = rb_define_module_under(mN, "Linalg");
266
+ // mBlas = rb_define_module_under(mLinalg, "Blas");
267
+ //
268
+ // rb_define_module_function(mBlas, "dlopen", blas_s_dlopen, -1);
269
+ // rb_define_module_function(mBlas, "prefix=", blas_s_prefix_set, 1);
270
+ //
271
+ // blas_prefix = malloc(strlen("cublas_")+1); // default prefix
272
+ // strcpy(blas_prefix,"cublas_");
273
+ //
274
+ // Init_cumo_linalg_blas_s();
275
+ // Init_cumo_linalg_blas_d();
276
+ // Init_cumo_linalg_blas_c();
277
+ // Init_cumo_linalg_blas_z();
278
+ //}