cumo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +27 -0
- data/.travis.yml +5 -0
- data/3rd_party/mkmf-cu/.gitignore +36 -0
- data/3rd_party/mkmf-cu/Gemfile +3 -0
- data/3rd_party/mkmf-cu/LICENSE +21 -0
- data/3rd_party/mkmf-cu/README.md +36 -0
- data/3rd_party/mkmf-cu/Rakefile +11 -0
- data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
- data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
- data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +82 -0
- data/README.md +252 -0
- data/Rakefile +43 -0
- data/bench/broadcast_fp32.rb +138 -0
- data/bench/cumo_bench.rb +193 -0
- data/bench/numo_bench.rb +138 -0
- data/bench/reduction_fp32.rb +117 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cumo.gemspec +32 -0
- data/ext/cumo/cuda/cublas.c +278 -0
- data/ext/cumo/cuda/driver.c +421 -0
- data/ext/cumo/cuda/memory_pool.cpp +185 -0
- data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
- data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
- data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
- data/ext/cumo/cuda/nvrtc.c +207 -0
- data/ext/cumo/cuda/runtime.c +167 -0
- data/ext/cumo/cumo.c +148 -0
- data/ext/cumo/depend.erb +58 -0
- data/ext/cumo/extconf.rb +179 -0
- data/ext/cumo/include/cumo.h +25 -0
- data/ext/cumo/include/cumo/compat.h +23 -0
- data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
- data/ext/cumo/include/cumo/cuda/driver.h +22 -0
- data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
- data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
- data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
- data/ext/cumo/include/cumo/indexer.h +238 -0
- data/ext/cumo/include/cumo/intern.h +142 -0
- data/ext/cumo/include/cumo/intern_fwd.h +38 -0
- data/ext/cumo/include/cumo/intern_kernel.h +6 -0
- data/ext/cumo/include/cumo/narray.h +429 -0
- data/ext/cumo/include/cumo/narray_kernel.h +149 -0
- data/ext/cumo/include/cumo/ndloop.h +95 -0
- data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
- data/ext/cumo/include/cumo/template.h +158 -0
- data/ext/cumo/include/cumo/template_kernel.h +77 -0
- data/ext/cumo/include/cumo/types/bit.h +40 -0
- data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
- data/ext/cumo/include/cumo/types/complex.h +402 -0
- data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
- data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
- data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/dfloat.h +47 -0
- data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/float_def.h +34 -0
- data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
- data/ext/cumo/include/cumo/types/float_macro.h +191 -0
- data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
- data/ext/cumo/include/cumo/types/int16.h +24 -0
- data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
- data/ext/cumo/include/cumo/types/int32.h +24 -0
- data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int64.h +24 -0
- data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int8.h +24 -0
- data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int_macro.h +67 -0
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
- data/ext/cumo/include/cumo/types/real_accum.h +486 -0
- data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
- data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
- data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
- data/ext/cumo/include/cumo/types/robject.h +27 -0
- data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
- data/ext/cumo/include/cumo/types/scomplex.h +46 -0
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/sfloat.h +48 -0
- data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/uint16.h +25 -0
- data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint32.h +25 -0
- data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint64.h +25 -0
- data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint8.h +25 -0
- data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
- data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
- data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
- data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
- data/ext/cumo/narray/SFMT-params.h +97 -0
- data/ext/cumo/narray/SFMT-params19937.h +46 -0
- data/ext/cumo/narray/SFMT.c +620 -0
- data/ext/cumo/narray/SFMT.h +167 -0
- data/ext/cumo/narray/array.c +638 -0
- data/ext/cumo/narray/data.c +961 -0
- data/ext/cumo/narray/gen/cogen.rb +56 -0
- data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
- data/ext/cumo/narray/gen/def/bit.rb +37 -0
- data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/int16.rb +36 -0
- data/ext/cumo/narray/gen/def/int32.rb +36 -0
- data/ext/cumo/narray/gen/def/int64.rb +36 -0
- data/ext/cumo/narray/gen/def/int8.rb +36 -0
- data/ext/cumo/narray/gen/def/robject.rb +37 -0
- data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/uint16.rb +36 -0
- data/ext/cumo/narray/gen/def/uint32.rb +36 -0
- data/ext/cumo/narray/gen/def/uint64.rb +36 -0
- data/ext/cumo/narray/gen/def/uint8.rb +36 -0
- data/ext/cumo/narray/gen/erbpp2.rb +346 -0
- data/ext/cumo/narray/gen/narray_def.rb +268 -0
- data/ext/cumo/narray/gen/spec.rb +425 -0
- data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
- data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
- data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
- data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
- data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
- data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
- data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
- data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
- data/ext/cumo/narray/gen/tmpl/class.c +9 -0
- data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
- data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
- data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
- data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
- data/ext/cumo/narray/gen/tmpl/each.c +47 -0
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
- data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
- data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
- data/ext/cumo/narray/gen/tmpl/format.c +62 -0
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
- data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
- data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
- data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
- data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
- data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
- data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
- data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
- data/ext/cumo/narray/gen/tmpl/median.c +66 -0
- data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
- data/ext/cumo/narray/gen/tmpl/module.c +9 -0
- data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
- data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
- data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
- data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
- data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
- data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
- data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
- data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
- data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
- data/ext/cumo/narray/gen/tmpl/store.c +41 -0
- data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
- data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
- data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
- data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
- data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
- data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
- data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
- data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
- data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
- data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
- data/ext/cumo/narray/index.c +880 -0
- data/ext/cumo/narray/kwargs.c +153 -0
- data/ext/cumo/narray/math.c +142 -0
- data/ext/cumo/narray/narray.c +1948 -0
- data/ext/cumo/narray/ndloop.c +2105 -0
- data/ext/cumo/narray/rand.c +45 -0
- data/ext/cumo/narray/step.c +474 -0
- data/ext/cumo/narray/struct.c +886 -0
- data/lib/cumo.rb +3 -0
- data/lib/cumo/cuda.rb +11 -0
- data/lib/cumo/cuda/compile_error.rb +36 -0
- data/lib/cumo/cuda/compiler.rb +161 -0
- data/lib/cumo/cuda/device.rb +47 -0
- data/lib/cumo/cuda/link_state.rb +31 -0
- data/lib/cumo/cuda/module.rb +40 -0
- data/lib/cumo/cuda/nvrtc_program.rb +27 -0
- data/lib/cumo/linalg.rb +12 -0
- data/lib/cumo/narray.rb +2 -0
- data/lib/cumo/narray/extra.rb +1278 -0
- data/lib/erbpp.rb +294 -0
- data/lib/erbpp/line_number.rb +137 -0
- data/lib/erbpp/narray_def.rb +381 -0
- data/numo-narray-version +1 -0
- data/run.gdb +7 -0
- metadata +353 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
/*
|
2
|
+
Element-wise <%=name%> of two arrays.
|
3
|
+
|
4
|
+
<% if is_float %>
|
5
|
+
@overload <%=name%>(a1, a2, nan:false)
|
6
|
+
@param [Cumo::NArray,Numeric] a1 The array to be compared.
|
7
|
+
@param [Cumo::NArray,Numeric] a2 The array to be compared.
|
8
|
+
@param [TrueClass] nan If true, apply the NaN-aware algorithm (returns NaN if one exists).
|
9
|
+
<% else %>
|
10
|
+
@overload <%=name%>(a1, a2)
|
11
|
+
@param [Cumo::NArray,Numeric] a1,a2 The arrays holding the elements to be compared.
|
12
|
+
<% end %>
|
13
|
+
@return [Cumo::<%=class_name%>]
|
14
|
+
*/
|
15
|
+
|
16
|
+
<% (is_float ? ["","_nan"] : [""]).each do |nan| %>
|
17
|
+
|
18
|
+
<% unless type_name == 'robject' %>
|
19
|
+
void cumo_<%=type_name%>_<%=name%><%=nan%>_kernel_launch(char *p1, char* p2, char* p3, ssize_t s1, ssize_t s2, ssize_t s3, size_t n);
|
20
|
+
<% end %>
|
21
|
+
|
22
|
+
/*
  ndloop iterator for element-wise <%=name%><%=nan%>.
  Loop operands 0 and 1 are read; the result is written to operand 2.
*/
static void
<%=c_iter%><%=nan%>(na_loop_t *const lp)
{
    size_t n;
    char *p1, *p2, *p3;
    ssize_t s1, s2, s3;

    INIT_COUNTER(lp, n);
    INIT_PTR(lp, 0, p1, s1);
    INIT_PTR(lp, 1, p2, s2);
    INIT_PTR(lp, 2, p3, s3);

<% if type_name == 'robject' %>
    {
        /* Ruby objects are combined one element at a time in a plain C loop. */
        size_t i;
        SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%><%=nan%>", "<%=type_name%>");
        for (i=0; i<n; i++) {
            dtype x, y, z;
            GET_DATA_STRIDE(p1,s1,dtype,x);
            GET_DATA_STRIDE(p2,s2,dtype,y);
            /* The previous content of the output slot is never used, so it
               is not loaded before being overwritten. */
            z = f_<%=name%><%=nan%>(x,y);
            SET_DATA_STRIDE(p3,s3,dtype,z);
        }
    }
<% else %>
    /* All other dtypes are handed to the CUDA launcher. */
    cumo_<%=type_name%>_<%=name%><%=nan%>_kernel_launch(p1,p2,p3,s1,s2,s3,n);
<% end %>
}
|
53
|
+
<% end %>
|
54
|
+
|
55
|
+
static VALUE
|
56
|
+
<%=c_func(-1)%>(int argc, VALUE *argv, VALUE mod)
|
57
|
+
{
|
58
|
+
VALUE a1 = Qnil;
|
59
|
+
VALUE a2 = Qnil;
|
60
|
+
ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
|
61
|
+
ndfunc_arg_out_t aout[1] = {{cT,0}};
|
62
|
+
ndfunc_t ndf = { <%=c_iter%>, STRIDE_LOOP_NIP, 2, 1, ain, aout };
|
63
|
+
|
64
|
+
<% if is_float %>
|
65
|
+
VALUE kw_hash = Qnil;
|
66
|
+
ID kw_table[1] = {id_nan};
|
67
|
+
VALUE opts[1] = {Qundef};
|
68
|
+
|
69
|
+
rb_scan_args(argc, argv, "20:", &a1, &a2, &kw_hash);
|
70
|
+
rb_get_kwargs(kw_hash, kw_table, 0, 1, opts);
|
71
|
+
if (opts[0] != Qundef) {
|
72
|
+
ndf.func = <%=c_iter%>_nan;
|
73
|
+
}
|
74
|
+
<% else %>
|
75
|
+
rb_scan_args(argc, argv, "20", &a1, &a2);
|
76
|
+
<% end %>
|
77
|
+
|
78
|
+
return na_ndloop(&ndf, 2, a1, a2);
|
79
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
<% unless type_name == 'robject' %>
|
2
|
+
<% (is_float ? ["","_nan"] : [""]).each do |nan| %>
|
3
|
+
|
4
|
+
/*
  Element-wise <%=name%><%=nan%> kernel.
  For every index i < n, combines the element at p1 + i*s1 with the element at
  p2 + i*s2 and stores the result at p3 + i*s3 (s1, s2, s3 are byte offsets
  per index).
*/
__global__ void <%="cumo_#{type_name}_#{name}#{nan}_kernel"%>(char* p1, char* p2, char* p3, ssize_t s1, ssize_t s2, ssize_t s3, uint64_t n)
{
    /* blockIdx, blockDim and gridDim hold 32-bit components. Widen before
       multiplying so neither the first index nor the step can wrap. */
    const uint64_t first = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t step = (uint64_t)blockDim.x * gridDim.x;
    for (uint64_t i = first; i < n; i += step) {
        *((dtype*)(p3+(i*s3))) = f_<%=name%><%=nan%>(*((dtype*)(p1+(i*s1))), *((dtype*)(p2+(i*s2))));
    }
}
|
10
|
+
|
11
|
+
void cumo_<%=type_name%>_<%=name%><%=nan%>_kernel_launch(char *p1, char *p2, char* p3, ssize_t s1, ssize_t s2, ssize_t s3, size_t n)
|
12
|
+
{
|
13
|
+
size_t gridDim = get_gridDim(n);
|
14
|
+
size_t blockDim = get_blockDim(n);
|
15
|
+
<%="cumo_#{type_name}_#{name}#{nan}_kernel"%><<<gridDim, blockDim>>>(p1,p2,p3,s1,s2,s3,n);
|
16
|
+
}
|
17
|
+
|
18
|
+
<% end %>
|
19
|
+
<% end %>
|
@@ -0,0 +1,22 @@
|
|
1
|
+
static VALUE
|
2
|
+
<%=c_func(0)%>_cpu(VALUE self);
|
3
|
+
|
4
|
+
/*
|
5
|
+
Returns self.
|
6
|
+
@overload extract
|
7
|
+
@return [Cumo::NArray]
|
8
|
+
--- Note that Cumo::NArray always returns NArray and does not
|
9
|
+
return a Ruby numeric object as Numo::NArray does to avoid
|
10
|
+
synchronization between CPU and GPU for performance.
|
11
|
+
|
12
|
+
Call `Cumo.enable_compatible_mode` to make this method behave
|
13
|
+
compatible with Numo, or you can use `extract_cpu` method instead.
|
14
|
+
*/
|
15
|
+
static VALUE
|
16
|
+
<%=c_func(0)%>(VALUE self)
|
17
|
+
{
|
18
|
+
if (cumo_compatible_mode_enabled_p()) {
|
19
|
+
return <%=c_func(0)%>_cpu(self);
|
20
|
+
}
|
21
|
+
return self;
|
22
|
+
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
/*
|
2
|
+
Extract an element only if self is a dimensionless NArray.
|
3
|
+
@overload extract_cpu
|
4
|
+
@return [Numeric,Cumo::NArray]
|
5
|
+
--- Extract element value as Ruby Object if self is a dimensionless NArray,
|
6
|
+
otherwise returns self.
|
7
|
+
This method is compatible with Numo NArray's `extract` method.
|
8
|
+
*/
|
9
|
+
static VALUE
|
10
|
+
<%=c_func(0)%>(VALUE self)
|
11
|
+
{
|
12
|
+
volatile VALUE v;
|
13
|
+
char *ptr;
|
14
|
+
narray_t *na;
|
15
|
+
GetNArray(self,na);
|
16
|
+
|
17
|
+
if (na->ndim==0) {
|
18
|
+
ptr = na_get_pointer_for_read(self) + na_get_offset(self);
|
19
|
+
SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
20
|
+
cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
|
21
|
+
v = m_extract(ptr);
|
22
|
+
na_release_lock(self);
|
23
|
+
return v;
|
24
|
+
}
|
25
|
+
return self;
|
26
|
+
}
|
@@ -0,0 +1,53 @@
|
|
1
|
+
/*
|
2
|
+
Convert a data value of obj (with a single element) to dtype.
|
3
|
+
*/
|
4
|
+
/*
|
5
|
+
static dtype
|
6
|
+
<%=c_func(:nodef)%>(VALUE obj)
|
7
|
+
{
|
8
|
+
narray_t *na;
|
9
|
+
dtype x;
|
10
|
+
char *ptr;
|
11
|
+
size_t pos;
|
12
|
+
VALUE r, klass;
|
13
|
+
|
14
|
+
SHOW_SYNCHRONIZE_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
15
|
+
cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
|
16
|
+
|
17
|
+
if (IsNArray(obj)) {
|
18
|
+
GetNArray(obj,na);
|
19
|
+
if (na->size != 1) {
|
20
|
+
rb_raise(nary_eShapeError,"narray size should be 1");
|
21
|
+
}
|
22
|
+
klass = CLASS_OF(obj);
|
23
|
+
ptr = na_get_pointer_for_read(obj);
|
24
|
+
pos = na_get_offset(obj);
|
25
|
+
<% find_tmpl("store").definitions.select{|x| x.class==Store}.each do |x| %>
|
26
|
+
if (<%=x.condition("klass")%>) {
|
27
|
+
<%=x.extract_data("ptr","pos","x")%>;
|
28
|
+
return x;
|
29
|
+
}
|
30
|
+
<% end %>
|
31
|
+
|
32
|
+
// coerce
|
33
|
+
r = rb_funcall(obj, rb_intern("coerce_cast"), 1, cT);
|
34
|
+
if (CLASS_OF(r)==cT) {
|
35
|
+
return <%=c_func%>(r);
|
36
|
+
}
|
37
|
+
<% if is_object %>
|
38
|
+
return obj;
|
39
|
+
<% else %>
|
40
|
+
rb_raise(nary_eCastError, "unknown conversion from %s to %s",
|
41
|
+
rb_class2name(CLASS_OF(obj)),
|
42
|
+
rb_class2name(cT));
|
43
|
+
<% end %>
|
44
|
+
}
|
45
|
+
if (TYPE(obj)==T_ARRAY) {
|
46
|
+
if (RARRAY_LEN(obj) != 1) {
|
47
|
+
rb_raise(nary_eShapeError,"array size should be 1");
|
48
|
+
}
|
49
|
+
return m_num_to_data(RARRAY_AREF(obj,0));
|
50
|
+
}
|
51
|
+
return m_num_to_data(obj);
|
52
|
+
}
|
53
|
+
*/
|
@@ -0,0 +1,105 @@
|
|
1
|
+
<% unless type_name == 'robject' %>
|
2
|
+
void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t s0, ssize_t s1, ssize_t kofs, dtype data, uint64_t n0, uint64_t n1);
|
3
|
+
<% end %>
|
4
|
+
|
5
|
+
#include <cuda_runtime.h>
|
6
|
+
|
7
|
+
static void
|
8
|
+
<%=c_iter%>(na_loop_t *const lp)
|
9
|
+
{
|
10
|
+
size_t n0, n1;
|
11
|
+
ssize_t s0, s1;
|
12
|
+
char *p0;
|
13
|
+
char *g;
|
14
|
+
ssize_t kofs;
|
15
|
+
dtype data;
|
16
|
+
|
17
|
+
g = (char*)(lp->opt_ptr);
|
18
|
+
kofs = *(ssize_t*)g;
|
19
|
+
data = *(dtype*)(g+sizeof(ssize_t));
|
20
|
+
|
21
|
+
n0 = lp->args[0].shape[0];
|
22
|
+
n1 = lp->args[0].shape[1];
|
23
|
+
s0 = lp->args[0].iter[0].step;
|
24
|
+
s1 = lp->args[0].iter[1].step;
|
25
|
+
p0 = NDL_PTR(lp,0);
|
26
|
+
|
27
|
+
<% if type_name == 'robject' %>
|
28
|
+
{
|
29
|
+
size_t i0, i1;
|
30
|
+
char *p1;
|
31
|
+
SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
32
|
+
for (i0=0; i0 < n0; i0++) {
|
33
|
+
p1 = p0;
|
34
|
+
for (i1=0; i1 < n1; i1++) {
|
35
|
+
*(dtype*)p1 = (i0+kofs==i1) ? data : m_zero;
|
36
|
+
p1 += s1;
|
37
|
+
}
|
38
|
+
p0 += s0;
|
39
|
+
}
|
40
|
+
}
|
41
|
+
<% else %>
|
42
|
+
<%="cumo_#{c_iter}_stride_kernel_launch"%>(p0,s0,s1,kofs,data,n0,n1);
|
43
|
+
<% end %>
|
44
|
+
}
|
45
|
+
|
46
|
+
/*
|
47
|
+
Eye: Set a value to diagonal components, set 0 to non-diagonal components.
|
48
|
+
@overload <%=name%>([element,offset])
|
49
|
+
@param [Numeric] element Diagonal element to be stored. Default is 1.
|
50
|
+
@param [Integer] offset Diagonal offset from the main diagonal. The
|
51
|
+
default is 0. k>0 for diagonals above the main diagonal, and k<0
|
52
|
+
for diagonals below the main diagonal.
|
53
|
+
@return [Cumo::<%=class_name%>] <%=name%> of self.
|
54
|
+
*/
|
55
|
+
static VALUE
|
56
|
+
<%=c_func(-1)%>(int argc, VALUE *argv, VALUE self)
|
57
|
+
{
|
58
|
+
ndfunc_arg_in_t ain[1] = {{OVERWRITE,2}};
|
59
|
+
ndfunc_t ndf = {<%=c_iter%>, NO_LOOP, 1,0, ain,0};
|
60
|
+
ssize_t kofs;
|
61
|
+
dtype data;
|
62
|
+
char *g;
|
63
|
+
int nd;
|
64
|
+
narray_t *na;
|
65
|
+
|
66
|
+
// check arguments
|
67
|
+
if (argc > 2) {
|
68
|
+
rb_raise(rb_eArgError,"too many arguments (%d for 0..2)",argc);
|
69
|
+
} else if (argc == 2) {
|
70
|
+
data = m_num_to_data(argv[0]);
|
71
|
+
kofs = NUM2SSIZET(argv[1]);
|
72
|
+
} else if (argc == 1) {
|
73
|
+
data = m_num_to_data(argv[0]);
|
74
|
+
kofs = 0;
|
75
|
+
} else {
|
76
|
+
data = m_one;
|
77
|
+
kofs = 0;
|
78
|
+
}
|
79
|
+
|
80
|
+
GetNArray(self,na);
|
81
|
+
nd = na->ndim;
|
82
|
+
if (nd < 2) {
|
83
|
+
rb_raise(nary_eDimensionError,"less than 2-d array");
|
84
|
+
}
|
85
|
+
|
86
|
+
// Diagonal offset from the main diagonal.
|
87
|
+
if (kofs >= 0) {
|
88
|
+
if ((size_t)(kofs) >= na->shape[nd-1]) {
|
89
|
+
rb_raise(rb_eArgError,"invalid diagonal offset(%"SZF"d) for "
|
90
|
+
"last dimension size(%"SZF"d)",kofs,na->shape[nd-1]);
|
91
|
+
}
|
92
|
+
} else {
|
93
|
+
if ((size_t)(-kofs) >= na->shape[nd-2]) {
|
94
|
+
rb_raise(rb_eArgError,"invalid diagonal offset(%"SZF"d) for "
|
95
|
+
"last-1 dimension size(%"SZF"d)",kofs,na->shape[nd-2]);
|
96
|
+
}
|
97
|
+
}
|
98
|
+
|
99
|
+
g = ALLOCA_N(char,sizeof(ssize_t)+sizeof(dtype));
|
100
|
+
*(ssize_t*)g = kofs;
|
101
|
+
*(dtype*)(g+sizeof(ssize_t)) = data;
|
102
|
+
|
103
|
+
na_ndloop3(&ndf, g, 1, self);
|
104
|
+
return self;
|
105
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
<% unless type_name == 'robject' %>
|
2
|
+
/*
  Eye kernel. Each linear index i < n is split into i0 = i / n1 and
  i1 = i - i0*n1; the element at ptr + i0*s0 + i1*s1 receives `data` when
  i0 + kofs == i1 and m_zero otherwise. n0 is not read by the kernel.
*/
__global__ void <%="cumo_#{c_iter}_stride_kernel"%>(char*ptr, ssize_t s0, ssize_t s1, ssize_t kofs, dtype data, uint64_t n0, uint64_t n1, uint64_t n)
{
    /* blockIdx, blockDim and gridDim hold 32-bit components. Widen before
       multiplying so neither the first index nor the step can wrap. */
    const uint64_t first = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t step = (uint64_t)blockDim.x * gridDim.x;
    for (uint64_t i = first; i < n; i += step) {
        const uint64_t i0 = i / n1;
        const uint64_t i1 = i - (i0 * n1);
        *(dtype*)(ptr + (i0*s0) + (i1*s1)) = (i0+kofs==i1) ? data : m_zero;
    }
}
|
10
|
+
|
11
|
+
/*
  Launches the eye kernel over all n0 * n1 elements, using the grid and block
  sizes that get_gridDim / get_blockDim report for that count.
*/
void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t s0, ssize_t s1, ssize_t kofs, dtype data, uint64_t n0, uint64_t n1)
{
    const uint64_t total = n0 * n1;
    const size_t num_blocks = get_gridDim(total);
    const size_t num_threads = get_blockDim(total);
    <%="cumo_#{c_iter}_stride_kernel"%><<<num_blocks, num_threads>>>(ptr,s0,s1,kofs,data,n0,n1,total);
}
|
18
|
+
<% end %>
|
19
|
+
|
@@ -0,0 +1,52 @@
|
|
1
|
+
<% unless type_name == 'robject' %>
|
2
|
+
void <%="cumo_#{c_iter}_index_kernel_launch"%>(char *ptr, size_t *idx, dtype val, uint64_t n);
|
3
|
+
void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t step, dtype val, uint64_t n);
|
4
|
+
<% end %>
|
5
|
+
|
6
|
+
static void
|
7
|
+
<%=c_iter%>(na_loop_t *const lp)
|
8
|
+
{
|
9
|
+
size_t i;
|
10
|
+
char *p1;
|
11
|
+
ssize_t s1;
|
12
|
+
size_t *idx1;
|
13
|
+
VALUE x = lp->option;
|
14
|
+
dtype y;
|
15
|
+
INIT_COUNTER(lp, i);
|
16
|
+
INIT_PTR_IDX(lp, 0, p1, s1, idx1);
|
17
|
+
y = m_num_to_data(x);
|
18
|
+
<% if type_name == 'robject' %>
|
19
|
+
SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("<%=name%>", "<%=type_name%>");
|
20
|
+
if (idx1) {
|
21
|
+
for (; i--;) {
|
22
|
+
SET_DATA_INDEX(p1,idx1,dtype,y);
|
23
|
+
}
|
24
|
+
} else {
|
25
|
+
for (; i--;) {
|
26
|
+
SET_DATA_STRIDE(p1,s1,dtype,y);
|
27
|
+
}
|
28
|
+
}
|
29
|
+
<% else %>
|
30
|
+
if (idx1) {
|
31
|
+
<%="cumo_#{c_iter}_index_kernel_launch"%>(p1,idx1,y,i);
|
32
|
+
} else {
|
33
|
+
<%="cumo_#{c_iter}_stride_kernel_launch"%>(p1,s1,y,i);
|
34
|
+
}
|
35
|
+
<% end %>
|
36
|
+
}
|
37
|
+
|
38
|
+
/*
|
39
|
+
Fill elements with other.
|
40
|
+
@overload <%=name%> other
|
41
|
+
@param [Numeric] other
|
42
|
+
@return [Cumo::<%=class_name%>] self.
|
43
|
+
*/
|
44
|
+
static VALUE
|
45
|
+
<%=c_func(1)%>(VALUE self, VALUE val)
|
46
|
+
{
|
47
|
+
ndfunc_arg_in_t ain[2] = {{OVERWRITE,0},{sym_option}};
|
48
|
+
ndfunc_t ndf = { <%=c_iter%>, FULL_LOOP, 2, 0, ain, 0 };
|
49
|
+
|
50
|
+
na_ndloop(&ndf, 2, self, val);
|
51
|
+
return self;
|
52
|
+
}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
<% unless type_name == 'robject' %>
|
2
|
+
/*
  Fill kernel for index-addressed data: for every i < n, stores val at
  ptr + idx[i] (idx entries are byte offsets).
*/
__global__ void <%="cumo_#{c_iter}_index_kernel"%>(char *ptr, size_t *idx, dtype val, uint64_t n)
{
    /* blockIdx, blockDim and gridDim hold 32-bit components. Widen before
       multiplying so neither the first index nor the step can wrap. */
    const uint64_t first = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t step = (uint64_t)blockDim.x * gridDim.x;
    for (uint64_t i = first; i < n; i += step) {
        *(dtype*)(ptr + idx[i]) = val;
    }
}
|
8
|
+
|
9
|
+
/*
  Fill kernel for step-addressed data: for every i < n, stores val at
  ptr + i*step (step is a byte offset per index).
*/
__global__ void <%="cumo_#{c_iter}_stride_kernel"%>(char*ptr, ssize_t step, dtype val, uint64_t n)
{
    /* blockIdx, blockDim and gridDim hold 32-bit components. Widen before
       multiplying so neither the first index nor the increment can wrap. */
    const uint64_t first = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t incr = (uint64_t)blockDim.x * gridDim.x;
    for (uint64_t i = first; i < n; i += incr) {
        *(dtype*)(ptr + (i*step)) = val;
    }
}
|
15
|
+
|
16
|
+
/*
  Launches the index-addressed fill kernel for n elements, using the grid and
  block sizes that get_gridDim / get_blockDim report for n.
*/
void <%="cumo_#{c_iter}_index_kernel_launch"%>(char *ptr, size_t *idx, dtype val, uint64_t n)
{
    const size_t num_blocks = get_gridDim(n);
    const size_t num_threads = get_blockDim(n);
    <%="cumo_#{c_iter}_index_kernel"%><<<num_blocks, num_threads>>>(ptr,idx,val,n);
}
|
22
|
+
|
23
|
+
/*
  Launches the step-addressed fill kernel for n elements, using the grid and
  block sizes that get_gridDim / get_blockDim report for n.
*/
void <%="cumo_#{c_iter}_stride_kernel_launch"%>(char *ptr, ssize_t step, dtype val, uint64_t n)
{
    const size_t num_blocks = get_gridDim(n);
    const size_t num_threads = get_blockDim(n);
    <%="cumo_#{c_iter}_stride_kernel"%><<<num_blocks, num_threads>>>(ptr,step,val,n);
}
|
29
|
+
<% end %>
|
@@ -0,0 +1,106 @@
|
|
1
|
+
<% f = File.join(File.dirname(__FILE__), 'real_accum_kernel.cu'); ERB.new(File.read(f)).tap {|erb| erb.filename = f }.result(binding) %>
|
2
|
+
|
3
|
+
#if defined(__cplusplus)
|
4
|
+
#if 0
|
5
|
+
{ /* satisfy cc-mode */
|
6
|
+
#endif
|
7
|
+
} /* extern "C" { */
|
8
|
+
#endif
|
9
|
+
|
10
|
+
template<typename Iterator1>
|
11
|
+
__global__ void cumo_<%=type_name%>_mean_kernel(Iterator1 p1_begin, Iterator1 p1_end, <%=dtype%>* p2, uint64_t n)
|
12
|
+
{
|
13
|
+
dtype init = m_zero;
|
14
|
+
*p2 = thrust::reduce(thrust::cuda::par, p1_begin, p1_end, init, thrust::plus<dtype>());
|
15
|
+
*p2 /= (dtype)n;
|
16
|
+
}
|
17
|
+
|
18
|
+
template<typename Iterator1>
|
19
|
+
__global__ void cumo_<%=type_name%>_var_kernel(Iterator1 p1_begin, Iterator1 p1_end, <%=dtype%>* p2)
|
20
|
+
{
|
21
|
+
cumo_thrust_variance_unary_op<dtype> unary_op;
|
22
|
+
cumo_thrust_variance_binary_op<dtype> binary_op;
|
23
|
+
cumo_thrust_variance_data<dtype> init = {};
|
24
|
+
cumo_thrust_variance_data<dtype> result;
|
25
|
+
result = thrust::transform_reduce(thrust::cuda::par, p1_begin, p1_end, unary_op, init, binary_op);
|
26
|
+
*p2 = result.variance();
|
27
|
+
}
|
28
|
+
|
29
|
+
template<typename Iterator1>
|
30
|
+
__global__ void cumo_<%=type_name%>_stddev_kernel(Iterator1 p1_begin, Iterator1 p1_end, <%=dtype%>* p2)
|
31
|
+
{
|
32
|
+
cumo_thrust_variance_unary_op<dtype> unary_op;
|
33
|
+
cumo_thrust_variance_binary_op<dtype> binary_op;
|
34
|
+
cumo_thrust_variance_data<dtype> init = {};
|
35
|
+
cumo_thrust_variance_data<dtype> result;
|
36
|
+
result = thrust::transform_reduce(thrust::cuda::par, p1_begin, p1_end, unary_op, init, binary_op);
|
37
|
+
*p2 = m_sqrt(result.variance());
|
38
|
+
}
|
39
|
+
|
40
|
+
template<typename Iterator1>
|
41
|
+
__global__ void cumo_<%=type_name%>_rms_kernel(Iterator1 p1_begin, Iterator1 p1_end, <%=dtype%>* p2, uint64_t n)
|
42
|
+
{
|
43
|
+
dtype init = m_zero;
|
44
|
+
dtype result;
|
45
|
+
result = thrust::transform_reduce(thrust::cuda::par, p1_begin, p1_end, cumo_thrust_square(), init, thrust::plus<dtype>());
|
46
|
+
*p2 = m_sqrt(m_div(result,n));
|
47
|
+
}
|
48
|
+
|
49
|
+
#if defined(__cplusplus)
|
50
|
+
extern "C" {
|
51
|
+
#if 0
|
52
|
+
} /* satisfy cc-mode */
|
53
|
+
#endif
|
54
|
+
#endif
|
55
|
+
|
56
|
+
/*
  Launches the mean kernel over n dtype elements that start at p1 and lie s1
  bytes apart; the result is stored at p2.
*/
void cumo_<%=type_name%>_mean_kernel_launch(uint64_t n, char *p1, ssize_t s1, char *p2)
{
    /* A negative stride addresses the elements from high to low. The result
       does not depend on the visiting order, so rebase to the
       lowest-addressed element and walk upwards instead. */
    if (s1 < 0 && n > 0) {
        p1 += (ssize_t)(n - 1) * s1;
        s1 = -s1;
    }
    /* Divide as signed values: against a bare sizeof() the stride would be
       converted to an unsigned type before the division. */
    ssize_t s1_idx = s1 / (ssize_t)sizeof(dtype);
    thrust::device_ptr<dtype> data_begin = thrust::device_pointer_cast((dtype*)p1);
    thrust::device_ptr<dtype> data_end   = thrust::device_pointer_cast(((dtype*)p1) + n * s1_idx);
    if (s1_idx == 1) {
        /* Contiguous elements: iterate over the raw device pointers. */
        cumo_<%=type_name%>_mean_kernel<<<1,1>>>(data_begin, data_end, (<%=dtype%>*)p2, n);
    } else {
        cumo_thrust_strided_range<thrust::device_vector<dtype>::iterator> range(data_begin, data_end, s1_idx);
        cumo_<%=type_name%>_mean_kernel<<<1,1>>>(range.begin(), range.end(), (<%=dtype%>*)p2, n);
    }
}
|
68
|
+
|
69
|
+
/*
  Launches the variance kernel over n dtype elements that start at p1 and lie
  s1 bytes apart; the result is stored at p2.
*/
void cumo_<%=type_name%>_var_kernel_launch(uint64_t n, char *p1, ssize_t s1, char *p2)
{
    /* A negative stride addresses the elements from high to low. The result
       does not depend on the visiting order, so rebase to the
       lowest-addressed element and walk upwards instead. */
    if (s1 < 0 && n > 0) {
        p1 += (ssize_t)(n - 1) * s1;
        s1 = -s1;
    }
    /* Divide as signed values: against a bare sizeof() the stride would be
       converted to an unsigned type before the division. */
    ssize_t s1_idx = s1 / (ssize_t)sizeof(dtype);
    thrust::device_ptr<dtype> data_begin = thrust::device_pointer_cast((dtype*)p1);
    thrust::device_ptr<dtype> data_end   = thrust::device_pointer_cast(((dtype*)p1) + n * s1_idx);
    if (s1_idx == 1) {
        /* Contiguous elements: iterate over the raw device pointers. */
        cumo_<%=type_name%>_var_kernel<<<1,1>>>(data_begin, data_end, (<%=dtype%>*)p2);
    } else {
        cumo_thrust_strided_range<thrust::device_vector<dtype>::iterator> range(data_begin, data_end, s1_idx);
        cumo_<%=type_name%>_var_kernel<<<1,1>>>(range.begin(), range.end(), (<%=dtype%>*)p2);
    }
}
|
81
|
+
|
82
|
+
/*
  Launches the standard-deviation kernel over n dtype elements that start at
  p1 and lie s1 bytes apart; the result is stored at p2.
*/
void cumo_<%=type_name%>_stddev_kernel_launch(uint64_t n, char *p1, ssize_t s1, char *p2)
{
    /* A negative stride addresses the elements from high to low. The result
       does not depend on the visiting order, so rebase to the
       lowest-addressed element and walk upwards instead. */
    if (s1 < 0 && n > 0) {
        p1 += (ssize_t)(n - 1) * s1;
        s1 = -s1;
    }
    /* Divide as signed values: against a bare sizeof() the stride would be
       converted to an unsigned type before the division. */
    ssize_t s1_idx = s1 / (ssize_t)sizeof(dtype);
    thrust::device_ptr<dtype> data_begin = thrust::device_pointer_cast((dtype*)p1);
    thrust::device_ptr<dtype> data_end   = thrust::device_pointer_cast(((dtype*)p1) + n * s1_idx);
    if (s1_idx == 1) {
        /* Contiguous elements: iterate over the raw device pointers. */
        cumo_<%=type_name%>_stddev_kernel<<<1,1>>>(data_begin, data_end, (<%=dtype%>*)p2);
    } else {
        cumo_thrust_strided_range<thrust::device_vector<dtype>::iterator> range(data_begin, data_end, s1_idx);
        cumo_<%=type_name%>_stddev_kernel<<<1,1>>>(range.begin(), range.end(), (<%=dtype%>*)p2);
    }
}
|
94
|
+
|
95
|
+
/*
  Launches the root-mean-square kernel over n dtype elements that start at p1
  and lie s1 bytes apart; the result is stored at p2.
*/
void cumo_<%=type_name%>_rms_kernel_launch(uint64_t n, char *p1, ssize_t s1, char *p2)
{
    /* A negative stride addresses the elements from high to low. The result
       does not depend on the visiting order, so rebase to the
       lowest-addressed element and walk upwards instead. */
    if (s1 < 0 && n > 0) {
        p1 += (ssize_t)(n - 1) * s1;
        s1 = -s1;
    }
    /* Divide as signed values: against a bare sizeof() the stride would be
       converted to an unsigned type before the division. */
    ssize_t s1_idx = s1 / (ssize_t)sizeof(dtype);
    thrust::device_ptr<dtype> data_begin = thrust::device_pointer_cast((dtype*)p1);
    thrust::device_ptr<dtype> data_end   = thrust::device_pointer_cast(((dtype*)p1) + n * s1_idx);
    if (s1_idx == 1) {
        /* Contiguous elements: iterate over the raw device pointers. */
        cumo_<%=type_name%>_rms_kernel<<<1,1>>>(data_begin, data_end, (<%=dtype%>*)p2, n);
    } else {
        cumo_thrust_strided_range<thrust::device_vector<dtype>::iterator> range(data_begin, data_end, s1_idx);
        cumo_<%=type_name%>_rms_kernel<<<1,1>>>(range.begin(), range.end(), (<%=dtype%>*)p2, n);
    }
}
|