cumo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +27 -0
- data/.travis.yml +5 -0
- data/3rd_party/mkmf-cu/.gitignore +36 -0
- data/3rd_party/mkmf-cu/Gemfile +3 -0
- data/3rd_party/mkmf-cu/LICENSE +21 -0
- data/3rd_party/mkmf-cu/README.md +36 -0
- data/3rd_party/mkmf-cu/Rakefile +11 -0
- data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
- data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
- data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +82 -0
- data/README.md +252 -0
- data/Rakefile +43 -0
- data/bench/broadcast_fp32.rb +138 -0
- data/bench/cumo_bench.rb +193 -0
- data/bench/numo_bench.rb +138 -0
- data/bench/reduction_fp32.rb +117 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cumo.gemspec +32 -0
- data/ext/cumo/cuda/cublas.c +278 -0
- data/ext/cumo/cuda/driver.c +421 -0
- data/ext/cumo/cuda/memory_pool.cpp +185 -0
- data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
- data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
- data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
- data/ext/cumo/cuda/nvrtc.c +207 -0
- data/ext/cumo/cuda/runtime.c +167 -0
- data/ext/cumo/cumo.c +148 -0
- data/ext/cumo/depend.erb +58 -0
- data/ext/cumo/extconf.rb +179 -0
- data/ext/cumo/include/cumo.h +25 -0
- data/ext/cumo/include/cumo/compat.h +23 -0
- data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
- data/ext/cumo/include/cumo/cuda/driver.h +22 -0
- data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
- data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
- data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
- data/ext/cumo/include/cumo/indexer.h +238 -0
- data/ext/cumo/include/cumo/intern.h +142 -0
- data/ext/cumo/include/cumo/intern_fwd.h +38 -0
- data/ext/cumo/include/cumo/intern_kernel.h +6 -0
- data/ext/cumo/include/cumo/narray.h +429 -0
- data/ext/cumo/include/cumo/narray_kernel.h +149 -0
- data/ext/cumo/include/cumo/ndloop.h +95 -0
- data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
- data/ext/cumo/include/cumo/template.h +158 -0
- data/ext/cumo/include/cumo/template_kernel.h +77 -0
- data/ext/cumo/include/cumo/types/bit.h +40 -0
- data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
- data/ext/cumo/include/cumo/types/complex.h +402 -0
- data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
- data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
- data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/dfloat.h +47 -0
- data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/float_def.h +34 -0
- data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
- data/ext/cumo/include/cumo/types/float_macro.h +191 -0
- data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
- data/ext/cumo/include/cumo/types/int16.h +24 -0
- data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
- data/ext/cumo/include/cumo/types/int32.h +24 -0
- data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int64.h +24 -0
- data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int8.h +24 -0
- data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int_macro.h +67 -0
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
- data/ext/cumo/include/cumo/types/real_accum.h +486 -0
- data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
- data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
- data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
- data/ext/cumo/include/cumo/types/robject.h +27 -0
- data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
- data/ext/cumo/include/cumo/types/scomplex.h +46 -0
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/sfloat.h +48 -0
- data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/uint16.h +25 -0
- data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint32.h +25 -0
- data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint64.h +25 -0
- data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint8.h +25 -0
- data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
- data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
- data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
- data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
- data/ext/cumo/narray/SFMT-params.h +97 -0
- data/ext/cumo/narray/SFMT-params19937.h +46 -0
- data/ext/cumo/narray/SFMT.c +620 -0
- data/ext/cumo/narray/SFMT.h +167 -0
- data/ext/cumo/narray/array.c +638 -0
- data/ext/cumo/narray/data.c +961 -0
- data/ext/cumo/narray/gen/cogen.rb +56 -0
- data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
- data/ext/cumo/narray/gen/def/bit.rb +37 -0
- data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/int16.rb +36 -0
- data/ext/cumo/narray/gen/def/int32.rb +36 -0
- data/ext/cumo/narray/gen/def/int64.rb +36 -0
- data/ext/cumo/narray/gen/def/int8.rb +36 -0
- data/ext/cumo/narray/gen/def/robject.rb +37 -0
- data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/uint16.rb +36 -0
- data/ext/cumo/narray/gen/def/uint32.rb +36 -0
- data/ext/cumo/narray/gen/def/uint64.rb +36 -0
- data/ext/cumo/narray/gen/def/uint8.rb +36 -0
- data/ext/cumo/narray/gen/erbpp2.rb +346 -0
- data/ext/cumo/narray/gen/narray_def.rb +268 -0
- data/ext/cumo/narray/gen/spec.rb +425 -0
- data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
- data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
- data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
- data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
- data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
- data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
- data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
- data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
- data/ext/cumo/narray/gen/tmpl/class.c +9 -0
- data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
- data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
- data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
- data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
- data/ext/cumo/narray/gen/tmpl/each.c +47 -0
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
- data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
- data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
- data/ext/cumo/narray/gen/tmpl/format.c +62 -0
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
- data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
- data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
- data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
- data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
- data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
- data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
- data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
- data/ext/cumo/narray/gen/tmpl/median.c +66 -0
- data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
- data/ext/cumo/narray/gen/tmpl/module.c +9 -0
- data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
- data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
- data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
- data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
- data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
- data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
- data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
- data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
- data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
- data/ext/cumo/narray/gen/tmpl/store.c +41 -0
- data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
- data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
- data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
- data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
- data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
- data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
- data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
- data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
- data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
- data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
- data/ext/cumo/narray/index.c +880 -0
- data/ext/cumo/narray/kwargs.c +153 -0
- data/ext/cumo/narray/math.c +142 -0
- data/ext/cumo/narray/narray.c +1948 -0
- data/ext/cumo/narray/ndloop.c +2105 -0
- data/ext/cumo/narray/rand.c +45 -0
- data/ext/cumo/narray/step.c +474 -0
- data/ext/cumo/narray/struct.c +886 -0
- data/lib/cumo.rb +3 -0
- data/lib/cumo/cuda.rb +11 -0
- data/lib/cumo/cuda/compile_error.rb +36 -0
- data/lib/cumo/cuda/compiler.rb +161 -0
- data/lib/cumo/cuda/device.rb +47 -0
- data/lib/cumo/cuda/link_state.rb +31 -0
- data/lib/cumo/cuda/module.rb +40 -0
- data/lib/cumo/cuda/nvrtc_program.rb +27 -0
- data/lib/cumo/linalg.rb +12 -0
- data/lib/cumo/narray.rb +2 -0
- data/lib/cumo/narray/extra.rb +1278 -0
- data/lib/erbpp.rb +294 -0
- data/lib/erbpp/line_number.rb +137 -0
- data/lib/erbpp/narray_def.rb +381 -0
- data/numo-narray-version +1 -0
- data/run.gdb +7 -0
- metadata +353 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#ifndef CUMO_UINT64_KERNEL_H
|
|
2
|
+
#define CUMO_UINT64_KERNEL_H
|
|
3
|
+
|
|
4
|
+
typedef u_int64_t dtype;
|
|
5
|
+
typedef u_int64_t rtype;
|
|
6
|
+
|
|
7
|
+
#ifndef UINT64_MIN
|
|
8
|
+
#define UINT64_MIN (0)
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
#ifndef UINT64_MAX
|
|
12
|
+
#define UINT64_MAX (18446744073709551615ul)
|
|
13
|
+
#endif
|
|
14
|
+
|
|
15
|
+
#define DATA_MIN UINT64_MIN
|
|
16
|
+
#define DATA_MAX UINT64_MAX
|
|
17
|
+
|
|
18
|
+
#include "uint_macro_kernel.h"
|
|
19
|
+
|
|
20
|
+
#endif // CUMO_UINT64_KERNEL_H
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
typedef u_int8_t dtype;
|
|
2
|
+
typedef u_int8_t rtype;
|
|
3
|
+
#define cT cumo_cUInt8
|
|
4
|
+
#define cRT cT
|
|
5
|
+
|
|
6
|
+
#define m_num_to_data(x) ((dtype)NUM2UINT(x))
|
|
7
|
+
#define m_data_to_num(x) UINT2NUM((unsigned int)(x))
|
|
8
|
+
#define m_extract(x) UINT2NUM((unsigned int)*(dtype*)(x))
|
|
9
|
+
#define m_sprintf(s,x) sprintf(s,"%u",(unsigned int)(x))
|
|
10
|
+
|
|
11
|
+
#ifndef UINT8_MIN
|
|
12
|
+
#define UINT8_MIN (0)
|
|
13
|
+
#endif
|
|
14
|
+
|
|
15
|
+
#ifndef UINT8_MAX
|
|
16
|
+
#define UINT8_MAX (255)
|
|
17
|
+
#endif
|
|
18
|
+
|
|
19
|
+
#define DATA_MIN UINT8_MIN
|
|
20
|
+
#define DATA_MAX UINT8_MAX
|
|
21
|
+
|
|
22
|
+
#define M_MIN INT2FIX(0)
|
|
23
|
+
#define M_MAX m_data_to_num(UINT8_MAX)
|
|
24
|
+
|
|
25
|
+
#include "uint_macro.h"
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#ifndef CUMO_UINT8_KERNEL_H
|
|
2
|
+
#define CUMO_UINT8_KERNEL_H
|
|
3
|
+
|
|
4
|
+
typedef u_int8_t dtype;
|
|
5
|
+
typedef u_int8_t rtype;
|
|
6
|
+
|
|
7
|
+
#ifndef UINT8_MIN
|
|
8
|
+
#define UINT8_MIN (0)
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
#ifndef UINT8_MAX
|
|
12
|
+
#define UINT8_MAX (255)
|
|
13
|
+
#endif
|
|
14
|
+
|
|
15
|
+
#define DATA_MIN UINT8_MIN
|
|
16
|
+
#define DATA_MAX UINT8_MAX
|
|
17
|
+
|
|
18
|
+
#include "uint_macro_kernel.h"
|
|
19
|
+
|
|
20
|
+
#endif // CUMO_UINT8_KERNEL_H
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#include "xint_macro.h"
|
|
2
|
+
|
|
3
|
+
#define m_abs(x) (x)
|
|
4
|
+
#define m_sign(x) (((x)==0) ? 0:1)
|
|
5
|
+
|
|
6
|
+
/*
 * Integer reciprocal (1/x) for the element type dtype.
 *
 * Returns 1 when x is 1 and 0 for any other non-zero x.
 * When x is 0, Ruby's ZeroDivisionError is raised through rb_raise().
 */
static inline dtype int_reciprocal(dtype x) {
    if (x == 0) {
        rb_raise(rb_eZeroDivError, "divided by 0");
    }
    return (x == 1) ? 1 : 0;
}
|
|
16
|
+
|
|
17
|
+
/*
|
|
18
|
+
static dtype pow_int(dtype x, int p)
|
|
19
|
+
{
|
|
20
|
+
dtype r = m_one;
|
|
21
|
+
switch(p) {
|
|
22
|
+
case 0: return 1;
|
|
23
|
+
case 1: return x;
|
|
24
|
+
case 2: return x*x;
|
|
25
|
+
case 3: return x*x*x;
|
|
26
|
+
}
|
|
27
|
+
while (p) {
|
|
28
|
+
if (p&1) r *= x;
|
|
29
|
+
x *= x;
|
|
30
|
+
p >>= 1;
|
|
31
|
+
}
|
|
32
|
+
return r;
|
|
33
|
+
}
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
static inline u_int64_t f_sum(size_t n, char *p, ssize_t stride)
|
|
37
|
+
{
|
|
38
|
+
u_int64_t x,y=0;
|
|
39
|
+
size_t i=n;
|
|
40
|
+
for (; i--;) {
|
|
41
|
+
x = *(dtype*)p;
|
|
42
|
+
y += x;
|
|
43
|
+
p += stride;
|
|
44
|
+
}
|
|
45
|
+
return y;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
static inline u_int64_t f_prod(size_t n, char *p, ssize_t stride)
|
|
49
|
+
{
|
|
50
|
+
u_int64_t x,y=1;
|
|
51
|
+
size_t i=n;
|
|
52
|
+
for (; i--;) {
|
|
53
|
+
x = *(dtype*)p;
|
|
54
|
+
y *= x;
|
|
55
|
+
p += stride;
|
|
56
|
+
}
|
|
57
|
+
return y;
|
|
58
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#ifndef CUMO_UINT_MACRO_KERNEL_H
|
|
2
|
+
#define CUMO_UINT_MACRO_KERNEL_H
|
|
3
|
+
|
|
4
|
+
#include "xint_macro_kernel.h"
|
|
5
|
+
|
|
6
|
+
#define m_abs(x) (x)
|
|
7
|
+
#define m_sign(x) (((x)==0) ? 0:1)
|
|
8
|
+
|
|
9
|
+
/*
 * Integer reciprocal (1/x), usable from both host and device code.
 *
 * Yields 1 only when x is 1; every other input, including 0, yields 0.
 * No Ruby exception is raised for x == 0 here; the original author's
 * note on that case reads: "as CUDA kernel 1/0 results in 0".
 */
__host__ __device__ static inline dtype int_reciprocal(dtype x) {
    return (x == 1) ? 1 : 0;
}
|
|
20
|
+
|
|
21
|
+
/*
 * Integer power x**p by repeated squaring (device code).
 *
 * Exponents 0..3 are answered directly. For a negative exponent the
 * truncated integer value of 1/x**|p| is returned: 1 when x is 1,
 * otherwise 0 (x == 0 also gives 0, the same convention int_reciprocal
 * uses in this header). No overflow checking is performed.
 */
__device__ static dtype pow_int(dtype x, int p)
{
    dtype r = m_one;

    switch(p) {
    case 0: return 1;
    case 1: return x;
    case 2: return x*x;
    case 3: return x*x*x;
    }
    /* Right-shifting a negative int is implementation-defined and is
       commonly an arithmetic shift, in which case p would settle at -1
       and the loop below would never terminate. Answer negative
       exponents up front instead. */
    if (p < 0) {
        return (x == 1) ? 1 : 0;
    }
    while (p) {
        if (p&1) {
            r *= x;
        }
        x *= x;
        p >>= 1;
    }
    return r;
}
|
|
37
|
+
|
|
38
|
+
#endif // CUMO_UINT_MACRO_KERNEL_H
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#define m_zero 0
|
|
2
|
+
#define m_one 1
|
|
3
|
+
|
|
4
|
+
#define m_from_double(x) (x)
|
|
5
|
+
#define m_from_real(x) (x)
|
|
6
|
+
#define m_from_sint(x) (x)
|
|
7
|
+
#define m_from_int32(x) (x)
|
|
8
|
+
#define m_from_int64(x) (x)
|
|
9
|
+
#define m_from_uint32(x) (x)
|
|
10
|
+
#define m_from_uint64(x) (x)
|
|
11
|
+
|
|
12
|
+
#define m_add(x,y) ((x)+(y))
|
|
13
|
+
#define m_sub(x,y) ((x)-(y))
|
|
14
|
+
#define m_mul(x,y) ((x)*(y))
|
|
15
|
+
#define m_div(x,y) ((x)/(y))
|
|
16
|
+
#define m_mod(x,y) ((x)%(y))
|
|
17
|
+
#define m_divmod(x,y,a,b) {a=(x)/(y); b=m_mod(x,y);}
|
|
18
|
+
#define m_pow(x,y) pow_int(x,y)
|
|
19
|
+
#define m_pow_int(x,y) pow_int(x,y)
|
|
20
|
+
|
|
21
|
+
#define m_bit_and(x,y) ((x)&(y))
|
|
22
|
+
#define m_bit_or(x,y) ((x)|(y))
|
|
23
|
+
#define m_bit_xor(x,y) ((x)^(y))
|
|
24
|
+
#define m_bit_not(x) (~(x))
|
|
25
|
+
|
|
26
|
+
#define m_minus(x) (-(x))
|
|
27
|
+
#define m_reciprocal(x) int_reciprocal(x)
|
|
28
|
+
#define m_square(x) ((x)*(x))
|
|
29
|
+
|
|
30
|
+
#define m_eq(x,y) ((x)==(y))
|
|
31
|
+
#define m_ne(x,y) ((x)!=(y))
|
|
32
|
+
#define m_gt(x,y) ((x)>(y))
|
|
33
|
+
#define m_ge(x,y) ((x)>=(y))
|
|
34
|
+
#define m_lt(x,y) ((x)<(y))
|
|
35
|
+
#define m_le(x,y) ((x)<=(y))
|
|
36
|
+
#define m_left_shift(x,y) ((x)<<(y))
|
|
37
|
+
#define m_right_shift(x,y) ((x)>>(y))
|
|
38
|
+
|
|
39
|
+
#define m_isnan(x) 0
|
|
40
|
+
|
|
41
|
+
#define m_mulsum(x,y,z) {z += x*y;}
|
|
42
|
+
#define m_mulsum_init INT2FIX(0)
|
|
43
|
+
#define m_cumsum(x,y) {x += y;}
|
|
44
|
+
#define m_cumprod(x,y) {x *= y;}
|
|
45
|
+
|
|
46
|
+
#define cmp(a,b) \
|
|
47
|
+
((qsort_cast(a)==qsort_cast(b)) ? 0 : \
|
|
48
|
+
(qsort_cast(a) > qsort_cast(b)) ? 1 : -1)
|
|
49
|
+
#define cmpgt(a,b) \
|
|
50
|
+
(qsort_cast(a) > qsort_cast(b))
|
|
51
|
+
|
|
52
|
+
static inline dtype f_min(size_t n, char *p, ssize_t stride)
|
|
53
|
+
{
|
|
54
|
+
dtype x,y;
|
|
55
|
+
size_t i=n;
|
|
56
|
+
|
|
57
|
+
y = *(dtype*)p;
|
|
58
|
+
p += stride;
|
|
59
|
+
i--;
|
|
60
|
+
for (; i--;) {
|
|
61
|
+
x = *(dtype*)p;
|
|
62
|
+
if (x < y) {
|
|
63
|
+
y = x;
|
|
64
|
+
}
|
|
65
|
+
p += stride;
|
|
66
|
+
}
|
|
67
|
+
return y;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
static inline dtype f_max(size_t n, char *p, ssize_t stride)
|
|
71
|
+
{
|
|
72
|
+
dtype x,y;
|
|
73
|
+
size_t i=n;
|
|
74
|
+
|
|
75
|
+
y = *(dtype*)p;
|
|
76
|
+
p += stride;
|
|
77
|
+
i--;
|
|
78
|
+
for (; i--;) {
|
|
79
|
+
x = *(dtype*)p;
|
|
80
|
+
if (x > y) {
|
|
81
|
+
y = x;
|
|
82
|
+
}
|
|
83
|
+
p += stride;
|
|
84
|
+
}
|
|
85
|
+
return y;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
static inline size_t f_min_index(size_t n, char *p, ssize_t stride)
|
|
89
|
+
{
|
|
90
|
+
dtype x, y;
|
|
91
|
+
size_t i, j=0;
|
|
92
|
+
|
|
93
|
+
y = *(dtype*)p;
|
|
94
|
+
for (i=1; i<n; i++) {
|
|
95
|
+
x = *(dtype*)(p+i*stride);
|
|
96
|
+
if (x < y) {
|
|
97
|
+
y = x;
|
|
98
|
+
j = i;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
return j;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
static inline size_t f_max_index(size_t n, char *p, ssize_t stride)
|
|
105
|
+
{
|
|
106
|
+
dtype x, y;
|
|
107
|
+
size_t i, j=0;
|
|
108
|
+
|
|
109
|
+
y = *(dtype*)p;
|
|
110
|
+
for (i=1; i<n; i++) {
|
|
111
|
+
x = *(dtype*)(p+i*stride);
|
|
112
|
+
if (x > y) {
|
|
113
|
+
y = x;
|
|
114
|
+
j = i;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
return j;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
static inline void
|
|
121
|
+
f_minmax(size_t n, char *p, ssize_t stride, dtype* amin, dtype* amax)
|
|
122
|
+
{
|
|
123
|
+
dtype x,min,max;
|
|
124
|
+
size_t i=n;
|
|
125
|
+
|
|
126
|
+
min = max = *(dtype*)p;
|
|
127
|
+
p += stride;
|
|
128
|
+
for (i--; i--;) {
|
|
129
|
+
x = *(dtype*)p;
|
|
130
|
+
if (m_gt(x,max)) {
|
|
131
|
+
max = x;
|
|
132
|
+
}
|
|
133
|
+
if (m_lt(x,min)) {
|
|
134
|
+
min = x;
|
|
135
|
+
}
|
|
136
|
+
p += stride;
|
|
137
|
+
}
|
|
138
|
+
*amin = min;
|
|
139
|
+
*amax = max;
|
|
140
|
+
return;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/*
 * Peak-to-peak value (largest minus smallest) of n elements of type dtype.
 * Both extremes are obtained from one f_minmax() call and subtracted
 * with m_sub().
 */
static inline dtype f_ptp(size_t n, char *p, ssize_t stride)
{
    dtype lo, hi;

    f_minmax(n, p, stride, &lo, &hi);
    return m_sub(hi, lo);
}
|
|
149
|
+
|
|
150
|
+
/*
 * Linear term of a sequence: start value x plus step y scaled by c.
 */
static inline double f_seq(double x, double y, double c)
{
    return y * c + x;
}
|
|
154
|
+
|
|
155
|
+
/*
 * Element-wise maximum: returns x when m_ge(x, y) holds, otherwise y.
 */
static inline dtype f_maximum(dtype x, dtype y)
{
    return m_ge(x,y) ? x : y;
}
|
|
162
|
+
|
|
163
|
+
/*
 * Element-wise minimum: returns x when m_le(x, y) holds, otherwise y.
 */
static inline dtype f_minimum(dtype x, dtype y)
{
    return m_le(x,y) ? x : y;
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
#ifndef CUMO_XINT_MACRO_KERNEL_H
|
|
2
|
+
#define CUMO_XINT_MACRO_KERNEL_H
|
|
3
|
+
|
|
4
|
+
#define m_zero 0
|
|
5
|
+
#define m_one 1
|
|
6
|
+
|
|
7
|
+
#define m_from_double(x) (x)
|
|
8
|
+
#define m_from_real(x) (x)
|
|
9
|
+
#define m_from_sint(x) (x)
|
|
10
|
+
#define m_from_int32(x) (x)
|
|
11
|
+
#define m_from_int64(x) (x)
|
|
12
|
+
#define m_from_uint32(x) (x)
|
|
13
|
+
#define m_from_uint64(x) (x)
|
|
14
|
+
|
|
15
|
+
#define m_add(x,y) ((x)+(y))
|
|
16
|
+
#define m_sub(x,y) ((x)-(y))
|
|
17
|
+
#define m_mul(x,y) ((x)*(y))
|
|
18
|
+
#define m_div(x,y) ((x)/(y))
|
|
19
|
+
#define m_mod(x,y) ((x)%(y))
|
|
20
|
+
#define m_divmod(x,y,a,b) {a=(x)/(y); b=m_mod(x,y);}
|
|
21
|
+
#define m_pow(x,y) pow_int(x,y)
|
|
22
|
+
#define m_pow_int(x,y) pow_int(x,y)
|
|
23
|
+
|
|
24
|
+
#define m_bit_and(x,y) ((x)&(y))
|
|
25
|
+
#define m_bit_or(x,y) ((x)|(y))
|
|
26
|
+
#define m_bit_xor(x,y) ((x)^(y))
|
|
27
|
+
#define m_bit_not(x) (~(x))
|
|
28
|
+
|
|
29
|
+
#define m_minus(x) (-(x))
|
|
30
|
+
#define m_reciprocal(x) int_reciprocal(x)
|
|
31
|
+
#define m_square(x) ((x)*(x))
|
|
32
|
+
|
|
33
|
+
#define m_eq(x,y) ((x)==(y))
|
|
34
|
+
#define m_ne(x,y) ((x)!=(y))
|
|
35
|
+
#define m_gt(x,y) ((x)>(y))
|
|
36
|
+
#define m_ge(x,y) ((x)>=(y))
|
|
37
|
+
#define m_lt(x,y) ((x)<(y))
|
|
38
|
+
#define m_le(x,y) ((x)<=(y))
|
|
39
|
+
#define m_left_shift(x,y) ((x)<<(y))
|
|
40
|
+
#define m_right_shift(x,y) ((x)>>(y))
|
|
41
|
+
|
|
42
|
+
#define m_isnan(x) 0
|
|
43
|
+
|
|
44
|
+
#define m_mulsum(x,y,z) {z += x*y;}
|
|
45
|
+
#define m_mulsum_init 0
|
|
46
|
+
//#define m_cumsum(x,y) {x += y;}
|
|
47
|
+
//#define m_cumprod(x,y) {x *= y;}
|
|
48
|
+
|
|
49
|
+
/*
 * Linear term of a sequence: start value x plus step y scaled by c.
 * Usable from both host and device code.
 */
__host__ __device__ static inline double f_seq(double x, double y, double c)
{
    return y * c + x;
}
|
|
53
|
+
|
|
54
|
+
/*
 * Element-wise maximum: returns x when m_ge(x, y) holds, otherwise y.
 * Usable from both host and device code.
 */
__host__ __device__ static inline dtype f_maximum(dtype x, dtype y)
{
    return m_ge(x,y) ? x : y;
}
|
|
61
|
+
|
|
62
|
+
/*
 * Element-wise minimum: returns x when m_le(x, y) holds, otherwise y.
 * Usable from both host and device code.
 */
__host__ __device__ static inline dtype f_minimum(dtype x, dtype y)
{
    return m_le(x,y) ? x : y;
}
|
|
69
|
+
|
|
70
|
+
/* --------- thrust ----------------- */
|
|
71
|
+
#include "cumo/cuda/cumo_thrust.hpp"
|
|
72
|
+
|
|
73
|
+
/*
 * Binary functor for Thrust algorithms that adds two dtype values with
 * m_add(). The functor holds no state, so operator() is const-qualified
 * and can be invoked on a const functor object.
 */
struct cumo_thrust_plus : public thrust::binary_function<dtype, dtype, dtype>
{
    __host__ __device__ dtype operator()(dtype x, dtype y) const { return m_add(x,y); }
};
|
|
77
|
+
|
|
78
|
+
/*
 * Binary functor for Thrust algorithms that multiplies two dtype values
 * with m_mul(). The functor holds no state, so operator() is
 * const-qualified and can be invoked on a const functor object.
 */
struct cumo_thrust_multiplies : public thrust::binary_function<dtype, dtype, dtype>
{
    __host__ __device__ dtype operator()(dtype x, dtype y) const { return m_mul(x,y); }
};
|
|
82
|
+
|
|
83
|
+
/*
 * Unary functor for Thrust algorithms that squares a dtype value with
 * m_square() and returns the result as rtype.
 */
struct cumo_thrust_square : public thrust::unary_function<dtype, dtype>
{
    __host__ __device__
    rtype operator()(const dtype& x) const
    {
        return m_square(x);
    }
};
|
|
87
|
+
|
|
88
|
+
#endif // CUMO_XINT_MACRO_KERNEL_H
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#ifndef SFMT_PARAMS_H
|
|
2
|
+
#define SFMT_PARAMS_H
|
|
3
|
+
|
|
4
|
+
#if !defined(MEXP)
|
|
5
|
+
//#ifdef __GNUC__
|
|
6
|
+
// #warning "MEXP is not defined. I assume MEXP is 19937."
|
|
7
|
+
//#endif
|
|
8
|
+
#define MEXP 19937
|
|
9
|
+
#endif
|
|
10
|
+
/*-----------------
|
|
11
|
+
BASIC DEFINITIONS
|
|
12
|
+
-----------------*/
|
|
13
|
+
/** Mersenne Exponent. The period of the sequence
|
|
14
|
+
* is a multiple of 2^MEXP-1.
|
|
15
|
+
* #define MEXP 19937 */
|
|
16
|
+
/** SFMT generator has an internal state array of 128-bit integers,
|
|
17
|
+
* and N is its size. */
|
|
18
|
+
#define N (MEXP / 128 + 1)
|
|
19
|
+
/** N32 is the size of internal state array when regarded as an array
|
|
20
|
+
* of 32-bit integers.*/
|
|
21
|
+
#define N32 (N * 4)
|
|
22
|
+
/** N64 is the size of internal state array when regarded as an array
|
|
23
|
+
* of 64-bit integers.*/
|
|
24
|
+
#define N64 (N * 2)
|
|
25
|
+
|
|
26
|
+
/*----------------------
|
|
27
|
+
the parameters of SFMT
|
|
28
|
+
following definitions are in paramsXXXX.h file.
|
|
29
|
+
----------------------*/
|
|
30
|
+
/** the pick up position of the array.
|
|
31
|
+
#define POS1 122
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
/** the parameter of shift left as four 32-bit registers.
|
|
35
|
+
#define SL1 18
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
/** the parameter of shift left as one 128-bit register.
|
|
39
|
+
* The 128-bit integer is shifted by (SL2 * 8) bits.
|
|
40
|
+
#define SL2 1
|
|
41
|
+
*/
|
|
42
|
+
|
|
43
|
+
/** the parameter of shift right as four 32-bit registers.
|
|
44
|
+
#define SR1 11
|
|
45
|
+
*/
|
|
46
|
+
|
|
47
|
+
/** the parameter of shift right as one 128-bit register.
|
|
48
|
+
* The 128-bit integer is shifted by (SR2 * 8) bits.
|
|
49
|
+
#define SR2 1
|
|
50
|
+
*/
|
|
51
|
+
|
|
52
|
+
/** A bitmask, used in the recursion. These parameters are introduced
|
|
53
|
+
* to break symmetry of SIMD.
|
|
54
|
+
#define MSK1 0xdfffffefU
|
|
55
|
+
#define MSK2 0xddfecb7fU
|
|
56
|
+
#define MSK3 0xbffaffffU
|
|
57
|
+
#define MSK4 0xbffffff6U
|
|
58
|
+
*/
|
|
59
|
+
|
|
60
|
+
/** These definitions are part of a 128-bit period certification vector.
|
|
61
|
+
#define PARITY1 0x00000001U
|
|
62
|
+
#define PARITY2 0x00000000U
|
|
63
|
+
#define PARITY3 0x00000000U
|
|
64
|
+
#define PARITY4 0xc98e126aU
|
|
65
|
+
*/
|
|
66
|
+
|
|
67
|
+
#if MEXP == 607
|
|
68
|
+
#include "SFMT-params607.h"
|
|
69
|
+
#elif MEXP == 1279
|
|
70
|
+
#include "SFMT-params1279.h"
|
|
71
|
+
#elif MEXP == 2281
|
|
72
|
+
#include "SFMT-params2281.h"
|
|
73
|
+
#elif MEXP == 4253
|
|
74
|
+
#include "SFMT-params4253.h"
|
|
75
|
+
#elif MEXP == 11213
|
|
76
|
+
#include "SFMT-params11213.h"
|
|
77
|
+
#elif MEXP == 19937
|
|
78
|
+
#include "SFMT-params19937.h"
|
|
79
|
+
#elif MEXP == 44497
|
|
80
|
+
#include "SFMT-params44497.h"
|
|
81
|
+
#elif MEXP == 86243
|
|
82
|
+
#include "SFMT-params86243.h"
|
|
83
|
+
#elif MEXP == 132049
|
|
84
|
+
#include "SFMT-params132049.h"
|
|
85
|
+
#elif MEXP == 216091
|
|
86
|
+
#include "SFMT-params216091.h"
|
|
87
|
+
#else
|
|
88
|
+
#ifdef __GNUC__
|
|
89
|
+
#error "MEXP is not valid."
|
|
90
|
+
#undef MEXP
|
|
91
|
+
#else
|
|
92
|
+
#undef MEXP
|
|
93
|
+
#endif
|
|
94
|
+
|
|
95
|
+
#endif
|
|
96
|
+
|
|
97
|
+
#endif /* SFMT_PARAMS_H */
|