cumo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +27 -0
- data/.travis.yml +5 -0
- data/3rd_party/mkmf-cu/.gitignore +36 -0
- data/3rd_party/mkmf-cu/Gemfile +3 -0
- data/3rd_party/mkmf-cu/LICENSE +21 -0
- data/3rd_party/mkmf-cu/README.md +36 -0
- data/3rd_party/mkmf-cu/Rakefile +11 -0
- data/3rd_party/mkmf-cu/bin/mkmf-cu-nvcc +4 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu.rb +32 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +80 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/nvcc.rb +157 -0
- data/3rd_party/mkmf-cu/mkmf-cu.gemspec +16 -0
- data/3rd_party/mkmf-cu/test/test_mkmf-cu.rb +67 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +82 -0
- data/README.md +252 -0
- data/Rakefile +43 -0
- data/bench/broadcast_fp32.rb +138 -0
- data/bench/cumo_bench.rb +193 -0
- data/bench/numo_bench.rb +138 -0
- data/bench/reduction_fp32.rb +117 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cumo.gemspec +32 -0
- data/ext/cumo/cuda/cublas.c +278 -0
- data/ext/cumo/cuda/driver.c +421 -0
- data/ext/cumo/cuda/memory_pool.cpp +185 -0
- data/ext/cumo/cuda/memory_pool_impl.cpp +308 -0
- data/ext/cumo/cuda/memory_pool_impl.hpp +370 -0
- data/ext/cumo/cuda/memory_pool_impl_test.cpp +554 -0
- data/ext/cumo/cuda/nvrtc.c +207 -0
- data/ext/cumo/cuda/runtime.c +167 -0
- data/ext/cumo/cumo.c +148 -0
- data/ext/cumo/depend.erb +58 -0
- data/ext/cumo/extconf.rb +179 -0
- data/ext/cumo/include/cumo.h +25 -0
- data/ext/cumo/include/cumo/compat.h +23 -0
- data/ext/cumo/include/cumo/cuda/cublas.h +153 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust.hpp +187 -0
- data/ext/cumo/include/cumo/cuda/cumo_thrust_complex.hpp +79 -0
- data/ext/cumo/include/cumo/cuda/driver.h +22 -0
- data/ext/cumo/include/cumo/cuda/memory_pool.h +28 -0
- data/ext/cumo/include/cumo/cuda/nvrtc.h +22 -0
- data/ext/cumo/include/cumo/cuda/runtime.h +40 -0
- data/ext/cumo/include/cumo/indexer.h +238 -0
- data/ext/cumo/include/cumo/intern.h +142 -0
- data/ext/cumo/include/cumo/intern_fwd.h +38 -0
- data/ext/cumo/include/cumo/intern_kernel.h +6 -0
- data/ext/cumo/include/cumo/narray.h +429 -0
- data/ext/cumo/include/cumo/narray_kernel.h +149 -0
- data/ext/cumo/include/cumo/ndloop.h +95 -0
- data/ext/cumo/include/cumo/reduce_kernel.h +126 -0
- data/ext/cumo/include/cumo/template.h +158 -0
- data/ext/cumo/include/cumo/template_kernel.h +77 -0
- data/ext/cumo/include/cumo/types/bit.h +40 -0
- data/ext/cumo/include/cumo/types/bit_kernel.h +34 -0
- data/ext/cumo/include/cumo/types/complex.h +402 -0
- data/ext/cumo/include/cumo/types/complex_kernel.h +414 -0
- data/ext/cumo/include/cumo/types/complex_macro.h +382 -0
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +186 -0
- data/ext/cumo/include/cumo/types/dcomplex.h +46 -0
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/dfloat.h +47 -0
- data/ext/cumo/include/cumo/types/dfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/float_def.h +34 -0
- data/ext/cumo/include/cumo/types/float_def_kernel.h +39 -0
- data/ext/cumo/include/cumo/types/float_macro.h +191 -0
- data/ext/cumo/include/cumo/types/float_macro_kernel.h +158 -0
- data/ext/cumo/include/cumo/types/int16.h +24 -0
- data/ext/cumo/include/cumo/types/int16_kernel.h +23 -0
- data/ext/cumo/include/cumo/types/int32.h +24 -0
- data/ext/cumo/include/cumo/types/int32_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int64.h +24 -0
- data/ext/cumo/include/cumo/types/int64_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int8.h +24 -0
- data/ext/cumo/include/cumo/types/int8_kernel.h +19 -0
- data/ext/cumo/include/cumo/types/int_macro.h +67 -0
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +48 -0
- data/ext/cumo/include/cumo/types/real_accum.h +486 -0
- data/ext/cumo/include/cumo/types/real_accum_kernel.h +101 -0
- data/ext/cumo/include/cumo/types/robj_macro.h +80 -0
- data/ext/cumo/include/cumo/types/robj_macro_kernel.h +0 -0
- data/ext/cumo/include/cumo/types/robject.h +27 -0
- data/ext/cumo/include/cumo/types/robject_kernel.h +7 -0
- data/ext/cumo/include/cumo/types/scomplex.h +46 -0
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +13 -0
- data/ext/cumo/include/cumo/types/sfloat.h +48 -0
- data/ext/cumo/include/cumo/types/sfloat_kernel.h +14 -0
- data/ext/cumo/include/cumo/types/uint16.h +25 -0
- data/ext/cumo/include/cumo/types/uint16_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint32.h +25 -0
- data/ext/cumo/include/cumo/types/uint32_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint64.h +25 -0
- data/ext/cumo/include/cumo/types/uint64_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint8.h +25 -0
- data/ext/cumo/include/cumo/types/uint8_kernel.h +20 -0
- data/ext/cumo/include/cumo/types/uint_macro.h +58 -0
- data/ext/cumo/include/cumo/types/uint_macro_kernel.h +38 -0
- data/ext/cumo/include/cumo/types/xint_macro.h +169 -0
- data/ext/cumo/include/cumo/types/xint_macro_kernel.h +88 -0
- data/ext/cumo/narray/SFMT-params.h +97 -0
- data/ext/cumo/narray/SFMT-params19937.h +46 -0
- data/ext/cumo/narray/SFMT.c +620 -0
- data/ext/cumo/narray/SFMT.h +167 -0
- data/ext/cumo/narray/array.c +638 -0
- data/ext/cumo/narray/data.c +961 -0
- data/ext/cumo/narray/gen/cogen.rb +56 -0
- data/ext/cumo/narray/gen/cogen_kernel.rb +58 -0
- data/ext/cumo/narray/gen/def/bit.rb +37 -0
- data/ext/cumo/narray/gen/def/dcomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/dfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/int16.rb +36 -0
- data/ext/cumo/narray/gen/def/int32.rb +36 -0
- data/ext/cumo/narray/gen/def/int64.rb +36 -0
- data/ext/cumo/narray/gen/def/int8.rb +36 -0
- data/ext/cumo/narray/gen/def/robject.rb +37 -0
- data/ext/cumo/narray/gen/def/scomplex.rb +39 -0
- data/ext/cumo/narray/gen/def/sfloat.rb +37 -0
- data/ext/cumo/narray/gen/def/uint16.rb +36 -0
- data/ext/cumo/narray/gen/def/uint32.rb +36 -0
- data/ext/cumo/narray/gen/def/uint64.rb +36 -0
- data/ext/cumo/narray/gen/def/uint8.rb +36 -0
- data/ext/cumo/narray/gen/erbpp2.rb +346 -0
- data/ext/cumo/narray/gen/narray_def.rb +268 -0
- data/ext/cumo/narray/gen/spec.rb +425 -0
- data/ext/cumo/narray/gen/tmpl/accum.c +86 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +121 -0
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +61 -0
- data/ext/cumo/narray/gen/tmpl/accum_index.c +119 -0
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +12 -0
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +107 -0
- data/ext/cumo/narray/gen/tmpl/allocate.c +37 -0
- data/ext/cumo/narray/gen/tmpl/aref.c +66 -0
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +50 -0
- data/ext/cumo/narray/gen/tmpl/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl/binary.c +162 -0
- data/ext/cumo/narray/gen/tmpl/binary2.c +70 -0
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/binary_s.c +45 -0
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +15 -0
- data/ext/cumo/narray/gen/tmpl/bincount.c +181 -0
- data/ext/cumo/narray/gen/tmpl/cast.c +44 -0
- data/ext/cumo/narray/gen/tmpl/cast_array.c +13 -0
- data/ext/cumo/narray/gen/tmpl/class.c +9 -0
- data/ext/cumo/narray/gen/tmpl/class_kernel.cu +6 -0
- data/ext/cumo/narray/gen/tmpl/clip.c +121 -0
- data/ext/cumo/narray/gen/tmpl/coerce_cast.c +10 -0
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +129 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +68 -0
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +18 -0
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +46 -0
- data/ext/cumo/narray/gen/tmpl/cum.c +50 -0
- data/ext/cumo/narray/gen/tmpl/each.c +47 -0
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +79 -0
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/extract.c +22 -0
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +26 -0
- data/ext/cumo/narray/gen/tmpl/extract_data.c +53 -0
- data/ext/cumo/narray/gen/tmpl/eye.c +105 -0
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +19 -0
- data/ext/cumo/narray/gen/tmpl/fill.c +52 -0
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +106 -0
- data/ext/cumo/narray/gen/tmpl/format.c +62 -0
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +49 -0
- data/ext/cumo/narray/gen/tmpl/frexp.c +38 -0
- data/ext/cumo/narray/gen/tmpl/gemm.c +203 -0
- data/ext/cumo/narray/gen/tmpl/init_class.c +20 -0
- data/ext/cumo/narray/gen/tmpl/init_module.c +12 -0
- data/ext/cumo/narray/gen/tmpl/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl/lib.c +50 -0
- data/ext/cumo/narray/gen/tmpl/lib_kernel.cu +24 -0
- data/ext/cumo/narray/gen/tmpl/logseq.c +102 -0
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +31 -0
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +98 -0
- data/ext/cumo/narray/gen/tmpl/median.c +66 -0
- data/ext/cumo/narray/gen/tmpl/minmax.c +47 -0
- data/ext/cumo/narray/gen/tmpl/module.c +9 -0
- data/ext/cumo/narray/gen/tmpl/module_kernel.cu +1 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +15 -0
- data/ext/cumo/narray/gen/tmpl/new_dim0_kernel.cu +8 -0
- data/ext/cumo/narray/gen/tmpl/poly.c +50 -0
- data/ext/cumo/narray/gen/tmpl/pow.c +97 -0
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +29 -0
- data/ext/cumo/narray/gen/tmpl/powint.c +17 -0
- data/ext/cumo/narray/gen/tmpl/qsort.c +212 -0
- data/ext/cumo/narray/gen/tmpl/rand.c +168 -0
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +121 -0
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +75 -0
- data/ext/cumo/narray/gen/tmpl/seq.c +112 -0
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +43 -0
- data/ext/cumo/narray/gen/tmpl/set2.c +57 -0
- data/ext/cumo/narray/gen/tmpl/sort.c +48 -0
- data/ext/cumo/narray/gen/tmpl/sort_index.c +111 -0
- data/ext/cumo/narray/gen/tmpl/store.c +41 -0
- data/ext/cumo/narray/gen/tmpl/store_array.c +187 -0
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_bit.c +86 -0
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +66 -0
- data/ext/cumo/narray/gen/tmpl/store_from.c +81 -0
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl/store_kernel.cu +3 -0
- data/ext/cumo/narray/gen/tmpl/store_numeric.c +9 -0
- data/ext/cumo/narray/gen/tmpl/to_a.c +43 -0
- data/ext/cumo/narray/gen/tmpl/unary.c +132 -0
- data/ext/cumo/narray/gen/tmpl/unary2.c +60 -0
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +72 -0
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +34 -0
- data/ext/cumo/narray/gen/tmpl/unary_s.c +86 -0
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +58 -0
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +24 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +54 -0
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +57 -0
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +56 -0
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +98 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +88 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +76 -0
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +133 -0
- data/ext/cumo/narray/gen/tmpl_bit/each.c +48 -0
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +30 -0
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +29 -0
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +69 -0
- data/ext/cumo/narray/gen/tmpl_bit/format.c +64 -0
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +51 -0
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +21 -0
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +136 -0
- data/ext/cumo/narray/gen/tmpl_bit/none_p.c +14 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +108 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +70 -0
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +60 -0
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +47 -0
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +81 -0
- data/ext/cumo/narray/gen/tmpl_bit/where.c +90 -0
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +95 -0
- data/ext/cumo/narray/index.c +880 -0
- data/ext/cumo/narray/kwargs.c +153 -0
- data/ext/cumo/narray/math.c +142 -0
- data/ext/cumo/narray/narray.c +1948 -0
- data/ext/cumo/narray/ndloop.c +2105 -0
- data/ext/cumo/narray/rand.c +45 -0
- data/ext/cumo/narray/step.c +474 -0
- data/ext/cumo/narray/struct.c +886 -0
- data/lib/cumo.rb +3 -0
- data/lib/cumo/cuda.rb +11 -0
- data/lib/cumo/cuda/compile_error.rb +36 -0
- data/lib/cumo/cuda/compiler.rb +161 -0
- data/lib/cumo/cuda/device.rb +47 -0
- data/lib/cumo/cuda/link_state.rb +31 -0
- data/lib/cumo/cuda/module.rb +40 -0
- data/lib/cumo/cuda/nvrtc_program.rb +27 -0
- data/lib/cumo/linalg.rb +12 -0
- data/lib/cumo/narray.rb +2 -0
- data/lib/cumo/narray/extra.rb +1278 -0
- data/lib/erbpp.rb +294 -0
- data/lib/erbpp/line_number.rb +137 -0
- data/lib/erbpp/narray_def.rb +381 -0
- data/numo-narray-version +1 -0
- data/run.gdb +7 -0
- metadata +353 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
#ifndef CUMO_NARRAY_KERNEL_H
|
|
2
|
+
#define CUMO_NARRAY_KERNEL_H
|
|
3
|
+
|
|
4
|
+
#if defined(__cplusplus)
|
|
5
|
+
extern "C" {
|
|
6
|
+
#if 0
|
|
7
|
+
} /* satisfy cc-mode */
|
|
8
|
+
#endif
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
#include <math.h>
|
|
12
|
+
//#include "cumo/compat.h"
|
|
13
|
+
#include "cumo/template_kernel.h"
|
|
14
|
+
//#include "cumo/extconf.h"
|
|
15
|
+
|
|
16
|
+
#ifdef HAVE_STDBOOL_H
|
|
17
|
+
# include <stdbool.h>
|
|
18
|
+
#endif
|
|
19
|
+
|
|
20
|
+
#ifdef HAVE_STDINT_H
|
|
21
|
+
# include <stdint.h>
|
|
22
|
+
#endif
|
|
23
|
+
|
|
24
|
+
#ifdef HAVE_SYS_TYPES_H
|
|
25
|
+
# include <sys/types.h>
|
|
26
|
+
#endif
|
|
27
|
+
|
|
28
|
+
/*
 * BSD-style fixed-width unsigned aliases (u_int8_t ... u_int64_t).
 *
 * For each width the alias is declared only when the build has NOT reported
 * the BSD spelling (HAVE_U_INTn_T is undefined) but HAS reported the C99
 * spelling (HAVE_UINTn_T is defined). If neither macro is defined, nothing
 * is declared for that width.
 */
#ifndef HAVE_U_INT8_T
# ifdef HAVE_UINT8_T
typedef uint8_t u_int8_t;
# endif
#endif

#ifndef HAVE_U_INT16_T
# ifdef HAVE_UINT16_T
typedef uint16_t u_int16_t;
# endif
#endif

#ifndef HAVE_U_INT32_T
# ifdef HAVE_UINT32_T
typedef uint32_t u_int32_t;
# endif
#endif

#ifndef HAVE_U_INT64_T
# ifdef HAVE_UINT64_T
typedef uint64_t u_int64_t;
# endif
#endif
|
|
51
|
+
|
|
52
|
+
#define SZF PRI_SIZE_PREFIX // defined in ruby.h
|
|
53
|
+
|
|
54
|
+
/*
 * 64-bit conversion macros and printf length modifiers, selected by the
 * width of the host integer types: `long` is used when it is 8 bytes,
 * otherwise `long long` when that is 8 bytes. If neither test holds, none
 * of the 64-bit macros are defined by this block.
 * NOTE(review): NUM2LONG/LONG2NUM/NUM2LL/... and SIZEOF_* are not defined
 * in this header; presumably they come from the Ruby headers - confirm.
 * PRId64/PRIu64 are only defined here when not already provided.
 */
#if SIZEOF_LONG==8
# define NUM2INT64(x) NUM2LONG(x)
# define INT642NUM(x) LONG2NUM(x)
# define NUM2UINT64(x) NUM2ULONG(x)
# define UINT642NUM(x) ULONG2NUM(x)
# ifndef PRId64
# define PRId64 "ld"
# endif
# ifndef PRIu64
# define PRIu64 "lu"
# endif
#elif SIZEOF_LONG_LONG==8
# define NUM2INT64(x) NUM2LL(x)
# define INT642NUM(x) LL2NUM(x)
# define NUM2UINT64(x) NUM2ULL(x)
# define UINT642NUM(x) ULL2NUM(x)
# ifndef PRId64
# define PRId64 "lld"
# endif
# ifndef PRIu64
# define PRIu64 "llu"
# endif
#endif

/*
 * 32-bit counterparts: `long` is used when it is 4 bytes, otherwise `int`
 * when that is 4 bytes. PRId32/PRIu32 are only defined when missing.
 */
#if SIZEOF_LONG==4
# define NUM2INT32(x) NUM2LONG(x)
# define INT322NUM(x) LONG2NUM(x)
# define NUM2UINT32(x) NUM2ULONG(x)
# define UINT322NUM(x) ULONG2NUM(x)
# ifndef PRId32
# define PRId32 "ld"
# endif
# ifndef PRIu32
# define PRIu32 "lu"
# endif
#elif SIZEOF_INT==4
# define NUM2INT32(x) NUM2INT(x)
# define INT322NUM(x) INT2NUM(x)
# define NUM2UINT32(x) NUM2UINT(x)
# define UINT322NUM(x) UINT2NUM(x)
# ifndef PRId32
# define PRId32 "d"
# endif
# ifndef PRIu32
# define PRIu32 "u"
# endif
#endif
|
|
101
|
+
|
|
102
|
+
/*
 * Fallback boolean type and constants.
 *
 * `typedef int bool;` is only legal when `bool` is an ordinary identifier.
 * It is a keyword in C++ (this header is also compiled as C++, see the
 * extern "C" wrapper) and in C23, and it is a macro once <stdbool.h> has
 * been included, so the typedef is skipped in all of those cases instead of
 * breaking the build when HAVE_TYPE_BOOL was not set.
 */
#if !defined(HAVE_TYPE_BOOL) && !defined(__cplusplus) && !defined(bool) \
    && !(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
typedef int bool;
#endif
#ifndef FALSE /* in case these macros already exist */
# define FALSE 0 /* values of bool */
#endif
#ifndef TRUE
# define TRUE 1
#endif
|
|
111
|
+
|
|
112
|
+
/* Single-precision complex value: dat[0] is the real part, dat[1] the
 * imaginary part (see REAL/IMAG below). */
typedef struct { float dat[2]; } scomplex;
/* Double-precision complex value with the same layout as scomplex. */
typedef struct { double dat[2]; } dcomplex;
/* NOTE(review): presumably the integer type passed to Fortran-style
 * library routines; no use is visible in this header - confirm. */
typedef int fortran_integer;

/* Accessors for the two components of an scomplex/dcomplex lvalue. */
#define REAL(x) ((x).dat[0])
#define IMAG(x) ((x).dat[1])
|
|
118
|
+
|
|
119
|
+
extern int na_debug_flag;
|
|
120
|
+
|
|
121
|
+
#define NARRAY_DATA_T 0x1
|
|
122
|
+
#define NARRAY_VIEW_T 0x2
|
|
123
|
+
#define NARRAY_FILEMAP_T 0x3
|
|
124
|
+
|
|
125
|
+
//#define NA_MAX_DIMENSION (int)(sizeof(VALUE)*8-2)
|
|
126
|
+
#define NA_MAX_DIMENSION 12
|
|
127
|
+
#define NA_MAX_ELMSZ 65535
|
|
128
|
+
|
|
129
|
+
/* Word type that packed bits are stored in. */
typedef unsigned int BIT_DIGIT;
/* Size of one BIT_DIGIT in bytes. */
#define BYTE_BIT_DIGIT sizeof(BIT_DIGIT)
/* Number of bits in one BIT_DIGIT (assumes 8-bit bytes). */
#define NB (sizeof(BIT_DIGIT)*8)
/* A BIT_DIGIT with every bit set. */
#define BALL (~(BIT_DIGIT)0)
/* Mask with the low n bits set. n == NB is special-cased because shifting
 * by the full width of the type is undefined. */
#define SLB(n) (((n)==NB)?~(BIT_DIGIT)0:(~(~(BIT_DIGIT)0<<(n))))
|
|
134
|
+
|
|
135
|
+
#define ELEMENT_BIT_SIZE "ELEMENT_BIT_SIZE"
|
|
136
|
+
#define ELEMENT_BYTE_SIZE "ELEMENT_BYTE_SIZE"
|
|
137
|
+
#define CONTIGUOUS_STRIDE "CONTIGUOUS_STRIDE"
|
|
138
|
+
|
|
139
|
+
#include "cumo/indexer.h"
|
|
140
|
+
#include "cumo/intern_kernel.h"
|
|
141
|
+
|
|
142
|
+
#if defined(__cplusplus)
|
|
143
|
+
#if 0
|
|
144
|
+
{ /* satisfy cc-mode */
|
|
145
|
+
#endif
|
|
146
|
+
} /* extern "C" { */
|
|
147
|
+
#endif
|
|
148
|
+
|
|
149
|
+
#endif /* ifndef CUMO_NARRAY_KERNEL_H */
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
#ifndef CUMO_NDLOOP_H
|
|
2
|
+
#define CUMO_NDLOOP_H
|
|
3
|
+
|
|
4
|
+
/*
 * Iteration state of one loop argument along one dimension.
 *
 * The accessor macros in cumo/template.h read element [0] of an argument's
 * iter array: `pos` is added to the argument's base pointer (INIT_PTR) or
 * split into word/bit position for bit arrays (INIT_PTR_BIT), `step` is the
 * increment applied between elements, and `idx`, when non-NULL, supplies
 * per-element offsets that are used instead of `step`.
 */
typedef struct NA_LOOP_ITER {
    ssize_t pos; // start offset - required for each dimension.
    ssize_t step; // increment between consecutive elements
    size_t *idx; // optional per-element offset table
} na_loop_iter_t;
|
|
9
|
+
|
|
10
|
+
/*
 * Per-argument loop description; na_loop_t holds an array of these
 * (na_loop_t.args, "for each arg").
 */
typedef struct NA_LOOP_ARGS {
    VALUE value; // NOTE(review): presumably the Ruby object backing this argument - confirm
    ssize_t elmsz; // element size (read through NDL_ESZ / INIT_ELMSIZE)
    char *ptr; // base data pointer; iter[0].pos is added to it by NDL_PTR
    //char *buf_ptr; //
    int ndim; // required for each argument.
    // ssize_t pos; - not required here.
    size_t *shape; // shape of this argument (read through NDL_SHAPE)
    na_loop_iter_t *iter; // moved from na_loop_t
} na_loop_args_t;
|
|
20
|
+
|
|
21
|
+
// Loop descriptor passed to the user iterator function (na_iter_func_t).
typedef struct NA_LOOP {
    int narg; // NOTE(review): presumably the number of entries in args - confirm
    int ndim; // number of user dimensions handled by the user function.
    size_t *n; // number of elements for each dim (=shape)
    na_loop_args_t *args; // for each arg
    VALUE option; // NOTE(review): presumably caller-supplied option object - confirm
    void *opt_ptr; // opaque pointer; its meaning is not visible in this header
    VALUE err_type;
    int reduce_dim; // number of dimensions to reduce in reduction kernel, e.g., for an array of shape: [2,3,4],
                    // 3 for sum(), 1 for sum(axis: 1), 2 for sum(axis: [1,2])
    VALUE reduce; // dimension indices to reduce in reduction kernel (in bits), e.g., for an array of shape:
                  // [2,3,4], 111b for sum(), 010b for sum(axis: 1), 110b for sum(axis: [1,2])
} na_loop_t;
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
// ------------------ ndfunc -------------------------------------------
|
|
38
|
+
|
|
39
|
+
// Bit flags for ndfunc_t.flag ("what kind of loop user function supports").
// They are tested and set through NDF_TEST / NDF_SET.
#define NDF_HAS_LOOP (1<<0) // x[i]
#define NDF_STRIDE_LOOP (1<<1) // *(x+stride*i)
#define NDF_INDEX_LOOP (1<<2) // *(x+idx[i])
#define NDF_KEEP_DIM (1<<3)
#define NDF_INPLACE (1<<4)
#define NDF_ACCEPT_BYTESWAP (1<<5)

#define NDF_FLAT_REDUCE (1<<6)
#define NDF_EXTRACT (1<<7)
#define NDF_CUM (1<<8)

#define NDF_INDEXER_LOOP (1<<9) // Cumo custom. Use cumo own indexer.

// Common flag combinations; the *_NIP variants omit NDF_INPLACE.
#define FULL_LOOP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP|NDF_INPLACE)
#define FULL_LOOP_NIP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP)
#define STRIDE_LOOP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INPLACE)
#define STRIDE_LOOP_NIP (NDF_HAS_LOOP|NDF_STRIDE_LOOP)
#define NO_LOOP 0
|
|
57
|
+
|
|
58
|
+
#define OVERWRITE Qtrue // used for CASTABLE(t)
|
|
59
|
+
|
|
60
|
+
// NDF_TEST: non-zero when any bit of `fl` is set in nf->flag.
#define NDF_TEST(nf,fl) ((nf)->flag & (fl))
// NDF_SET: sets the bits of `fl` in nf->flag. Expands to a braced block,
// not an expression, so it cannot be used where a value is required.
#define NDF_SET(nf,fl) {(nf)->flag |= (fl);}
|
|
62
|
+
|
|
63
|
+
#define NDF_ARG_READ_ONLY 1
|
|
64
|
+
#define NDF_ARG_WRITE_ONLY 2
|
|
65
|
+
#define NDF_ARG_READ_WRITE 3
|
|
66
|
+
|
|
67
|
+
// Types of user-supplied callbacks.
// na_iter_func_t receives the loop descriptor (na_loop_t).
// NOTE(review): `_()` is presumably the prototype-wrapping macro from the
// Ruby headers; it is not defined in this file - confirm.
typedef void (*na_iter_func_t) _((na_loop_t *const));
// na_text_func_t takes a data pointer, a position and an option value and
// returns a VALUE.
typedef VALUE (*na_text_func_t) _((char *ptr, size_t pos, VALUE opt));
|
|
70
|
+
//typedef void (*) void (*loop_func)(ndfunc_t*, na_md_loop_t*))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
// Specification of one input argument of a user function (ndfunc_t.ain).
typedef struct NDF_ARG_IN {
    VALUE type; // argument types
    int dim; // # of dimension of argument handled by user function
             // if dim==-1, reduce dimension
} ndfunc_arg_in_t;
|
|
78
|
+
|
|
79
|
+
// Specification of one result of a user function (ndfunc_t.aout).
typedef struct NDF_ARG_OUT {
    VALUE type; // argument types
    int dim; // # of dimension of argument handled by user function
    size_t *shape; // NOTE(review): presumably the shape of those `dim` dimensions - confirm
} ndfunc_arg_out_t;
|
|
84
|
+
|
|
85
|
+
// Specification of a user function: the callback itself, the loop kinds it
// supports (NDF_* bits in `flag`) and the descriptions of its inputs and
// results.
typedef struct NDFUNCTION {
    na_iter_func_t func; // user function
    unsigned int flag; // what kind of loop user function supports
    int nin; // # of arguments
    int nout; // # of results
    ndfunc_arg_in_t *ain; // spec of input arguments
    ndfunc_arg_out_t *aout; // spec of output result
} ndfunc_t;
|
|
94
|
+
|
|
95
|
+
#endif /* CUMO_NDLOOP_H */
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
#ifndef CUMO_REDUCE_KERNEL_H
|
|
2
|
+
#define CUMO_REDUCE_KERNEL_H
|
|
3
|
+
|
|
4
|
+
#include <algorithm>
|
|
5
|
+
#include <cstdint>
|
|
6
|
+
#include <type_traits>
|
|
7
|
+
|
|
8
|
+
#include "cumo/indexer.h"
|
|
9
|
+
|
|
10
|
+
// Rounds x up to the nearest power of two.
//
// For 1 <= x <= 2^62 the result is the smallest power of two that is >= x.
// x <= 0 yields 0, and x > 2^62 saturates to 2^62 because 2^63 does not fit
// in int64_t. The bit smearing is done on an unsigned copy so that no signed
// value is shifted and the final +1 cannot overflow a signed integer.
static inline int64_t round_up_to_power_of_2(int64_t x) {
    const int64_t largest_power = (int64_t)1 << 62;

    if (x <= 0) {
        return 0;
    }
    if (x > largest_power) {
        return largest_power;
    }

    // Propagate the highest set bit of (x - 1) into every lower position,
    // then add one to reach the next power of two.
    uint64_t bits = (uint64_t)x - 1;
    bits |= bits >> 1;
    bits |= bits >> 2;
    bits |= bits >> 4;
    bits |= bits >> 8;
    bits |= bits >> 16;
    bits |= bits >> 32;
    return (int64_t)(bits + 1);
}
|
|
20
|
+
|
|
21
|
+
#define _REDUCE(offset) \
|
|
22
|
+
if (tid < offset) { \
|
|
23
|
+
impl.Reduce(sdata[(tid + offset)], sdata[tid]); \
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
// reference: cupy reduction kernel
|
|
27
|
+
|
|
28
|
+
// Generic reduction kernel.
//
// Each block produces whole output elements: block b handles outputs
// b, b + gridDim.x, b + 2*gridDim.x, ... For one output element every thread
// folds a strided subset of the reduced elements into a private accumulator
// (impl.MapIn + impl.Reduce), the per-thread accumulators are combined
// through shared memory, and thread 0 stores impl.MapOut(result).
//
// Requirements visible from the code below:
//   * blockDim.x must be a power of two no larger than 512 (the combine
//     ladder starts at offset 256 and reads sdata[tid + offset]).
//   * the launch must provide blockDim.x * sizeof(TypeReduce) bytes of
//     dynamic shared memory when blockDim.x >= 2.
//   * impl.Reduce(src, dst) accumulates src into dst.
template <typename TypeIn, typename TypeOut, typename ReductionImpl>
__global__ static void reduction_kernel(na_reduction_arg_t arg, ReductionImpl impl) {
    na_iarray_t& in_iarray = arg.in;
    na_iarray_t& out_iarray = arg.out;
    na_indexer_t& in_indexer = arg.in_indexer;
    na_indexer_t& out_indexer = arg.out_indexer;
    na_indexer_t& reduce_indexer = arg.reduce_indexer;

    using TypeReduce = decltype(impl.Identity());

    extern __shared__ __align__(8) char sdata_raw[];
    TypeReduce* sdata = (TypeReduce*)sdata_raw;
    unsigned int tid = threadIdx.x;
    unsigned int block_size = blockDim.x;  // number of threads

    for (uint64_t i_out = blockIdx.x; i_out < out_indexer.total_size; i_out += gridDim.x) {
        cumo_na_indexer_set_dim(&out_indexer, i_out);
        TypeReduce accum = impl.Identity();

        // The leading input dimensions are the output dimensions.
        for (int8_t i_out_dim = 0; i_out_dim < out_indexer.ndim; ++i_out_dim) {
            in_indexer.index[i_out_dim] = out_indexer.index[i_out_dim];
        }
        // 64-bit counter so that tid + k * block_size cannot wrap for very
        // large reductions.
        for (uint64_t i_reduce = tid; i_reduce < reduce_indexer.total_size; i_reduce += block_size) {
            cumo_na_indexer_set_dim(&reduce_indexer, i_reduce);
            // The trailing input dimensions are the reduced dimensions.
            for (int8_t i_reduce_dim = 0; i_reduce_dim < reduce_indexer.ndim; ++i_reduce_dim) {
                in_indexer.index[out_indexer.ndim + i_reduce_dim] = reduce_indexer.index[i_reduce_dim];
            }
            TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
            uint64_t i_in = in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr);
            impl.Reduce(impl.MapIn(*in_ptr, i_in), accum);
        }

        if (block_size >= 2) {
            sdata[tid] = accum;
            __syncthreads();

            // Pairwise combine, halving the number of active threads at
            // each step. block_size is uniform across the block, so every
            // barrier below is reached by all threads or by none.
            if (block_size > 256) { _REDUCE(256); __syncthreads(); }
            if (block_size > 128) { _REDUCE(128); __syncthreads(); }
            if (block_size > 64)  { _REDUCE(64);  __syncthreads(); }
            if (block_size > 32)  { _REDUCE(32);  __syncthreads(); }
            if (block_size > 16)  { _REDUCE(16);  __syncthreads(); }
            if (block_size > 8)   { _REDUCE(8);   __syncthreads(); }
            if (block_size > 4)   { _REDUCE(4);   __syncthreads(); }
            if (block_size > 2)   { _REDUCE(2);   __syncthreads(); }
            _REDUCE(1);
            // Only thread 0 consumes the block-wide result, and it is the
            // thread that produced sdata[0] in the last step.
            if (tid == 0) {
                accum = sdata[0];
            }
        }
        if (tid == 0) {
            TypeOut* out_ptr = reinterpret_cast<TypeOut*>(cumo_na_iarray_at_dim(&out_iarray, &out_indexer));
            *out_ptr = impl.MapOut(accum);
        }
        if (block_size >= 2) {
            // Thread 0 may still be reading sdata[1] in the final combine
            // step; keep the other threads from overwriting shared memory
            // for the next output element until it is done.
            __syncthreads();
        }
    }
}
|
|
106
|
+
|
|
107
|
+
#undef _REDUCE
|
|
108
|
+
|
|
109
|
+
static constexpr size_t max_block_size = 512;
|
|
110
|
+
|
|
111
|
+
// Launches reduction_kernel for the given reduction description.
//
// One block is launched per output element. The block size is the number
// of elements reduced per output, rounded up to a power of two and capped at
// max_block_size, and each block gets one TypeReduce slot of dynamic shared
// memory per thread.
// Nothing is launched when the output is empty, because a grid of zero
// blocks is not a valid launch configuration.
template <typename TypeIn, typename TypeOut, typename ReductionImpl>
void cumo_reduce(na_reduction_arg_t arg, ReductionImpl&& impl) {
    na_indexer_t& out_indexer = arg.out_indexer;
    na_indexer_t& reduce_indexer = arg.reduce_indexer;

    using TypeReduce = decltype(impl.Identity());

    size_t grid_size = out_indexer.total_size;
    if (grid_size == 0) {
        return;  // no output element to compute
    }

    // At least one thread, even when there is nothing to reduce, so that
    // the identity value is still written to every output element.
    size_t block_size = round_up_to_power_of_2(std::max(int64_t{1}, static_cast<int64_t>(reduce_indexer.total_size)));
    block_size = std::min(max_block_size, block_size);
    size_t shared_mem_size = sizeof(TypeReduce) * block_size;

    reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, impl);
}
|
|
125
|
+
|
|
126
|
+
#endif // CUMO_REDUCE_KERNEL_H
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
#ifndef CUMO_TEMPLATE_H
|
|
2
|
+
#define CUMO_TEMPLATE_H
|
|
3
|
+
|
|
4
|
+
/*
 * Accessors for a loop descriptor `lp` (pointer to a structure with `n` and
 * `args` members, see na_loop_t in cumo/ndloop.h). All of them look at the
 * first dimension only: n[0] and iter[0].
 */

/* Stores the element count of the first dimension into `c`. Expands to a
 * braced block, not an expression. */
#define INIT_COUNTER( lp, c ) \
    { c = (lp)->n[0]; }

/* Element count of the first dimension. */
#define NDL_CNT(lp) ((lp)->n[0])
/* Start address of argument i: base pointer plus the first iterator's pos. */
#define NDL_PTR(lp,i) ((lp)->args[i].ptr + (lp)->args[i].iter[0].pos)
/* Step of argument i along the first dimension. */
#define NDL_STEP(lp,i) ((lp)->args[i].iter[0].step)
/* Offset table of argument i along the first dimension. */
#define NDL_IDX(lp,i) ((lp)->args[i].iter[0].idx)
/* Element size of argument i. */
#define NDL_ESZ(lp,i) ((lp)->args[i].elmsz)
/* Shape array of argument i. */
#define NDL_SHAPE(lp,i) ((lp)->args[i].shape)
|
|
13
|
+
|
|
14
|
+
/*
 * INIT_PTR: fetch the start address and the step of argument i.
 *   pt <- base pointer of the argument plus the first iterator's pos
 *   st <- first iterator's step
 * Expands to a braced block; `pt` and `st` must be assignable.
 */
#define INIT_PTR( lp, i, pt, st ) \
    { \
        pt = (lp)->args[i].ptr + (lp)->args[i].iter[0].pos; \
        st = (lp)->args[i].iter[0].step; \
    }

/*
 * INIT_PTR_IDX: same as INIT_PTR, and additionally
 *   id <- first iterator's offset table (idx)
 */
#define INIT_PTR_IDX( lp, i, pt, st, id ) \
    { \
        pt = (lp)->args[i].ptr + (lp)->args[i].iter[0].pos; \
        st = (lp)->args[i].iter[0].step; \
        id = (lp)->args[i].iter[0].idx; \
    }

/* INIT_ELMSIZE: es <- element size of argument i. */
#define INIT_ELMSIZE( lp, i, es ) \
    { \
        es = (lp)->args[i].elmsz; \
    }
|
|
31
|
+
|
|
32
|
+
/*
 * INIT_PTR_BIT: fetch the start position of bit-array argument i.
 * The first iterator's pos is split into a word address and a bit position
 * inside that word:
 *   ad <- base pointer viewed as BIT_DIGIT* advanced by pos / NB words
 *   ps <- pos % NB
 *   st <- first iterator's step
 * `ps` is assigned first and then reused, so it must be a plain variable.
 */
#define INIT_PTR_BIT( lp, i, ad, ps, st ) \
    { \
        ps = (lp)->args[i].iter[0].pos; \
        ad = (BIT_DIGIT*)((lp)->args[i].ptr) + ps/NB; \
        ps %= NB; \
        st = (lp)->args[i].iter[0].step; \
    }

/*
 * INIT_PTR_BIT_IDX: same as INIT_PTR_BIT, and additionally
 *   id <- first iterator's offset table (idx)
 */
#define INIT_PTR_BIT_IDX( lp, i, ad, ps, st, id ) \
    { \
        ps = (lp)->args[i].iter[0].pos; \
        ad = (BIT_DIGIT*)((lp)->args[i].ptr) + ps/NB; \
        ps %= NB; \
        st = (lp)->args[i].iter[0].step; \
        id = (lp)->args[i].iter[0].idx; \
    }
|
|
48
|
+
|
|
49
|
+
/*
 * Element access helpers. `ptr` is a byte pointer, `type` the element type
 * stored at that address and `val` the value read or written. All of them
 * expand to a braced block, not an expression.
 *
 * Every macro argument is parenthesized so that arguments such as `*pp` or
 * `cond ? a : b` keep their meaning inside the expansion (in particular
 * `(idx)++` advances the offset cursor itself rather than whatever `idx`
 * happens to parse as next to `++`).
 */

/* val <- element at ptr. */
#define GET_DATA( ptr, type, val ) \
    { \
        (val) = *(type*)(ptr); \
    }

/* element at ptr <- val. */
#define SET_DATA( ptr, type, val ) \
    { \
        *(type*)(ptr) = (val); \
    }

/* val <- element at ptr, then advance ptr by step. */
#define GET_DATA_STRIDE( ptr, step, type, val ) \
    { \
        (val) = *(type*)(ptr); \
        (ptr) += (step); \
    }

/* val <- element at ptr + *idx, then advance idx to the next offset. */
#define GET_DATA_INDEX( ptr, idx, type, val ) \
    { \
        (val) = *(type*)((ptr) + *(idx)); \
        (idx)++; \
    }

/* element at ptr <- val, then advance ptr by step. */
#define SET_DATA_STRIDE( ptr, step, type, val ) \
    { \
        *(type*)(ptr) = (val); \
        (ptr) += (step); \
    }

/* element at ptr + *idx <- val, then advance idx to the next offset. */
#define SET_DATA_INDEX( ptr, idx, type, val ) \
    { \
        *(type*)((ptr) + *(idx)) = (val); \
        (idx)++; \
    }
|
|
82
|
+
|
|
83
|
+
/*
 * Single-bit access helpers for arrays of BIT_DIGIT words. `adr` is the
 * array base, `pos` a bit position counted from bit 0 of word 0, and NB the
 * number of bits per word. All of them expand to a braced block.
 *
 * The block-local temporaries carry a `na_` prefix and trailing underscore
 * so that they cannot capture a caller argument that happens to be named
 * `dig` or `bit`, and every argument is parenthesized.
 */

/* val <- bit at position pos (0 or 1). */
#define LOAD_BIT( adr, pos, val ) \
    { \
        size_t na_dig_ = (pos) / NB; \
        int na_bit_ = (pos) % NB; \
        (val) = (((BIT_DIGIT*)(adr))[na_dig_] >> na_bit_) & 1u; \
    }

/*
 * val <- next bit of a strided or indexed sequence.
 * With a non-NULL idx the bit at pos + *idx is read and idx is advanced;
 * otherwise the bit at pos is read and pos is advanced by step.
 */
#define LOAD_BIT_STEP( adr, pos, step, idx, val ) \
    { \
        size_t na_dig_; int na_bit_; \
        if (idx) { \
            na_dig_ = ((pos) + *(idx)) / NB; \
            na_bit_ = ((pos) + *(idx)) % NB; \
            (idx)++; \
        } else { \
            na_dig_ = (pos) / NB; \
            na_bit_ = (pos) % NB; \
            (pos) += (step); \
        } \
        (val) = (((BIT_DIGIT*)(adr))[na_dig_] >> na_bit_) & 1u; \
    }

/*
 * bit at position pos <- val.
 * val is converted to BIT_DIGIT before shifting so that the shift is never
 * performed on a signed int (shifting 1 into bit 31 of an int is undefined).
 * val is expected to be 0 or 1; it is not masked.
 */
#define STORE_BIT(adr,pos,val) \
    { \
        size_t na_dig_ = (pos) / NB; \
        int na_bit_ = (pos) % NB; \
        ((BIT_DIGIT*)(adr))[na_dig_] = \
            (((BIT_DIGIT*)(adr))[na_dig_] & ~((BIT_DIGIT)1 << na_bit_)) \
            | ((BIT_DIGIT)(val) << na_bit_); \
    }

/*
 * Strided / indexed variant of STORE_BIT; position handling is the same as
 * in LOAD_BIT_STEP. val is expected to be 0 or 1; it is not masked.
 */
#define STORE_BIT_STEP( adr, pos, step, idx, val )\
    { \
        size_t na_dig_; int na_bit_; \
        if (idx) { \
            na_dig_ = ((pos) + *(idx)) / NB; \
            na_bit_ = ((pos) + *(idx)) % NB; \
            (idx)++; \
        } else { \
            na_dig_ = (pos) / NB; \
            na_bit_ = (pos) % NB; \
            (pos) += (step); \
        } \
        ((BIT_DIGIT*)(adr))[na_dig_] = \
            (((BIT_DIGIT*)(adr))[na_dig_] & ~((BIT_DIGIT)1 << na_bit_)) \
            | ((BIT_DIGIT)(val) << na_bit_); \
    }
|
|
130
|
+
|
|
131
|
+
/*
 * Returns 1 when the address held in `ptr` is a multiple of `alignment`,
 * 0 otherwise. The test masks the low address bits with (alignment - 1), so
 * `alignment` has to be a power of two for the answer to be meaningful.
 */
static inline int
is_aligned(const void *ptr, const size_t alignment)
{
    const size_t low_bits_mask = alignment - 1;
    const size_t address = (size_t)ptr;

    return (address & low_bits_mask) == 0;
}
|
|
136
|
+
|
|
137
|
+
/*
 * Returns 1 when `step` is a multiple of `alignment`, 0 otherwise.
 * The step is converted to size_t before its low bits are masked with
 * (alignment - 1), so `alignment` has to be a power of two.
 */
static inline int
is_aligned_step(const ssize_t step, const size_t alignment)
{
    const size_t low_bits_mask = alignment - 1;

    return ((size_t)step & low_bits_mask) == 0;
}
|
|
142
|
+
|
|
143
|
+
/*
 * Prints `c_str` to stderr the first time control reaches this expansion
 * and does nothing on later passes. Each expansion site has its own static
 * flag. Expands to a braced block.
 *
 * The text is written with fputs so that it is emitted verbatim; it is
 * never interpreted as a printf format string, so a '%' in the message
 * cannot trigger a bogus conversion.
 */
#define SHOW_WARNING_ONCE( c_str ) \
    { \
        static bool show_warning = true; \
        if (show_warning) { \
            fputs((c_str), stderr); \
            show_warning = false; \
        } \
    }
|
|
151
|
+
|
|
152
|
+
/*
 * One-time warnings that a method synchronizes with the CPU.
 * `func_name` and `type_name` must be string literals: they are spliced
 * into the message by compile-time literal concatenation. The FIXME variant
 * differs only in the "FIXME: " tag of the message.
 */
#define SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE( func_name, type_name ) \
    SHOW_WARNING_ONCE("Warning: FIXME: Method \"" func_name "\" for dtype \"" type_name "\" synchronizes with CPU.\n")

#define SHOW_SYNCHRONIZE_WARNING_ONCE( func_name, type_name ) \
    SHOW_WARNING_ONCE("Warning: Method \"" func_name "\" for dtype \"" type_name "\" synchronizes with CPU.\n")
|
|
157
|
+
|
|
158
|
+
#endif /* ifndef CUMO_TEMPLATE_H */
|