cumo 0.1.0 → 0.1.1
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/3rd_party/LICENSE.txt +60 -0
- data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
- data/LICENSE.txt +1 -62
- data/README.md +33 -29
- data/bench/cumo_bench.rb +47 -25
- data/bench/numo_bench.rb +27 -25
- data/docs/src-tree.md +16 -0
- data/ext/cumo/cuda/cublas.c +69 -219
- data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
- data/ext/cumo/cuda/runtime.c +2 -14
- data/ext/cumo/cumo.c +16 -16
- data/ext/cumo/include/cumo.h +2 -2
- data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
- data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
- data/ext/cumo/include/cumo/indexer.h +46 -63
- data/ext/cumo/include/cumo/intern.h +58 -112
- data/ext/cumo/include/cumo/narray.h +214 -185
- data/ext/cumo/include/cumo/narray_kernel.h +66 -37
- data/ext/cumo/include/cumo/ndloop.h +42 -42
- data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
- data/ext/cumo/include/cumo/template.h +56 -51
- data/ext/cumo/include/cumo/template_kernel.h +31 -31
- data/ext/cumo/include/cumo/types/bit.h +3 -3
- data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
- data/ext/cumo/include/cumo/types/complex.h +126 -126
- data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
- data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
- data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
- data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
- data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
- data/ext/cumo/include/cumo/types/int_macro.h +1 -1
- data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
- data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
- data/ext/cumo/include/cumo/types/scomplex.h +5 -5
- data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
- data/ext/cumo/narray/array.c +143 -143
- data/ext/cumo/narray/data.c +184 -184
- data/ext/cumo/narray/gen/cogen.rb +5 -2
- data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
- data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
- data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
- data/ext/cumo/narray/gen/erbln.rb +132 -0
- data/ext/cumo/narray/gen/erbpp2.rb +18 -13
- data/ext/cumo/narray/gen/narray_def.rb +3 -3
- data/ext/cumo/narray/gen/spec.rb +2 -2
- data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
- data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
- data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
- data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
- data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
- data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
- data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
- data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
- data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
- data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
- data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
- data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
- data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
- data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
- data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
- data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
- data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
- data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
- data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
- data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
- data/ext/cumo/narray/gen/tmpl/each.c +9 -9
- data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
- data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
- data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
- data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
- data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
- data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
- data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
- data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
- data/ext/cumo/narray/gen/tmpl/format.c +11 -11
- data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
- data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
- data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
- data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
- data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
- data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
- data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
- data/ext/cumo/narray/gen/tmpl/median.c +10 -10
- data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
- data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
- data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
- data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
- data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
- data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
- data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
- data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
- data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
- data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
- data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
- data/ext/cumo/narray/gen/tmpl/store.c +6 -6
- data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
- data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
- data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
- data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
- data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
- data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
- data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
- data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
- data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
- data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
- data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
- data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
- data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
- data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
- data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
- data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
- data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
- data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
- data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
- data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
- data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
- data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
- data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
- data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
- data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
- data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
- data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
- data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
- data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
- data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
- data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
- data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
- data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
- data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
- data/ext/cumo/narray/index.c +213 -213
- data/ext/cumo/narray/math.c +27 -27
- data/ext/cumo/narray/narray.c +484 -484
- data/ext/cumo/narray/ndloop.c +259 -258
- data/ext/cumo/narray/rand.c +3 -3
- data/ext/cumo/narray/step.c +70 -70
- data/ext/cumo/narray/struct.c +139 -139
- metadata +6 -7
- data/ext/cumo/include/cumo/intern_fwd.h +0 -38
- data/lib/erbpp.rb +0 -294
- data/lib/erbpp/line_number.rb +0 -137
- data/lib/erbpp/narray_def.rb +0 -381
@@ -49,13 +49,23 @@ extern "C" {
 # endif
 #endif
 
+#ifndef SZF
 #define SZF PRI_SIZE_PREFIX // defined in ruby.h
+#endif
 
 #if SIZEOF_LONG==8
-#
-#
-#
-#
+# ifndef NUM2INT64
+# define NUM2INT64(x) NUM2LONG(x)
+# endif
+# ifndef INT642NUM
+# define INT642NUM(x) LONG2NUM(x)
+# endif
+# ifndef NUM2UINT64
+# define NUM2UINT64(x) NUM2ULONG(x)
+# endif
+# ifndef UINT642NUM
+# define UINT642NUM(x) ULONG2NUM(x)
+# endif
 # ifndef PRId64
 # define PRId64 "ld"
 # endif
@@ -63,10 +73,18 @@ extern "C" {
 # define PRIu64 "lu"
 # endif
 #elif SIZEOF_LONG_LONG==8
-#
-#
-#
-#
+# ifndef NUM2INT64
+# define NUM2INT64(x) NUM2LL(x)
+# endif
+# ifndef INT642NUM
+# define INT642NUM(x) LL2NUM(x)
+# endif
+# ifndef NUM2UINT64
+# define NUM2UINT64(x) NUM2ULL(x)
+# endif
+# ifndef UINT642NUM
+# define UINT642NUM(x) ULL2NUM(x)
+# endif
 # ifndef PRId64
 # define PRId64 "lld"
 # endif
@@ -76,10 +94,18 @@ extern "C" {
 #endif
 
 #if SIZEOF_LONG==4
-#
-#
-#
-#
+# ifndef NUM2INT32
+# define NUM2INT32(x) NUM2LONG(x)
+# endif
+# ifndef INT322NUM
+# define INT322NUM(x) LONG2NUM(x)
+# endif
+# ifndef NUM2UINT32
+# define NUM2UINT32(x) NUM2ULONG(x)
+# endif
+# ifndef UINT322NUM
+# define UINT322NUM(x) ULONG2NUM(x)
+# endif
 # ifndef PRId32
 # define PRId32 "ld"
 # endif
@@ -87,10 +113,18 @@ extern "C" {
 # define PRIu32 "lu"
 # endif
 #elif SIZEOF_INT==4
-#
-#
-#
-#
+# ifndef NUM2INT32
+# define NUM2INT32(x) NUM2INT(x)
+# endif
+# ifndef INT322NUM
+# define INT322NUM(x) INT2NUM(x)
+# endif
+# ifndef NUM2UINT32
+# define NUM2UINT32(x) NUM2UINT(x)
+# endif
+# ifndef UINT322NUM
+# define UINT322NUM(x) UINT2NUM(x)
+# endif
 # ifndef PRId32
 # define PRId32 "d"
 # endif
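The four hunks above wrap the fixed-width integer conversion macros (NUM2INT64, INT642NUM, NUM2UINT32, and so on) in #ifndef guards, so they are only supplied when not already defined elsewhere (for example by ruby.h). As a rough illustration of what these macros are for, a minimal sketch, not code from the gem; the function name is hypothetical and it assumes the cumo header shown above is already included:

#include <ruby.h>
#include <stdint.h>

/* Illustrative only: a Ruby C extension function using the guarded
 * conversion macros defined in the header above. */
static VALUE
example_add_int64(VALUE self, VALUE a, VALUE b)
{
    int64_t x = NUM2INT64(a);  /* Ruby Integer -> int64_t (NUM2LONG or NUM2LL underneath) */
    int64_t y = NUM2INT64(b);
    return INT642NUM(x + y);   /* int64_t -> Ruby Integer (LONG2NUM or LL2NUM underneath) */
}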
@@ -109,32 +143,27 @@ extern "C" {
 # define TRUE 1
 #endif
 
-typedef struct { float dat[2]; }
-typedef struct { double dat[2]; }
-typedef int fortran_integer;
-
-#define REAL(x) ((x).dat[0])
-#define IMAG(x) ((x).dat[1])
+typedef struct { float dat[2]; } cumo_scomplex;
+typedef struct { double dat[2]; } cumo_dcomplex;
 
-
+#define CUMO_REAL(x) ((x).dat[0])
+#define CUMO_IMAG(x) ((x).dat[1])
 
-
-#define NARRAY_VIEW_T 0x2
-#define NARRAY_FILEMAP_T 0x3
+extern int cumo_na_debug_flag;
 
-
-#define
-#define
+#define CUMO_NARRAY_DATA_T 0x1
+#define CUMO_NARRAY_VIEW_T 0x2
+#define CUMO_NARRAY_FILEMAP_T 0x3
 
-
-#define
-#define
-#define BALL (~(BIT_DIGIT)0)
-#define SLB(n) (((n)==NB)?~(BIT_DIGIT)0:(~(~(BIT_DIGIT)0<<(n))))
+//#define CUMO_NA_MAX_DIMENSION (int)(sizeof(VALUE)*8-2)
+#define CUMO_NA_MAX_DIMENSION 12
+#define CUMO_NA_MAX_ELMSZ 65535
 
-
-#define
-#define
+typedef unsigned int CUMO_BIT_DIGIT;
+#define CUMO_BYTE_BIT_DIGIT sizeof(CUMO_BIT_DIGIT)
+#define CUMO_NB (sizeof(CUMO_BIT_DIGIT)*8)
+#define CUMO_BALL (~(CUMO_BIT_DIGIT)0)
+#define CUMO_SLB(n) (((n)==CUMO_NB)?~(CUMO_BIT_DIGIT)0:(~(~(CUMO_BIT_DIGIT)0<<(n))))
 
 #include "cumo/indexer.h"
 #include "cumo/intern_kernel.h"
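This hunk renames the narray core types and macros with a cumo_/CUMO_ prefix (cumo_scomplex, CUMO_NARRAY_VIEW_T, CUMO_BIT_DIGIT, and so on), presumably to avoid name collisions with Numo's headers when both gems are loaded. A small illustrative snippet (not from the gem) of what the renamed bit-mask helpers compute, assuming CUMO_BIT_DIGIT is the unsigned int defined above (typically 32 bits):

CUMO_BIT_DIGIT low3 = CUMO_SLB(3);        /* 0b111: mask with the lowest 3 bits set */
CUMO_BIT_DIGIT all  = CUMO_SLB(CUMO_NB);  /* == CUMO_BALL; the (n)==CUMO_NB branch avoids an undefined shift by the full word width */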
@@ -1,29 +1,29 @@
 #ifndef CUMO_NDLOOP_H
 #define CUMO_NDLOOP_H
 
-typedef struct
+typedef struct {
 ssize_t pos; // - required for each dimension.
 ssize_t step;
 size_t *idx;
-}
+} cumo_na_loop_iter_t;
 
-typedef struct
+typedef struct {
 VALUE value;
-ssize_t elmsz;
+ssize_t elmsz; // element size in bytes, e.g., 4 for int, 8 for double
 char *ptr;
 //char *buf_ptr; //
 int ndim; // required for each argument.
 // ssize_t pos; - not required here.
 size_t *shape;
-
-}
+cumo_na_loop_iter_t *iter; // moved from cumo_na_loop_t
+} cumo_na_loop_args_t;
 
 // pass this structure to user iterator
-typedef struct
+typedef struct {
 int narg;
 int ndim; // n of user dimention used at user function.
 size_t *n; // n of elements for each dim (=shape)
-
+cumo_na_loop_args_t *args; // for each arg
 VALUE option;
 void *opt_ptr;
 VALUE err_type;
@@ -31,65 +31,65 @@ typedef struct NA_LOOP {
 // 3 for sum(), 1 for sum(axis: 1), 2 for sum(axis: [1,2])
 VALUE reduce; // dimension indicies to reduce in reduction kernel (in bits), e.g., for an array of shape:
 // [2,3,4], 111b for sum(), 010b for sum(axis: 1), 110b for sum(axis: [1,2])
-}
+} cumo_na_loop_t;
 
 
 // ------------------ ndfunc -------------------------------------------
 
-#define
-#define
-#define
-#define
-#define
-#define
+#define CUMO_NDF_HAS_LOOP (1<<0) // x[i]
+#define CUMO_NDF_STRIDE_LOOP (1<<1) // *(x+stride*i)
+#define CUMO_NDF_INDEX_LOOP (1<<2) // *(x+idx[i])
+#define CUMO_NDF_KEEP_DIM (1<<3)
+#define CUMO_NDF_INPLACE (1<<4)
+#define CUMO_NDF_ACCEPT_BYTESWAP (1<<5)
 
-#define
-#define
-#define
+#define CUMO_NDF_FLAT_REDUCE (1<<6)
+#define CUMO_NDF_EXTRACT (1<<7)
+#define CUMO_NDF_CUM (1<<8)
 
-#define
+#define CUMO_NDF_INDEXER_LOOP (1<<9) // Cumo custom. Use cumo own indexer.
 
-#define
-#define
-#define
-#define
-#define
+#define CUMO_FULL_LOOP (CUMO_NDF_HAS_LOOP|CUMO_NDF_STRIDE_LOOP|CUMO_NDF_INDEX_LOOP|CUMO_NDF_INPLACE)
+#define CUMO_FULL_LOOP_NIP (CUMO_NDF_HAS_LOOP|CUMO_NDF_STRIDE_LOOP|CUMO_NDF_INDEX_LOOP)
+#define CUMO_STRIDE_LOOP (CUMO_NDF_HAS_LOOP|CUMO_NDF_STRIDE_LOOP|CUMO_NDF_INPLACE)
+#define CUMO_STRIDE_LOOP_NIP (CUMO_NDF_HAS_LOOP|CUMO_NDF_STRIDE_LOOP)
+#define CUMO_NO_LOOP 0
 
-#define
+#define CUMO_OVERWRITE Qtrue // used for CASTABLE(t)
 
-#define
-#define
+#define CUMO_NDF_TEST(nf,fl) ((nf)->flag & (fl))
+#define CUMO_NDF_SET(nf,fl) {(nf)->flag |= (fl);}
 
-#define
-#define
-#define
+#define CUMO_NDF_ARG_READ_ONLY 1
+#define CUMO_NDF_ARG_WRITE_ONLY 2
+#define CUMO_NDF_ARG_READ_WRITE 3
 
 // type of user function
-typedef void (*
-typedef VALUE (*
-//typedef void (*) void (*loop_func)(
+typedef void (*cumo_na_iter_func_t) _((cumo_na_loop_t *const));
+typedef VALUE (*cumo_na_text_func_t) _((char *ptr, size_t pos, VALUE opt));
+//typedef void (*) void (*loop_func)(cumo_ndfunc_t*, cumo_na_md_loop_t*))
 
 
-typedef struct
+typedef struct {
 VALUE type; // argument types
 int dim; // # of dimension of argument handled by user function
 // if dim==-1, reduce dimension
-}
+} cumo_ndfunc_arg_in_t;
 
-typedef struct
+typedef struct {
 VALUE type; // argument types
 int dim; // # of dimension of argument handled by user function
 size_t *shape;
-}
+} cumo_ndfunc_arg_out_t;
 
 // spec of user function
-typedef struct
-
+typedef struct {
+cumo_na_iter_func_t func; // user function
 unsigned int flag; // what kind of loop user function supports
 int nin; // # of arguments
 int nout; // # of results
-
-
-}
+cumo_ndfunc_arg_in_t *ain; // spec of input arguments
+cumo_ndfunc_arg_out_t *aout; // spec of output result
+} cumo_ndfunc_t;
 
 #endif /* CUMO_NDLOOP_H */
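The ndloop.h hunks above give the previously anonymous loop and ndfunc structs cumo_-prefixed names (cumo_na_loop_t, cumo_ndfunc_t, ...) and prefix the NDF_* flags with CUMO_. A hedged sketch of how these pieces fit together when describing an element-wise operation; the iterator body and the element-type VALUE are placeholders, and the dispatcher call that would consume the cumo_ndfunc_t is omitted because it is not part of this diff:

// Hypothetical user iterator matching cumo_na_iter_func_t.
static void example_iter(cumo_na_loop_t *const lp)
{
    // A real iterator walks lp->n[...] elements of lp->args[0] (input)
    // and lp->args[1] (output) via each argument's ptr/iter fields.
    (void)lp;
}

static void example_build_ndfunc(VALUE elem_type)  // elem_type: an narray class VALUE (placeholder)
{
    cumo_ndfunc_arg_in_t ain[1]   = { { elem_type, 0 } };        // one input, handled element-wise
    cumo_ndfunc_arg_out_t aout[1] = { { elem_type, 0, NULL } };  // one output, no extra shape
    cumo_ndfunc_t ndf = { example_iter, CUMO_STRIDE_LOOP, 1, 1, ain, aout };
    (void)ndf;  // a real extension would now hand &ndf to the ndloop dispatcher (not shown here)
}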
@@ -7,6 +7,11 @@
 
 #include "cumo/indexer.h"
 
+namespace cumo_detail {
+
+static constexpr int64_t max_block_size = 512;
+static constexpr int64_t max_grid_size = 0x7fffffff;
+
 static inline int64_t round_up_to_power_of_2(int64_t x) {
 --x;
 x |= x >> 1;
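The first reduce_kernel.h hunk introduces a cumo_detail namespace holding the launch constants (max_block_size, max_grid_size) around the existing round_up_to_power_of_2 helper. Only the first and last lines of that helper appear in the diff context; the body below is the standard bit-twiddling idiom those lines suggest, reconstructed purely for illustration and not copied from the gem:

// Presumed shape of the helper: propagate the highest set bit of (x-1)
// into all lower positions, then add 1 to reach the next power of two.
static inline int64_t round_up_to_power_of_2(int64_t x) {
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x |= x >> 32;
    return x + 1;
}
// e.g. round_up_to_power_of_2(5) == 8, round_up_to_power_of_2(8) == 8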
@@ -18,109 +23,88 @@ static inline int64_t round_up_to_power_of_2(int64_t x) {
 return x + 1;
 }
 
-
-
-impl.Reduce(sdata[(tid + offset)], sdata[tid]); \
-}
-
-// reference: cupy reduction kernel
+// Reference: cupy reduction kernel
+// Note that reduction and out axis are inverse with cupy. Former axes are out axes, latters are reduce axes.
 
 template <typename TypeIn, typename TypeOut, typename ReductionImpl>
-__global__ static void reduction_kernel(
-
-
-
-
-na_indexer_t& reduce_indexer = arg.reduce_indexer;
+__global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_block_size, int reduce_block_size, ReductionImpl impl) {
+cumo_na_iarray_t& in_iarray = arg.in;
+cumo_na_iarray_t& out_iarray = arg.out;
+cumo_na_indexer_t& in_indexer = arg.in_indexer;
+cumo_na_indexer_t& out_indexer = arg.out_indexer;
 
 using TypeReduce = decltype(impl.Identity());
 
 extern __shared__ __align__(8) char sdata_raw[];
-TypeReduce* sdata = (
+TypeReduce* sdata = reinterpret_cast<TypeReduce*>(sdata_raw);
 unsigned int tid = threadIdx.x;
-unsigned int block_size = blockDim.x; // number of threads
 
-
+int64_t reduce_indexer_total_size = in_indexer.total_size / out_indexer.total_size;
+int64_t reduce_offset = tid / out_block_size; // # of cols == # of elems
+
+int64_t out_offset = tid % out_block_size; // # of rows
+int64_t out_base = blockIdx.x * out_block_size; // # of rows
+int64_t out_stride = gridDim.x * out_block_size; // # of rows
+
+for (int64_t i_out = out_base + out_offset; i_out < out_indexer.total_size; i_out += out_stride) {
 cumo_na_indexer_set_dim(&out_indexer, i_out);
 TypeReduce accum = impl.Identity();
 
-
-
-
-for (auto i_reduce = tid; i_reduce < reduce_indexer.total_size; i_reduce += block_size) {
-cumo_na_indexer_set_dim(&reduce_indexer, i_reduce);
-for (int8_t i_reduce_dim = 0; i_reduce_dim < reduce_indexer.ndim; ++i_reduce_dim) {
-in_indexer.index[out_indexer.ndim + i_reduce_dim] = reduce_indexer.index[i_reduce_dim];
-}
+int64_t i_in = i_out * reduce_indexer_total_size + reduce_offset;
+for (int64_t i_reduce = reduce_offset; i_reduce < reduce_indexer_total_size; i_reduce += reduce_block_size, i_in += reduce_block_size) {
+cumo_na_indexer_set_dim(&in_indexer, i_in);
 TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
-
-
+// Note that spec of (min|max)_index of cumo is different with arg(min|max) of cupy.
+// Cumo returns index of input elements, CuPy returns index of reduction axis.
+impl.Reduce(impl.MapIn(*in_ptr, in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr)), accum);
+//printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d accum:%d i_in:%ld i_reduce:%ld i_out:%ld in:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, accum, i_in, i_reduce, i_out, in_ptr, *in_ptr);
 }
 
-if (
+if (out_block_size <= max_block_size / 2) {
 sdata[tid] = accum;
 __syncthreads();
-
-
-if (
-if (
-
-if (block_size > 32) {
-if (block_size > 64) {
-if (block_size > 128) {
-if (block_size > 256) {
-_REDUCE(256);
-__syncthreads();
-}
-_REDUCE(128);
-__syncthreads();
-}
-_REDUCE(64);
-__syncthreads();
-}
-_REDUCE(32);
-__syncthreads();
-}
-_REDUCE(16);
-__syncthreads();
-}
-_REDUCE(8);
-__syncthreads();
+// NOTE: Compiler optimizes to unroll this loop
+for (int stride = max_block_size / 2; stride > 0; stride >>= 1) {
+if (out_block_size <= stride) {
+if (tid < stride) {
+impl.Reduce(sdata[tid + stride], sdata[tid]);
 }
-_REDUCE(4);
 __syncthreads();
 }
-_REDUCE(2);
-__syncthreads();
 }
-
-
+accum = sdata[tid];
+__syncthreads();
 }
-if (
+if (reduce_offset == 0 && i_out < out_indexer.total_size) {
 TypeOut* out_ptr = reinterpret_cast<TypeOut*>(cumo_na_iarray_at_dim(&out_iarray, &out_indexer));
 *out_ptr = impl.MapOut(accum);
-//printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d
+//printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d accum:%d i_out:%ld out:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, accum, i_out, out_ptr, *out_ptr);
 }
 }
 }
 
-
-
-static constexpr size_t max_block_size = 512;
+} // cumo_detail
 
+// TODO(sonots): Optimize indexer by squashing (or reducing) dimensions
 template <typename TypeIn, typename TypeOut, typename ReductionImpl>
-void cumo_reduce(
-
-
+void cumo_reduce(cumo_na_reduction_arg_t arg, ReductionImpl&& impl) {
+cumo_na_indexer_t& in_indexer = arg.in_indexer;
+cumo_na_indexer_t& out_indexer = arg.out_indexer;
 
-
+if (out_indexer.total_size == 0) {
+return;
+}
+
+int64_t reduce_total_size_pow2 = cumo_detail::round_up_to_power_of_2(std::max(size_t{1}, in_indexer.total_size / out_indexer.total_size));
+int64_t reduce_block_size = std::min(cumo_detail::max_block_size, reduce_total_size_pow2);
+int64_t out_block_size = cumo_detail::max_block_size / reduce_block_size;
+int64_t out_block_num = (out_indexer.total_size + out_block_size - 1) / out_block_size;
 
-
-
-
-
-size_t shared_mem_size = sizeof(TypeReduce) * block_size;
+int64_t block_size = cumo_detail::max_block_size;
+int64_t grid_size = std::min(cumo_detail::max_grid_size, out_block_num);
+int64_t shared_mem_size = sizeof(decltype(impl.Identity())) * block_size;
 
-reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, impl);
+cumo_detail::reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, out_block_size, reduce_block_size, impl);
 }
 
 #endif // CUMO_REDUCE_KERNEL_H
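This last hunk replaces the old macro-based shared-memory reduction cascade with a single strided loop, moves the kernel into the cumo_detail namespace, and has cumo_reduce compute the out/reduce block split on the host before launching. For orientation, a hedged sketch of the ReductionImpl interface the kernel calls (Identity, MapIn, Reduce, MapOut); this SumImpl is illustrative and not taken from the gem:

// Illustrative sum reduction over float inputs.
struct SumImpl {
    __device__ float Identity() const { return 0.0f; }                         // initial accumulator value
    __device__ float MapIn(float in, int64_t /*index*/) const { return in; }   // per-element transform; the index is unused for a plain sum
    __device__ void Reduce(float next, float& accum) const { accum += next; }  // fold one value into the accumulator
    __device__ float MapOut(float accum) const { return accum; }               // final transform before the store
};

// cumo_reduce<float, float, SumImpl>(arg, SumImpl{});  // launch, given a populated cumo_na_reduction_arg_t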