cumo 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (158) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/3rd_party/LICENSE.txt +60 -0
  4. data/3rd_party/mkmf-cu/lib/mkmf-cu/cli.rb +13 -1
  5. data/LICENSE.txt +1 -62
  6. data/README.md +33 -29
  7. data/bench/cumo_bench.rb +47 -25
  8. data/bench/numo_bench.rb +27 -25
  9. data/docs/src-tree.md +16 -0
  10. data/ext/cumo/cuda/cublas.c +69 -219
  11. data/ext/cumo/cuda/memory_pool_impl.hpp +1 -0
  12. data/ext/cumo/cuda/runtime.c +2 -14
  13. data/ext/cumo/cumo.c +16 -16
  14. data/ext/cumo/include/cumo.h +2 -2
  15. data/ext/cumo/include/cumo/cuda/cublas.h +6 -129
  16. data/ext/cumo/include/cumo/cuda/runtime.h +16 -0
  17. data/ext/cumo/include/cumo/indexer.h +46 -63
  18. data/ext/cumo/include/cumo/intern.h +58 -112
  19. data/ext/cumo/include/cumo/narray.h +214 -185
  20. data/ext/cumo/include/cumo/narray_kernel.h +66 -37
  21. data/ext/cumo/include/cumo/ndloop.h +42 -42
  22. data/ext/cumo/include/cumo/reduce_kernel.h +55 -71
  23. data/ext/cumo/include/cumo/template.h +56 -51
  24. data/ext/cumo/include/cumo/template_kernel.h +31 -31
  25. data/ext/cumo/include/cumo/types/bit.h +3 -3
  26. data/ext/cumo/include/cumo/types/bit_kernel.h +2 -2
  27. data/ext/cumo/include/cumo/types/complex.h +126 -126
  28. data/ext/cumo/include/cumo/types/complex_kernel.h +126 -126
  29. data/ext/cumo/include/cumo/types/complex_macro.h +28 -28
  30. data/ext/cumo/include/cumo/types/complex_macro_kernel.h +20 -20
  31. data/ext/cumo/include/cumo/types/dcomplex.h +5 -5
  32. data/ext/cumo/include/cumo/types/dcomplex_kernel.h +1 -1
  33. data/ext/cumo/include/cumo/types/int_macro.h +1 -1
  34. data/ext/cumo/include/cumo/types/int_macro_kernel.h +1 -1
  35. data/ext/cumo/include/cumo/types/robj_macro.h +30 -30
  36. data/ext/cumo/include/cumo/types/scomplex.h +5 -5
  37. data/ext/cumo/include/cumo/types/scomplex_kernel.h +1 -1
  38. data/ext/cumo/narray/array.c +143 -143
  39. data/ext/cumo/narray/data.c +184 -184
  40. data/ext/cumo/narray/gen/cogen.rb +5 -2
  41. data/ext/cumo/narray/gen/cogen_kernel.rb +5 -2
  42. data/ext/cumo/narray/gen/def/dcomplex.rb +1 -1
  43. data/ext/cumo/narray/gen/def/scomplex.rb +1 -1
  44. data/ext/cumo/narray/gen/erbln.rb +132 -0
  45. data/ext/cumo/narray/gen/erbpp2.rb +18 -13
  46. data/ext/cumo/narray/gen/narray_def.rb +3 -3
  47. data/ext/cumo/narray/gen/spec.rb +2 -2
  48. data/ext/cumo/narray/gen/tmpl/accum.c +15 -15
  49. data/ext/cumo/narray/gen/tmpl/accum_binary.c +22 -22
  50. data/ext/cumo/narray/gen/tmpl/accum_binary_kernel.cu +3 -3
  51. data/ext/cumo/narray/gen/tmpl/accum_index.c +30 -30
  52. data/ext/cumo/narray/gen/tmpl/accum_index_kernel.cu +2 -2
  53. data/ext/cumo/narray/gen/tmpl/accum_kernel.cu +3 -3
  54. data/ext/cumo/narray/gen/tmpl/alloc_func.c +14 -14
  55. data/ext/cumo/narray/gen/tmpl/allocate.c +11 -11
  56. data/ext/cumo/narray/gen/tmpl/aref.c +2 -2
  57. data/ext/cumo/narray/gen/tmpl/aref_cpu.c +4 -4
  58. data/ext/cumo/narray/gen/tmpl/aset.c +2 -2
  59. data/ext/cumo/narray/gen/tmpl/binary.c +28 -28
  60. data/ext/cumo/narray/gen/tmpl/binary2.c +18 -18
  61. data/ext/cumo/narray/gen/tmpl/binary2_kernel.cu +3 -3
  62. data/ext/cumo/narray/gen/tmpl/binary_kernel.cu +6 -6
  63. data/ext/cumo/narray/gen/tmpl/binary_s.c +13 -13
  64. data/ext/cumo/narray/gen/tmpl/binary_s_kernel.cu +3 -3
  65. data/ext/cumo/narray/gen/tmpl/bincount.c +23 -23
  66. data/ext/cumo/narray/gen/tmpl/cast.c +7 -7
  67. data/ext/cumo/narray/gen/tmpl/cast_array.c +3 -3
  68. data/ext/cumo/narray/gen/tmpl/clip.c +38 -38
  69. data/ext/cumo/narray/gen/tmpl/complex_accum_kernel.cu +2 -2
  70. data/ext/cumo/narray/gen/tmpl/cond_binary.c +19 -19
  71. data/ext/cumo/narray/gen/tmpl/cond_binary_kernel.cu +7 -7
  72. data/ext/cumo/narray/gen/tmpl/cond_unary.c +15 -15
  73. data/ext/cumo/narray/gen/tmpl/cum.c +15 -15
  74. data/ext/cumo/narray/gen/tmpl/each.c +9 -9
  75. data/ext/cumo/narray/gen/tmpl/each_with_index.c +9 -9
  76. data/ext/cumo/narray/gen/tmpl/ewcomp.c +15 -15
  77. data/ext/cumo/narray/gen/tmpl/ewcomp_kernel.cu +3 -3
  78. data/ext/cumo/narray/gen/tmpl/extract_cpu.c +5 -5
  79. data/ext/cumo/narray/gen/tmpl/extract_data.c +12 -12
  80. data/ext/cumo/narray/gen/tmpl/eye.c +9 -9
  81. data/ext/cumo/narray/gen/tmpl/eye_kernel.cu +3 -3
  82. data/ext/cumo/narray/gen/tmpl/fill.c +9 -9
  83. data/ext/cumo/narray/gen/tmpl/fill_kernel.cu +6 -6
  84. data/ext/cumo/narray/gen/tmpl/float_accum_kernel.cu +1 -1
  85. data/ext/cumo/narray/gen/tmpl/format.c +11 -11
  86. data/ext/cumo/narray/gen/tmpl/format_to_a.c +8 -8
  87. data/ext/cumo/narray/gen/tmpl/frexp.c +13 -13
  88. data/ext/cumo/narray/gen/tmpl/gemm.c +252 -108
  89. data/ext/cumo/narray/gen/tmpl/inspect.c +1 -1
  90. data/ext/cumo/narray/gen/tmpl/lib.c +2 -2
  91. data/ext/cumo/narray/gen/tmpl/logseq.c +7 -7
  92. data/ext/cumo/narray/gen/tmpl/logseq_kernel.cu +6 -6
  93. data/ext/cumo/narray/gen/tmpl/map_with_index.c +17 -17
  94. data/ext/cumo/narray/gen/tmpl/median.c +10 -10
  95. data/ext/cumo/narray/gen/tmpl/minmax.c +10 -10
  96. data/ext/cumo/narray/gen/tmpl/new_dim0.c +3 -3
  97. data/ext/cumo/narray/gen/tmpl/poly.c +6 -6
  98. data/ext/cumo/narray/gen/tmpl/pow.c +28 -28
  99. data/ext/cumo/narray/gen/tmpl/pow_kernel.cu +6 -6
  100. data/ext/cumo/narray/gen/tmpl/rand.c +10 -10
  101. data/ext/cumo/narray/gen/tmpl/rand_norm.c +7 -7
  102. data/ext/cumo/narray/gen/tmpl/real_accum_kernel.cu +6 -6
  103. data/ext/cumo/narray/gen/tmpl/seq.c +7 -7
  104. data/ext/cumo/narray/gen/tmpl/seq_kernel.cu +6 -6
  105. data/ext/cumo/narray/gen/tmpl/set2.c +20 -20
  106. data/ext/cumo/narray/gen/tmpl/sort.c +11 -11
  107. data/ext/cumo/narray/gen/tmpl/sort_index.c +18 -18
  108. data/ext/cumo/narray/gen/tmpl/store.c +6 -6
  109. data/ext/cumo/narray/gen/tmpl/store_array.c +19 -19
  110. data/ext/cumo/narray/gen/tmpl/store_array_kernel.cu +12 -12
  111. data/ext/cumo/narray/gen/tmpl/store_bit.c +23 -23
  112. data/ext/cumo/narray/gen/tmpl/store_bit_kernel.cu +28 -28
  113. data/ext/cumo/narray/gen/tmpl/store_from.c +16 -16
  114. data/ext/cumo/narray/gen/tmpl/store_from_kernel.cu +12 -12
  115. data/ext/cumo/narray/gen/tmpl/to_a.c +10 -10
  116. data/ext/cumo/narray/gen/tmpl/unary.c +25 -25
  117. data/ext/cumo/narray/gen/tmpl/unary2.c +17 -17
  118. data/ext/cumo/narray/gen/tmpl/unary_kernel.cu +15 -15
  119. data/ext/cumo/narray/gen/tmpl/unary_ret2.c +13 -13
  120. data/ext/cumo/narray/gen/tmpl/unary_s.c +17 -17
  121. data/ext/cumo/narray/gen/tmpl/unary_s_kernel.cu +12 -12
  122. data/ext/cumo/narray/gen/tmpl_bit/allocate.c +9 -9
  123. data/ext/cumo/narray/gen/tmpl_bit/aref.c +2 -2
  124. data/ext/cumo/narray/gen/tmpl_bit/aref_cpu.c +5 -5
  125. data/ext/cumo/narray/gen/tmpl_bit/aset.c +2 -2
  126. data/ext/cumo/narray/gen/tmpl_bit/binary.c +29 -29
  127. data/ext/cumo/narray/gen/tmpl_bit/bit_count.c +14 -14
  128. data/ext/cumo/narray/gen/tmpl_bit/bit_count_cpu.c +21 -21
  129. data/ext/cumo/narray/gen/tmpl_bit/bit_count_kernel.cu +28 -28
  130. data/ext/cumo/narray/gen/tmpl_bit/bit_reduce.c +29 -29
  131. data/ext/cumo/narray/gen/tmpl_bit/each.c +10 -10
  132. data/ext/cumo/narray/gen/tmpl_bit/each_with_index.c +10 -10
  133. data/ext/cumo/narray/gen/tmpl_bit/extract.c +8 -8
  134. data/ext/cumo/narray/gen/tmpl_bit/extract_cpu.c +8 -8
  135. data/ext/cumo/narray/gen/tmpl_bit/fill.c +17 -17
  136. data/ext/cumo/narray/gen/tmpl_bit/format.c +14 -14
  137. data/ext/cumo/narray/gen/tmpl_bit/format_to_a.c +11 -11
  138. data/ext/cumo/narray/gen/tmpl_bit/inspect.c +3 -3
  139. data/ext/cumo/narray/gen/tmpl_bit/mask.c +33 -33
  140. data/ext/cumo/narray/gen/tmpl_bit/store_array.c +19 -19
  141. data/ext/cumo/narray/gen/tmpl_bit/store_bit.c +22 -22
  142. data/ext/cumo/narray/gen/tmpl_bit/store_from.c +18 -18
  143. data/ext/cumo/narray/gen/tmpl_bit/to_a.c +12 -12
  144. data/ext/cumo/narray/gen/tmpl_bit/unary.c +24 -24
  145. data/ext/cumo/narray/gen/tmpl_bit/where.c +16 -16
  146. data/ext/cumo/narray/gen/tmpl_bit/where2.c +20 -20
  147. data/ext/cumo/narray/index.c +213 -213
  148. data/ext/cumo/narray/math.c +27 -27
  149. data/ext/cumo/narray/narray.c +484 -484
  150. data/ext/cumo/narray/ndloop.c +259 -258
  151. data/ext/cumo/narray/rand.c +3 -3
  152. data/ext/cumo/narray/step.c +70 -70
  153. data/ext/cumo/narray/struct.c +139 -139
  154. metadata +6 -7
  155. data/ext/cumo/include/cumo/intern_fwd.h +0 -38
  156. data/lib/erbpp.rb +0 -294
  157. data/lib/erbpp/line_number.rb +0 -137
  158. data/lib/erbpp/narray_def.rb +0 -381
@@ -49,13 +49,23 @@ extern "C" {
49
49
  # endif
50
50
  #endif
51
51
 
52
+ #ifndef SZF
52
53
  #define SZF PRI_SIZE_PREFIX // defined in ruby.h
54
+ #endif
53
55
 
54
56
  #if SIZEOF_LONG==8
55
- # define NUM2INT64(x) NUM2LONG(x)
56
- # define INT642NUM(x) LONG2NUM(x)
57
- # define NUM2UINT64(x) NUM2ULONG(x)
58
- # define UINT642NUM(x) ULONG2NUM(x)
57
+ # ifndef NUM2INT64
58
+ # define NUM2INT64(x) NUM2LONG(x)
59
+ # endif
60
+ # ifndef INT642NUM
61
+ # define INT642NUM(x) LONG2NUM(x)
62
+ # endif
63
+ # ifndef NUM2UINT64
64
+ # define NUM2UINT64(x) NUM2ULONG(x)
65
+ # endif
66
+ # ifndef UINT642NUM
67
+ # define UINT642NUM(x) ULONG2NUM(x)
68
+ # endif
59
69
  # ifndef PRId64
60
70
  # define PRId64 "ld"
61
71
  # endif
@@ -63,10 +73,18 @@ extern "C" {
63
73
  # define PRIu64 "lu"
64
74
  # endif
65
75
  #elif SIZEOF_LONG_LONG==8
66
- # define NUM2INT64(x) NUM2LL(x)
67
- # define INT642NUM(x) LL2NUM(x)
68
- # define NUM2UINT64(x) NUM2ULL(x)
69
- # define UINT642NUM(x) ULL2NUM(x)
76
+ # ifndef NUM2INT64
77
+ # define NUM2INT64(x) NUM2LL(x)
78
+ # endif
79
+ # ifndef INT642NUM
80
+ # define INT642NUM(x) LL2NUM(x)
81
+ # endif
82
+ # ifndef NUM2UINT64
83
+ # define NUM2UINT64(x) NUM2ULL(x)
84
+ # endif
85
+ # ifndef UINT642NUM
86
+ # define UINT642NUM(x) ULL2NUM(x)
87
+ # endif
70
88
  # ifndef PRId64
71
89
  # define PRId64 "lld"
72
90
  # endif
@@ -76,10 +94,18 @@ extern "C" {
76
94
  #endif
77
95
 
78
96
  #if SIZEOF_LONG==4
79
- # define NUM2INT32(x) NUM2LONG(x)
80
- # define INT322NUM(x) LONG2NUM(x)
81
- # define NUM2UINT32(x) NUM2ULONG(x)
82
- # define UINT322NUM(x) ULONG2NUM(x)
97
+ # ifndef NUM2INT32
98
+ # define NUM2INT32(x) NUM2LONG(x)
99
+ # endif
100
+ # ifndef INT322NUM
101
+ # define INT322NUM(x) LONG2NUM(x)
102
+ # endif
103
+ # ifndef NUM2UINT32
104
+ # define NUM2UINT32(x) NUM2ULONG(x)
105
+ # endif
106
+ # ifndef UINT322NUM
107
+ # define UINT322NUM(x) ULONG2NUM(x)
108
+ # endif
83
109
  # ifndef PRId32
84
110
  # define PRId32 "ld"
85
111
  # endif
@@ -87,10 +113,18 @@ extern "C" {
87
113
  # define PRIu32 "lu"
88
114
  # endif
89
115
  #elif SIZEOF_INT==4
90
- # define NUM2INT32(x) NUM2INT(x)
91
- # define INT322NUM(x) INT2NUM(x)
92
- # define NUM2UINT32(x) NUM2UINT(x)
93
- # define UINT322NUM(x) UINT2NUM(x)
116
+ # ifndef NUM2INT32
117
+ # define NUM2INT32(x) NUM2INT(x)
118
+ # endif
119
+ # ifndef INT322NUM
120
+ # define INT322NUM(x) INT2NUM(x)
121
+ # endif
122
+ # ifndef NUM2UINT32
123
+ # define NUM2UINT32(x) NUM2UINT(x)
124
+ # endif
125
+ # ifndef UINT322NUM
126
+ # define UINT322NUM(x) UINT2NUM(x)
127
+ # endif
94
128
  # ifndef PRId32
95
129
  # define PRId32 "d"
96
130
  # endif
@@ -109,32 +143,27 @@ extern "C" {
109
143
  # define TRUE 1
110
144
  #endif
111
145
 
112
- typedef struct { float dat[2]; } scomplex;
113
- typedef struct { double dat[2]; } dcomplex;
114
- typedef int fortran_integer;
115
-
116
- #define REAL(x) ((x).dat[0])
117
- #define IMAG(x) ((x).dat[1])
146
+ typedef struct { float dat[2]; } cumo_scomplex;
147
+ typedef struct { double dat[2]; } cumo_dcomplex;
118
148
 
119
- extern int na_debug_flag;
149
+ #define CUMO_REAL(x) ((x).dat[0])
150
+ #define CUMO_IMAG(x) ((x).dat[1])
120
151
 
121
- #define NARRAY_DATA_T 0x1
122
- #define NARRAY_VIEW_T 0x2
123
- #define NARRAY_FILEMAP_T 0x3
152
+ extern int cumo_na_debug_flag;
124
153
 
125
- //#define NA_MAX_DIMENSION (int)(sizeof(VALUE)*8-2)
126
- #define NA_MAX_DIMENSION 12
127
- #define NA_MAX_ELMSZ 65535
154
+ #define CUMO_NARRAY_DATA_T 0x1
155
+ #define CUMO_NARRAY_VIEW_T 0x2
156
+ #define CUMO_NARRAY_FILEMAP_T 0x3
128
157
 
129
- typedef unsigned int BIT_DIGIT;
130
- #define BYTE_BIT_DIGIT sizeof(BIT_DIGIT)
131
- #define NB (sizeof(BIT_DIGIT)*8)
132
- #define BALL (~(BIT_DIGIT)0)
133
- #define SLB(n) (((n)==NB)?~(BIT_DIGIT)0:(~(~(BIT_DIGIT)0<<(n))))
158
+ //#define CUMO_NA_MAX_DIMENSION (int)(sizeof(VALUE)*8-2)
159
+ #define CUMO_NA_MAX_DIMENSION 12
160
+ #define CUMO_NA_MAX_ELMSZ 65535
134
161
 
135
- #define ELEMENT_BIT_SIZE "ELEMENT_BIT_SIZE"
136
- #define ELEMENT_BYTE_SIZE "ELEMENT_BYTE_SIZE"
137
- #define CONTIGUOUS_STRIDE "CONTIGUOUS_STRIDE"
162
+ typedef unsigned int CUMO_BIT_DIGIT;
163
+ #define CUMO_BYTE_BIT_DIGIT sizeof(CUMO_BIT_DIGIT)
164
+ #define CUMO_NB (sizeof(CUMO_BIT_DIGIT)*8)
165
+ #define CUMO_BALL (~(CUMO_BIT_DIGIT)0)
166
+ #define CUMO_SLB(n) (((n)==CUMO_NB)?~(CUMO_BIT_DIGIT)0:(~(~(CUMO_BIT_DIGIT)0<<(n))))
138
167
 
139
168
  #include "cumo/indexer.h"
140
169
  #include "cumo/intern_kernel.h"
@@ -1,29 +1,29 @@
1
1
  #ifndef CUMO_NDLOOP_H
2
2
  #define CUMO_NDLOOP_H
3
3
 
4
- typedef struct NA_LOOP_ITER {
4
+ typedef struct {
5
5
  ssize_t pos; // - required for each dimension.
6
6
  ssize_t step;
7
7
  size_t *idx;
8
- } na_loop_iter_t;
8
+ } cumo_na_loop_iter_t;
9
9
 
10
- typedef struct NA_LOOP_ARGS {
10
+ typedef struct {
11
11
  VALUE value;
12
- ssize_t elmsz;
12
+ ssize_t elmsz; // element size in bytes, e.g., 4 for int, 8 for double
13
13
  char *ptr;
14
14
  //char *buf_ptr; //
15
15
  int ndim; // required for each argument.
16
16
  // ssize_t pos; - not required here.
17
17
  size_t *shape;
18
- na_loop_iter_t *iter; // moved from na_loop_t
19
- } na_loop_args_t;
18
+ cumo_na_loop_iter_t *iter; // moved from cumo_na_loop_t
19
+ } cumo_na_loop_args_t;
20
20
 
21
21
  // pass this structure to user iterator
22
- typedef struct NA_LOOP {
22
+ typedef struct {
23
23
  int narg;
24
24
  int ndim; // n of user dimensions used in the user function.
25
25
  size_t *n; // n of elements for each dim (=shape)
26
- na_loop_args_t *args; // for each arg
26
+ cumo_na_loop_args_t *args; // for each arg
27
27
  VALUE option;
28
28
  void *opt_ptr;
29
29
  VALUE err_type;
@@ -31,65 +31,65 @@ typedef struct NA_LOOP {
31
31
  // 3 for sum(), 1 for sum(axis: 1), 2 for sum(axis: [1,2])
32
32
  VALUE reduce; // dimension indices to reduce in reduction kernel (in bits), e.g., for an array of shape:
33
33
  // [2,3,4], 111b for sum(), 010b for sum(axis: 1), 110b for sum(axis: [1,2])
34
- } na_loop_t;
34
+ } cumo_na_loop_t;
35
35
 
36
36
 
37
37
  // ------------------ ndfunc -------------------------------------------
38
38
 
39
- #define NDF_HAS_LOOP (1<<0) // x[i]
40
- #define NDF_STRIDE_LOOP (1<<1) // *(x+stride*i)
41
- #define NDF_INDEX_LOOP (1<<2) // *(x+idx[i])
42
- #define NDF_KEEP_DIM (1<<3)
43
- #define NDF_INPLACE (1<<4)
44
- #define NDF_ACCEPT_BYTESWAP (1<<5)
39
+ #define CUMO_NDF_HAS_LOOP (1<<0) // x[i]
40
+ #define CUMO_NDF_STRIDE_LOOP (1<<1) // *(x+stride*i)
41
+ #define CUMO_NDF_INDEX_LOOP (1<<2) // *(x+idx[i])
42
+ #define CUMO_NDF_KEEP_DIM (1<<3)
43
+ #define CUMO_NDF_INPLACE (1<<4)
44
+ #define CUMO_NDF_ACCEPT_BYTESWAP (1<<5)
45
45
 
46
- #define NDF_FLAT_REDUCE (1<<6)
47
- #define NDF_EXTRACT (1<<7)
48
- #define NDF_CUM (1<<8)
46
+ #define CUMO_NDF_FLAT_REDUCE (1<<6)
47
+ #define CUMO_NDF_EXTRACT (1<<7)
48
+ #define CUMO_NDF_CUM (1<<8)
49
49
 
50
- #define NDF_INDEXER_LOOP (1<<9) // Cumo custom. Use cumo own indexer.
50
+ #define CUMO_NDF_INDEXER_LOOP (1<<9) // Cumo custom. Use cumo own indexer.
51
51
 
52
- #define FULL_LOOP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP|NDF_INPLACE)
53
- #define FULL_LOOP_NIP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INDEX_LOOP)
54
- #define STRIDE_LOOP (NDF_HAS_LOOP|NDF_STRIDE_LOOP|NDF_INPLACE)
55
- #define STRIDE_LOOP_NIP (NDF_HAS_LOOP|NDF_STRIDE_LOOP)
56
- #define NO_LOOP 0
52
+ #define CUMO_FULL_LOOP (CUMO_NDF_HAS_LOOP|CUMO_NDF_STRIDE_LOOP|CUMO_NDF_INDEX_LOOP|CUMO_NDF_INPLACE)
53
+ #define CUMO_FULL_LOOP_NIP (CUMO_NDF_HAS_LOOP|CUMO_NDF_STRIDE_LOOP|CUMO_NDF_INDEX_LOOP)
54
+ #define CUMO_STRIDE_LOOP (CUMO_NDF_HAS_LOOP|CUMO_NDF_STRIDE_LOOP|CUMO_NDF_INPLACE)
55
+ #define CUMO_STRIDE_LOOP_NIP (CUMO_NDF_HAS_LOOP|CUMO_NDF_STRIDE_LOOP)
56
+ #define CUMO_NO_LOOP 0
57
57
 
58
- #define OVERWRITE Qtrue // used for CASTABLE(t)
58
+ #define CUMO_OVERWRITE Qtrue // used for CASTABLE(t)
59
59
 
60
- #define NDF_TEST(nf,fl) ((nf)->flag & (fl))
61
- #define NDF_SET(nf,fl) {(nf)->flag |= (fl);}
60
+ #define CUMO_NDF_TEST(nf,fl) ((nf)->flag & (fl))
61
+ #define CUMO_NDF_SET(nf,fl) {(nf)->flag |= (fl);}
62
62
 
63
- #define NDF_ARG_READ_ONLY 1
64
- #define NDF_ARG_WRITE_ONLY 2
65
- #define NDF_ARG_READ_WRITE 3
63
+ #define CUMO_NDF_ARG_READ_ONLY 1
64
+ #define CUMO_NDF_ARG_WRITE_ONLY 2
65
+ #define CUMO_NDF_ARG_READ_WRITE 3
66
66
 
67
67
  // type of user function
68
- typedef void (*na_iter_func_t) _((na_loop_t *const));
69
- typedef VALUE (*na_text_func_t) _((char *ptr, size_t pos, VALUE opt));
70
- //typedef void (*) void (*loop_func)(ndfunc_t*, na_md_loop_t*))
68
+ typedef void (*cumo_na_iter_func_t) _((cumo_na_loop_t *const));
69
+ typedef VALUE (*cumo_na_text_func_t) _((char *ptr, size_t pos, VALUE opt));
70
+ //typedef void (*) void (*loop_func)(cumo_ndfunc_t*, cumo_na_md_loop_t*))
71
71
 
72
72
 
73
- typedef struct NDF_ARG_IN {
73
+ typedef struct {
74
74
  VALUE type; // argument types
75
75
  int dim; // # of dimension of argument handled by user function
76
76
  // if dim==-1, reduce dimension
77
- } ndfunc_arg_in_t;
77
+ } cumo_ndfunc_arg_in_t;
78
78
 
79
- typedef struct NDF_ARG_OUT {
79
+ typedef struct {
80
80
  VALUE type; // argument types
81
81
  int dim; // # of dimension of argument handled by user function
82
82
  size_t *shape;
83
- } ndfunc_arg_out_t;
83
+ } cumo_ndfunc_arg_out_t;
84
84
 
85
85
  // spec of user function
86
- typedef struct NDFUNCTION {
87
- na_iter_func_t func; // user function
86
+ typedef struct {
87
+ cumo_na_iter_func_t func; // user function
88
88
  unsigned int flag; // what kind of loop user function supports
89
89
  int nin; // # of arguments
90
90
  int nout; // # of results
91
- ndfunc_arg_in_t *ain; // spec of input arguments
92
- ndfunc_arg_out_t *aout; // spec of output result
93
- } ndfunc_t;
91
+ cumo_ndfunc_arg_in_t *ain; // spec of input arguments
92
+ cumo_ndfunc_arg_out_t *aout; // spec of output result
93
+ } cumo_ndfunc_t;
94
94
 
95
95
  #endif /* CUMO_NDLOOP_H */
@@ -7,6 +7,11 @@
7
7
 
8
8
  #include "cumo/indexer.h"
9
9
 
10
+ namespace cumo_detail {
11
+
12
+ static constexpr int64_t max_block_size = 512;
13
+ static constexpr int64_t max_grid_size = 0x7fffffff;
14
+
10
15
  static inline int64_t round_up_to_power_of_2(int64_t x) {
11
16
  --x;
12
17
  x |= x >> 1;
@@ -18,109 +23,88 @@ static inline int64_t round_up_to_power_of_2(int64_t x) {
18
23
  return x + 1;
19
24
  }
20
25
 
21
- #define _REDUCE(offset) \
22
- if (tid < offset) { \
23
- impl.Reduce(sdata[(tid + offset)], sdata[tid]); \
24
- }
25
-
26
- // reference: cupy reduction kernel
26
+ // Reference: cupy reduction kernel
27
+ // Note that the order of reduction and out axes is the inverse of cupy's: the former axes are out axes, the latter are reduce axes.
27
28
 
28
29
  template <typename TypeIn, typename TypeOut, typename ReductionImpl>
29
- __global__ static void reduction_kernel(na_reduction_arg_t arg, ReductionImpl impl) {
30
- na_iarray_t& in_iarray = arg.in;
31
- na_iarray_t& out_iarray = arg.out;
32
- na_indexer_t& in_indexer = arg.in_indexer;
33
- na_indexer_t& out_indexer = arg.out_indexer;
34
- na_indexer_t& reduce_indexer = arg.reduce_indexer;
30
+ __global__ static void reduction_kernel(cumo_na_reduction_arg_t arg, int out_block_size, int reduce_block_size, ReductionImpl impl) {
31
+ cumo_na_iarray_t& in_iarray = arg.in;
32
+ cumo_na_iarray_t& out_iarray = arg.out;
33
+ cumo_na_indexer_t& in_indexer = arg.in_indexer;
34
+ cumo_na_indexer_t& out_indexer = arg.out_indexer;
35
35
 
36
36
  using TypeReduce = decltype(impl.Identity());
37
37
 
38
38
  extern __shared__ __align__(8) char sdata_raw[];
39
- TypeReduce* sdata = (TypeReduce*)sdata_raw;
39
+ TypeReduce* sdata = reinterpret_cast<TypeReduce*>(sdata_raw);
40
40
  unsigned int tid = threadIdx.x;
41
- unsigned int block_size = blockDim.x; // number of threads
42
41
 
43
- for (uint64_t i_out = blockIdx.x; i_out < out_indexer.total_size; i_out += gridDim.x) {
42
+ int64_t reduce_indexer_total_size = in_indexer.total_size / out_indexer.total_size;
43
+ int64_t reduce_offset = tid / out_block_size; // # of cols == # of elems
44
+
45
+ int64_t out_offset = tid % out_block_size; // # of rows
46
+ int64_t out_base = blockIdx.x * out_block_size; // # of rows
47
+ int64_t out_stride = gridDim.x * out_block_size; // # of rows
48
+
49
+ for (int64_t i_out = out_base + out_offset; i_out < out_indexer.total_size; i_out += out_stride) {
44
50
  cumo_na_indexer_set_dim(&out_indexer, i_out);
45
51
  TypeReduce accum = impl.Identity();
46
52
 
47
- for (int8_t i_out_dim = 0; i_out_dim < out_indexer.ndim; ++i_out_dim) {
48
- in_indexer.index[i_out_dim] = out_indexer.index[i_out_dim];
49
- }
50
- for (auto i_reduce = tid; i_reduce < reduce_indexer.total_size; i_reduce += block_size) {
51
- cumo_na_indexer_set_dim(&reduce_indexer, i_reduce);
52
- for (int8_t i_reduce_dim = 0; i_reduce_dim < reduce_indexer.ndim; ++i_reduce_dim) {
53
- in_indexer.index[out_indexer.ndim + i_reduce_dim] = reduce_indexer.index[i_reduce_dim];
54
- }
53
+ int64_t i_in = i_out * reduce_indexer_total_size + reduce_offset;
54
+ for (int64_t i_reduce = reduce_offset; i_reduce < reduce_indexer_total_size; i_reduce += reduce_block_size, i_in += reduce_block_size) {
55
+ cumo_na_indexer_set_dim(&in_indexer, i_in);
55
56
  TypeIn* in_ptr = reinterpret_cast<TypeIn*>(cumo_na_iarray_at_dim(&in_iarray, &in_indexer));
56
- uint64_t i_in = in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr);
57
- impl.Reduce(impl.MapIn(*in_ptr, i_in), accum);
57
+ // Note that the spec of (min|max)_index of cumo differs from that of arg(min|max) of cupy.
58
+ // Cumo returns index of input elements, CuPy returns index of reduction axis.
59
+ impl.Reduce(impl.MapIn(*in_ptr, in_ptr - reinterpret_cast<TypeIn*>(in_iarray.ptr)), accum);
60
+ //printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d accum:%d i_in:%ld i_reduce:%ld i_out:%ld in:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, accum, i_in, i_reduce, i_out, in_ptr, *in_ptr);
58
61
  }
59
62
 
60
- if (block_size >= 2) {
63
+ if (out_block_size <= max_block_size / 2) {
61
64
  sdata[tid] = accum;
62
65
  __syncthreads();
63
-
64
- if (block_size > 2) {
65
- if (block_size > 4) {
66
- if (block_size > 8) {
67
- if (block_size > 16) {
68
- if (block_size > 32) {
69
- if (block_size > 64) {
70
- if (block_size > 128) {
71
- if (block_size > 256) {
72
- _REDUCE(256);
73
- __syncthreads();
74
- }
75
- _REDUCE(128);
76
- __syncthreads();
77
- }
78
- _REDUCE(64);
79
- __syncthreads();
80
- }
81
- _REDUCE(32);
82
- __syncthreads();
83
- }
84
- _REDUCE(16);
85
- __syncthreads();
86
- }
87
- _REDUCE(8);
88
- __syncthreads();
66
+ // NOTE: Compiler optimizes to unroll this loop
67
+ for (int stride = max_block_size / 2; stride > 0; stride >>= 1) {
68
+ if (out_block_size <= stride) {
69
+ if (tid < stride) {
70
+ impl.Reduce(sdata[tid + stride], sdata[tid]);
89
71
  }
90
- _REDUCE(4);
91
72
  __syncthreads();
92
73
  }
93
- _REDUCE(2);
94
- __syncthreads();
95
74
  }
96
- _REDUCE(1);
97
- accum = sdata[0];
75
+ accum = sdata[tid];
76
+ __syncthreads();
98
77
  }
99
- if (tid == 0) {
78
+ if (reduce_offset == 0 && i_out < out_indexer.total_size) {
100
79
  TypeOut* out_ptr = reinterpret_cast<TypeOut*>(cumo_na_iarray_at_dim(&out_iarray, &out_indexer));
101
80
  *out_ptr = impl.MapOut(accum);
102
- //printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d block_size:%d accum:%d out:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, block_size, accum, out_ptr, *out_ptr);
81
+ //printf("threadId.x:%d blockIdx.x:%d blockDim.x:%d gridDim.x:%d accum:%d i_out:%ld out:%p(%d)\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x, accum, i_out, out_ptr, *out_ptr);
103
82
  }
104
83
  }
105
84
  }
106
85
 
107
- #undef _REDUCE
108
-
109
- static constexpr size_t max_block_size = 512;
86
+ } // cumo_detail
110
87
 
88
+ // TODO(sonots): Optimize indexer by squashing (or reducing) dimensions
111
89
  template <typename TypeIn, typename TypeOut, typename ReductionImpl>
112
- void cumo_reduce(na_reduction_arg_t arg, ReductionImpl&& impl) {
113
- na_indexer_t& out_indexer = arg.out_indexer;
114
- na_indexer_t& reduce_indexer = arg.reduce_indexer;
90
+ void cumo_reduce(cumo_na_reduction_arg_t arg, ReductionImpl&& impl) {
91
+ cumo_na_indexer_t& in_indexer = arg.in_indexer;
92
+ cumo_na_indexer_t& out_indexer = arg.out_indexer;
115
93
 
116
- using TypeReduce = decltype(impl.Identity());
94
+ if (out_indexer.total_size == 0) {
95
+ return;
96
+ }
97
+
98
+ int64_t reduce_total_size_pow2 = cumo_detail::round_up_to_power_of_2(std::max(size_t{1}, in_indexer.total_size / out_indexer.total_size));
99
+ int64_t reduce_block_size = std::min(cumo_detail::max_block_size, reduce_total_size_pow2);
100
+ int64_t out_block_size = cumo_detail::max_block_size / reduce_block_size;
101
+ int64_t out_block_num = (out_indexer.total_size + out_block_size - 1) / out_block_size;
117
102
 
118
- size_t block_size = round_up_to_power_of_2(std::max(int64_t{1}, static_cast<int64_t>(reduce_indexer.total_size)));
119
- block_size = std::min(max_block_size, block_size);
120
- size_t grid_size = out_indexer.total_size;
121
- size_t shared_mem_size = sizeof(TypeReduce) * block_size;
103
+ int64_t block_size = cumo_detail::max_block_size;
104
+ int64_t grid_size = std::min(cumo_detail::max_grid_size, out_block_num);
105
+ int64_t shared_mem_size = sizeof(decltype(impl.Identity())) * block_size;
122
106
 
123
- reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, impl);
107
+ cumo_detail::reduction_kernel<TypeIn,TypeOut,ReductionImpl><<<grid_size, block_size, shared_mem_size>>>(arg, out_block_size, reduce_block_size, impl);
124
108
  }
125
109
 
126
110
  #endif // CUMO_REDUCE_KERNEL_H