numo-narray-alt 0.10.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,46 +1,54 @@
1
1
  #ifndef NUMO_NARRAY_MH_MULSUM_H
2
2
  #define NUMO_NARRAY_MH_MULSUM_H 1
3
3
 
4
- #define DEF_NARRAY_FLT_MULSUM_METHOD_FUNC(tDType, tNAryClass) \
5
- static void iter_##tDType##_mulsum(na_loop_t* const lp) { \
6
- size_t n; \
7
- char* p1; \
8
- char* p2; \
9
- char* p3; \
10
- ssize_t s1; \
11
- ssize_t s2; \
12
- ssize_t s3; \
13
- \
14
- INIT_COUNTER(lp, n); \
15
- INIT_PTR(lp, 0, p1, s1); \
16
- INIT_PTR(lp, 1, p2, s2); \
17
- INIT_PTR(lp, 2, p3, s3); \
18
- \
19
- if (s3 == 0) { \
20
- tDType z; \
21
- GET_DATA(p3, tDType, z); \
22
- for (size_t i = 0; i < n; i++) { \
23
- tDType x; \
24
- tDType y; \
25
- GET_DATA_STRIDE(p1, s1, tDType, x); \
26
- GET_DATA_STRIDE(p2, s2, tDType, y); \
27
- m_mulsum(x, y, z); \
28
- } \
29
- SET_DATA(p3, tDType, z); \
30
- } else { \
31
- for (size_t i = 0; i < n; i++) { \
32
- tDType x; \
33
- tDType y; \
34
- tDType z; \
35
- GET_DATA_STRIDE(p1, s1, tDType, x); \
36
- GET_DATA_STRIDE(p2, s2, tDType, y); \
37
- GET_DATA(p3, tDType, z); \
38
- m_mulsum(x, y, z); \
39
- SET_DATA_STRIDE(p3, s3, tDType, z); \
40
- } \
41
- } \
42
- } \
43
- \
4
+ // Returns true when every reduced axis (na_test_reduce(reduce, i)) has a stride equal to the
5
+ // element stride, i.e. is contiguous, in both va and vb; axes of length <= 1 are ignored.
6
+ // When true, making the reduce axis the inner-most loop turns mulsum into the all-contiguous
7
+ // + s3==0 SIMD path, which matters for patterns such as x.dot(x.transpose) where the default
8
+ // (last-axis-as-inner) picks a gather stride on the transposed operand.
9
+ static inline bool nary_mulsum_reduce_axes_contig(VALUE reduce, VALUE va, VALUE vb) { // false for non data/view storage, column-major operands, or view axes where SDX_IS_INDEX holds
10
+ narray_t *na, *nb;
11
+ GetNArray(va, na);
12
+ GetNArray(vb, nb);
13
+ if (NA_TYPE(na) != NARRAY_DATA_T && NA_TYPE(na) != NARRAY_VIEW_T) return false; // only data and view storage are analysed
14
+ if (NA_TYPE(nb) != NARRAY_DATA_T && NA_TYPE(nb) != NARRAY_VIEW_T) return false;
15
+ if (TEST_COLUMN_MAJOR(va) || TEST_COLUMN_MAJOR(vb)) return false; // the stride arithmetic below multiplies trailing extents, so column-major operands are rejected
16
+ ssize_t a_elmsz = nary_element_stride(va); // stride a contiguous axis of va must have
17
+ ssize_t b_elmsz = nary_element_stride(vb); // same for vb
18
+ int max_ndim = na->ndim > nb->ndim ? na->ndim : nb->ndim; // axis numbering of the larger-rank operand
19
+ for (int i = 0; i < max_ndim; i++) {
20
+ if (!na_test_reduce(reduce, i)) continue; // axis i is not reduced: no constraint
21
+ int da = i - (max_ndim - na->ndim); // axis i in va's own numbering (trailing axes line up); negative when va lacks this axis
22
+ int db = i - (max_ndim - nb->ndim); // same for vb
23
+ if (da >= 0 && na->shape[da] > 1) { // an absent axis or one of length <= 1 is not checked
24
+ ssize_t s;
25
+ if (NA_TYPE(na) == NARRAY_VIEW_T) {
26
+ stridx_t sdx = NA_VIEW_STRIDX(na)[da];
27
+ if (SDX_IS_INDEX(sdx)) return false; // index entry instead of a stride (presumably index-array addressing): nothing to compare
28
+ s = SDX_GET_STRIDE(sdx);
29
+ } else {
30
+ s = a_elmsz;
31
+ for (int k = na->ndim - 1; k > da; k--) s *= na->shape[k]; // stride of axis da = element stride times the extents of all axes after it
32
+ }
33
+ if (s != a_elmsz) return false; // reduce axis is not contiguous in va
34
+ }
35
+ if (db >= 0 && nb->shape[db] > 1) { // identical check for vb
36
+ ssize_t s;
37
+ if (NA_TYPE(nb) == NARRAY_VIEW_T) {
38
+ stridx_t sdx = NA_VIEW_STRIDX(nb)[db];
39
+ if (SDX_IS_INDEX(sdx)) return false;
40
+ s = SDX_GET_STRIDE(sdx);
41
+ } else {
42
+ s = b_elmsz;
43
+ for (int k = nb->ndim - 1; k > db; k--) s *= nb->shape[k];
44
+ }
45
+ if (s != b_elmsz) return false; // reduce axis is not contiguous in vb
46
+ }
47
+ }
48
+ return true; // no reduced axis violated the contiguity requirement
49
+ }
50
+
51
+ #define DEF_FLT_MULSUM_NAN_ITER_FUNC(tDType) \
44
52
  static void iter_##tDType##_mulsum_nan(na_loop_t* const lp) { \
45
53
  size_t n; \
46
54
  char* p1; \
@@ -49,13 +57,28 @@
49
57
  ssize_t s1; \
50
58
  ssize_t s2; \
51
59
  ssize_t s3; \
52
- \
53
60
  INIT_COUNTER(lp, n); \
54
61
  INIT_PTR(lp, 0, p1, s1); \
55
62
  INIT_PTR(lp, 1, p2, s2); \
56
63
  INIT_PTR(lp, 2, p3, s3); \
57
- \
58
64
  if (s3 == 0) { \
65
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType)) && \
66
+ is_aligned(p3, sizeof(tDType))) { \
67
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
68
+ for (size_t i = 0; i < n; i++) { \
69
+ m_mulsum_nan(((tDType*)p1)[i], ((tDType*)p2)[i], *(tDType*)p3); \
70
+ } \
71
+ return; \
72
+ } \
73
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
74
+ for (size_t i = 0; i < n; i++) { \
75
+ m_mulsum_nan(*(tDType*)p1, *(tDType*)p2, *(tDType*)p3); \
76
+ p1 += s1; \
77
+ p2 += s2; \
78
+ } \
79
+ return; \
80
+ } \
81
+ } \
59
82
  tDType z; \
60
83
  GET_DATA(p3, tDType, z); \
61
84
  for (size_t i = 0; i < n; i++) { \
@@ -67,6 +90,25 @@
67
90
  } \
68
91
  SET_DATA(p3, tDType, z); \
69
92
  } else { \
93
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType)) && \
94
+ is_aligned(p3, sizeof(tDType))) { \
95
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType) && s3 == sizeof(tDType)) { \
96
+ for (size_t i = 0; i < n; i++) { \
97
+ m_mulsum_nan(((tDType*)p1)[i], ((tDType*)p2)[i], ((tDType*)p3)[i]); \
98
+ } \
99
+ return; \
100
+ } \
101
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType)) && \
102
+ is_aligned_step(s3, sizeof(tDType))) { \
103
+ for (size_t i = 0; i < n; i++) { \
104
+ m_mulsum_nan(*(tDType*)p1, *(tDType*)p2, *(tDType*)p3); \
105
+ p1 += s1; \
106
+ p2 += s2; \
107
+ p3 += s3; \
108
+ } \
109
+ return; \
110
+ } \
111
+ } \
70
112
  for (size_t i = 0; i < n; i++) { \
71
113
  tDType x; \
72
114
  tDType y; \
@@ -78,8 +120,9 @@
78
120
  SET_DATA_STRIDE(p3, s3, tDType, z); \
79
121
  } \
80
122
  } \
81
- } \
82
- \
123
+ }
124
+
125
+ #define DEF_FLT_MULSUM_RUBY_FUNCS(tDType, tNAryClass) \
83
126
  static VALUE tDType##_mulsum_self(int argc, VALUE* argv, VALUE self) { \
84
127
  if (argc < 1) { \
85
128
  rb_raise(rb_eArgError, "wrong number of arguments (%d for >=1)", argc); \
@@ -93,6 +136,9 @@
93
136
  VALUE naryv[2] = { self, argv[0] }; \
94
137
  VALUE reduce = \
95
138
  na_reduce_dimension(argc - 1, argv + 1, 2, naryv, &ndf, iter_##tDType##_mulsum_nan); \
139
+ if (nary_mulsum_reduce_axes_contig(reduce, self, argv[0])) { \
140
+ ndf.flag |= NDF_FLAT_REDUCE; \
141
+ } \
96
142
  VALUE v = na_ndloop(&ndf, 4, self, argv[0], reduce, m_mulsum_init); \
97
143
  \
98
144
  return rb_funcall(v, rb_intern("extract"), 0); \
@@ -113,6 +159,115 @@
113
159
  return rb_funcallv_kw(v, rb_intern("mulsum"), argc, argv, RB_PASS_CALLED_KEYWORDS); \
114
160
  }
115
161
 
162
+ #define DEF_INT_MULSUM_RUBY_FUNCS(tDType, tNAryClass) /* emits tDType##_mulsum_self and tDType##_mulsum; iter_##tDType##_mulsum must already be defined */ \
163
+ static VALUE tDType##_mulsum_self(int argc, VALUE* argv, VALUE self) { /* same-class case: argv[0] is the other operand, argv[1..] are forwarded to na_reduce_dimension */ \
164
+ if (argc < 1) { \
165
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for >=1)", argc); \
166
+ } \
167
+ /* loop signature: two tNAryClass operands, the reduce spec and the init value in; one tNAryClass out */ \
168
+ ndfunc_arg_in_t ain[4] = { \
169
+ { tNAryClass, 0 }, { tNAryClass, 0 }, { sym_reduce, 0 }, { sym_init, 0 } \
170
+ }; \
171
+ ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
172
+ ndfunc_t ndf = { iter_##tDType##_mulsum, STRIDE_LOOP_NIP, 4, 1, ain, aout }; \
173
+ VALUE naryv[2] = { self, argv[0] }; \
174
+ VALUE reduce = na_reduce_dimension(argc - 1, argv + 1, 2, naryv, &ndf, 0); /* last argument 0: no NaN-aware iterator is supplied here, unlike the float variant which passes iter_##tDType##_mulsum_nan */ \
175
+ VALUE v = na_ndloop(&ndf, 4, self, argv[0], reduce, m_mulsum_init); \
176
+ /* the loop result is returned through its Ruby-level "extract" method */ \
177
+ return rb_funcall(v, rb_intern("extract"), 0); \
178
+ } \
179
+ /* public entry point: runs the kernel directly or re-dispatches after upcasting self */ \
180
+ static VALUE tDType##_mulsum(int argc, VALUE* argv, VALUE self) { \
181
+ if (argc < 1) { \
182
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for >=1)", argc); \
183
+ } \
184
+ /* na_upcast yields the class for this operand pair; handle it here only when that class is tNAryClass */ \
185
+ VALUE klass = na_upcast(rb_obj_class(self), rb_obj_class(argv[0])); \
186
+ if (klass == tNAryClass) { \
187
+ return tDType##_mulsum_self(argc, argv, self); \
188
+ } \
189
+ /* otherwise cast self to the upcast class ... */ \
190
+ VALUE v = rb_funcall(klass, id_cast, 1, self); \
191
+ /* ... and call its mulsum with the original arguments, passing keywords through */ \
192
+ return rb_funcallv_kw(v, rb_intern("mulsum"), argc, argv, RB_PASS_CALLED_KEYWORDS); \
193
+ }
194
+
195
+ #define DEF_NARRAY_FLT_MULSUM_METHOD_FUNC(tDType, tNAryClass) /* generic float mulsum: scalar kernel below, then the NaN-aware kernel and the Ruby entry points */ \
196
+ static void iter_##tDType##_mulsum(na_loop_t* const lp) { /* innermost-loop kernel: p1 and p2 are read, p3 is read and updated via m_mulsum */ \
197
+ size_t n; \
198
+ char* p1; \
199
+ char* p2; \
200
+ char* p3; \
201
+ ssize_t s1; \
202
+ ssize_t s2; \
203
+ ssize_t s3; \
204
+ INIT_COUNTER(lp, n); /* n bounds every loop below */ \
205
+ INIT_PTR(lp, 0, p1, s1); /* pN: start pointer, sN: stride in bytes (it is added to a char*) */ \
206
+ INIT_PTR(lp, 1, p2, s2); \
207
+ INIT_PTR(lp, 2, p3, s3); \
208
+ if (s3 == 0) { /* output does not advance: all n products accumulate into one element */ \
209
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType)) && \
210
+ is_aligned(p3, sizeof(tDType))) { /* typed-pointer access is used only when all three pointers pass is_aligned for tDType */ \
211
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* both inputs contiguous: indexed loop */ \
212
+ for (size_t i = 0; i < n; i++) { \
213
+ m_mulsum(((tDType*)p1)[i], ((tDType*)p2)[i], *(tDType*)p3); \
214
+ } \
215
+ return; \
216
+ } \
217
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { /* strides presumably multiples of the element size: advance by byte stride */ \
218
+ for (size_t i = 0; i < n; i++) { \
219
+ m_mulsum(*(tDType*)p1, *(tDType*)p2, *(tDType*)p3); \
220
+ p1 += s1; \
221
+ p2 += s2; \
222
+ } \
223
+ return; \
224
+ } \
225
+ } \
226
+ tDType z; /* fallback: accumulate in a local through the GET_DATA macros and store once */ \
227
+ GET_DATA(p3, tDType, z); \
228
+ for (size_t i = 0; i < n; i++) { \
229
+ tDType x; \
230
+ tDType y; \
231
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
232
+ GET_DATA_STRIDE(p2, s2, tDType, y); \
233
+ m_mulsum(x, y, z); \
234
+ } \
235
+ SET_DATA(p3, tDType, z); \
236
+ } else { /* output advances: each step updates its own p3 element */ \
237
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType)) && \
238
+ is_aligned(p3, sizeof(tDType))) { \
239
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType) && s3 == sizeof(tDType)) { /* all three contiguous */ \
240
+ for (size_t i = 0; i < n; i++) { \
241
+ m_mulsum(((tDType*)p1)[i], ((tDType*)p2)[i], ((tDType*)p3)[i]); \
242
+ } \
243
+ return; \
244
+ } \
245
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType)) && \
246
+ is_aligned_step(s3, sizeof(tDType))) { /* general strided case with typed access */ \
247
+ for (size_t i = 0; i < n; i++) { \
248
+ m_mulsum(*(tDType*)p1, *(tDType*)p2, *(tDType*)p3); \
249
+ p1 += s1; \
250
+ p2 += s2; \
251
+ p3 += s3; \
252
+ } \
253
+ return; \
254
+ } \
255
+ } \
256
+ for (size_t i = 0; i < n; i++) { /* fallback through the GET_DATA/SET_DATA macros */ \
257
+ tDType x; \
258
+ tDType y; \
259
+ tDType z; \
260
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
261
+ GET_DATA_STRIDE(p2, s2, tDType, y); \
262
+ GET_DATA(p3, tDType, z); \
263
+ m_mulsum(x, y, z); \
264
+ SET_DATA_STRIDE(p3, s3, tDType, z); \
265
+ } \
266
+ } \
267
+ } \
268
+ DEF_FLT_MULSUM_NAN_ITER_FUNC(tDType) /* emits iter_##tDType##_mulsum_nan */ \
269
+ DEF_FLT_MULSUM_RUBY_FUNCS(tDType, tNAryClass) /* emits the Ruby-callable mulsum functions */
270
+
116
271
  #define DEF_NARRAY_INT_MULSUM_METHOD_FUNC(tDType, tNAryClass) \
117
272
  static void iter_##tDType##_mulsum(na_loop_t* const lp) { \
118
273
  size_t n; \
@@ -122,13 +277,28 @@
122
277
  ssize_t s1; \
123
278
  ssize_t s2; \
124
279
  ssize_t s3; \
125
- \
126
280
  INIT_COUNTER(lp, n); \
127
281
  INIT_PTR(lp, 0, p1, s1); \
128
282
  INIT_PTR(lp, 1, p2, s2); \
129
283
  INIT_PTR(lp, 2, p3, s3); \
130
- \
131
284
  if (s3 == 0) { \
285
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType)) && \
286
+ is_aligned(p3, sizeof(tDType))) { \
287
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
288
+ for (size_t i = 0; i < n; i++) { \
289
+ m_mulsum(((tDType*)p1)[i], ((tDType*)p2)[i], *(tDType*)p3); \
290
+ } \
291
+ return; \
292
+ } \
293
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
294
+ for (size_t i = 0; i < n; i++) { \
295
+ m_mulsum(*(tDType*)p1, *(tDType*)p2, *(tDType*)p3); \
296
+ p1 += s1; \
297
+ p2 += s2; \
298
+ } \
299
+ return; \
300
+ } \
301
+ } \
132
302
  tDType z; \
133
303
  GET_DATA(p3, tDType, z); \
134
304
  for (size_t i = 0; i < n; i++) { \
@@ -139,6 +309,25 @@
139
309
  } \
140
310
  SET_DATA(p3, tDType, z); \
141
311
  } else { \
312
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType)) && \
313
+ is_aligned(p3, sizeof(tDType))) { \
314
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType) && s3 == sizeof(tDType)) { \
315
+ for (size_t i = 0; i < n; i++) { \
316
+ m_mulsum(((tDType*)p1)[i], ((tDType*)p2)[i], ((tDType*)p3)[i]); \
317
+ } \
318
+ return; \
319
+ } \
320
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType)) && \
321
+ is_aligned_step(s3, sizeof(tDType))) { \
322
+ for (size_t i = 0; i < n; i++) { \
323
+ m_mulsum(*(tDType*)p1, *(tDType*)p2, *(tDType*)p3); \
324
+ p1 += s1; \
325
+ p2 += s2; \
326
+ p3 += s3; \
327
+ } \
328
+ return; \
329
+ } \
330
+ } \
142
331
  for (size_t i = 0; i < n; i++) { \
143
332
  tDType x, y, z; \
144
333
  GET_DATA_STRIDE(p1, s1, tDType, x); \
@@ -149,37 +338,1134 @@
149
338
  } \
150
339
  } \
151
340
  } \
152
- \
153
- static VALUE tDType##_mulsum_self(int argc, VALUE* argv, VALUE self) { \
154
- if (argc < 1) { \
155
- rb_raise(rb_eArgError, "wrong number of arguments (%d for >=1)", argc); \
341
+ DEF_INT_MULSUM_RUBY_FUNCS(tDType, tNAryClass)
342
+
343
+ #define DEF_MULSUM_SFLT_SSE2_ITER_FUNC() \
344
+ static void iter_sfloat_mulsum(na_loop_t* const lp) { \
345
+ size_t i = 0; \
346
+ size_t n; \
347
+ char* p1; \
348
+ char* p2; \
349
+ char* p3; \
350
+ ssize_t s1; \
351
+ ssize_t s2; \
352
+ ssize_t s3; \
353
+ INIT_COUNTER(lp, n); \
354
+ INIT_PTR(lp, 0, p1, s1); \
355
+ INIT_PTR(lp, 1, p2, s2); \
356
+ INIT_PTR(lp, 2, p3, s3); \
357
+ const size_t num_pack = SIMD_ALIGNMENT_SIZE / sizeof(sfloat); \
358
+ if (s3 == 0) { \
359
+ if (is_aligned(p1, sizeof(sfloat)) && is_aligned(p2, sizeof(sfloat)) && \
360
+ is_aligned(p3, sizeof(sfloat))) { \
361
+ if (s1 == sizeof(sfloat) && s2 == sizeof(sfloat)) { \
362
+ sfloat z; \
363
+ GET_DATA(p3, sfloat, z); \
364
+ if (n >= num_pack && \
365
+ is_same_aligned2(&((sfloat*)p1)[i], &((sfloat*)p2)[i], SIMD_ALIGNMENT_SIZE)) { \
366
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
367
+ &((sfloat*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(sfloat) \
368
+ ); \
369
+ for (; i < cnt; i++) { \
370
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], z); \
371
+ } \
372
+ size_t cnt_simd_loop = (n - i) % num_pack; \
373
+ __m128 acc = _mm_setzero_ps(); \
374
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
375
+ __m128 a = _mm_load_ps(&((sfloat*)p1)[i]); \
376
+ __m128 b = _mm_load_ps(&((sfloat*)p2)[i]); \
377
+ acc = _mm_add_ps(acc, _mm_mul_ps(a, b)); \
378
+ } \
379
+ __m128 shuf = _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(2, 3, 0, 1)); \
380
+ acc = _mm_add_ps(acc, shuf); \
381
+ shuf = _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(0, 1, 2, 3)); \
382
+ acc = _mm_add_ps(acc, shuf); \
383
+ z += _mm_cvtss_f32(acc); \
384
+ } \
385
+ for (; i < n; i++) { \
386
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], z); \
387
+ } \
388
+ SET_DATA(p3, sfloat, z); \
389
+ return; \
390
+ } \
391
+ if ((s1 == sizeof(sfloat) || s2 == sizeof(sfloat)) && \
392
+ is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat))) { \
393
+ const sfloat* q_contig; \
394
+ const char* q_strided; \
395
+ ssize_t stride; \
396
+ if (s1 == sizeof(sfloat)) { \
397
+ q_contig = (const sfloat*)p1; \
398
+ q_strided = p2; \
399
+ stride = s2; \
400
+ } else { \
401
+ q_contig = (const sfloat*)p2; \
402
+ q_strided = p1; \
403
+ stride = s1; \
404
+ } \
405
+ sfloat z; \
406
+ GET_DATA(p3, sfloat, z); \
407
+ size_t j = 0; \
408
+ if (n >= num_pack) { \
409
+ size_t cnt_simd_loop = n % num_pack; \
410
+ __m128 acc = _mm_setzero_ps(); \
411
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
412
+ __m128 a = _mm_loadu_ps(&q_contig[j]); \
413
+ __m128 b = _mm_set_ps( \
414
+ *(const sfloat*)(q_strided + (ssize_t)(j + 3) * stride), \
415
+ *(const sfloat*)(q_strided + (ssize_t)(j + 2) * stride), \
416
+ *(const sfloat*)(q_strided + (ssize_t)(j + 1) * stride), \
417
+ *(const sfloat*)(q_strided + (ssize_t)(j + 0) * stride) \
418
+ ); \
419
+ acc = _mm_add_ps(acc, _mm_mul_ps(a, b)); \
420
+ } \
421
+ __m128 shuf = _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(2, 3, 0, 1)); \
422
+ acc = _mm_add_ps(acc, shuf); \
423
+ shuf = _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(0, 1, 2, 3)); \
424
+ acc = _mm_add_ps(acc, shuf); \
425
+ z += _mm_cvtss_f32(acc); \
426
+ } \
427
+ for (; j < n; j++) { \
428
+ m_mulsum(q_contig[j], *(const sfloat*)(q_strided + (ssize_t)j * stride), z); \
429
+ } \
430
+ SET_DATA(p3, sfloat, z); \
431
+ return; \
432
+ } \
433
+ if (is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat))) { \
434
+ for (size_t i = 0; i < n; i++) { \
435
+ m_mulsum(*(sfloat*)p1, *(sfloat*)p2, *(sfloat*)p3); \
436
+ p1 += s1; \
437
+ p2 += s2; \
438
+ } \
439
+ return; \
440
+ } \
441
+ } \
442
+ sfloat z; \
443
+ GET_DATA(p3, sfloat, z); \
444
+ for (size_t i = 0; i < n; i++) { \
445
+ sfloat x; \
446
+ sfloat y; \
447
+ GET_DATA_STRIDE(p1, s1, sfloat, x); \
448
+ GET_DATA_STRIDE(p2, s2, sfloat, y); \
449
+ m_mulsum(x, y, z); \
450
+ } \
451
+ SET_DATA(p3, sfloat, z); \
452
+ } else { \
453
+ if (is_aligned(p1, sizeof(sfloat)) && is_aligned(p2, sizeof(sfloat)) && \
454
+ is_aligned(p3, sizeof(sfloat))) { \
455
+ if (s1 == sizeof(sfloat) && s2 == sizeof(sfloat) && s3 == sizeof(sfloat)) { \
456
+ if (n >= num_pack && \
457
+ is_same_aligned3( \
458
+ &((sfloat*)p1)[i], &((sfloat*)p2)[i], &((sfloat*)p3)[i], SIMD_ALIGNMENT_SIZE \
459
+ )) { \
460
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
461
+ &((sfloat*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(sfloat) \
462
+ ); \
463
+ for (; i < cnt; i++) { \
464
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], ((sfloat*)p3)[i]); \
465
+ } \
466
+ size_t cnt_simd_loop = (n - i) % num_pack; \
467
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
468
+ __m128 a = _mm_load_ps(&((sfloat*)p1)[i]); \
469
+ __m128 b = _mm_load_ps(&((sfloat*)p2)[i]); \
470
+ __m128 c = _mm_load_ps(&((sfloat*)p3)[i]); \
471
+ _mm_store_ps(&((sfloat*)p3)[i], _mm_add_ps(_mm_mul_ps(a, b), c)); \
472
+ } \
473
+ } \
474
+ for (; i < n; i++) { \
475
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], ((sfloat*)p3)[i]); \
476
+ } \
477
+ return; \
478
+ } \
479
+ if (((s1 == 0 && s2 == sizeof(sfloat)) || (s1 == sizeof(sfloat) && s2 == 0)) && \
480
+ s3 == sizeof(sfloat)) { \
481
+ const sfloat* q_vec; \
482
+ sfloat scalar; \
483
+ if (s1 == 0) { \
484
+ scalar = *(const sfloat*)p1; \
485
+ q_vec = (const sfloat*)p2; \
486
+ } else { \
487
+ scalar = *(const sfloat*)p2; \
488
+ q_vec = (const sfloat*)p1; \
489
+ } \
490
+ sfloat* q_out = (sfloat*)p3; \
491
+ size_t j = 0; \
492
+ if (n >= num_pack) { \
493
+ size_t cnt_simd_loop = n % num_pack; \
494
+ __m128 va = _mm_set1_ps(scalar); \
495
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
496
+ __m128 vb = _mm_loadu_ps(&q_vec[j]); \
497
+ __m128 vc = _mm_loadu_ps(&q_out[j]); \
498
+ _mm_storeu_ps(&q_out[j], _mm_add_ps(_mm_mul_ps(va, vb), vc)); \
499
+ } \
500
+ } \
501
+ for (; j < n; j++) { \
502
+ m_mulsum(scalar, q_vec[j], q_out[j]); \
503
+ } \
504
+ return; \
505
+ } \
506
+ if (is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat)) && \
507
+ is_aligned_step(s3, sizeof(sfloat))) { \
508
+ for (size_t i = 0; i < n; i++) { \
509
+ m_mulsum(*(sfloat*)p1, *(sfloat*)p2, *(sfloat*)p3); \
510
+ p1 += s1; \
511
+ p2 += s2; \
512
+ p3 += s3; \
513
+ } \
514
+ return; \
515
+ } \
516
+ } \
517
+ for (size_t i = 0; i < n; i++) { \
518
+ sfloat x; \
519
+ sfloat y; \
520
+ sfloat z; \
521
+ GET_DATA_STRIDE(p1, s1, sfloat, x); \
522
+ GET_DATA_STRIDE(p2, s2, sfloat, y); \
523
+ GET_DATA(p3, sfloat, z); \
524
+ m_mulsum(x, y, z); \
525
+ SET_DATA_STRIDE(p3, s3, sfloat, z); \
526
+ } \
156
527
  } \
157
- \
158
- ndfunc_arg_in_t ain[4] = { \
159
- { tNAryClass, 0 }, { tNAryClass, 0 }, { sym_reduce, 0 }, { sym_init, 0 } \
160
- }; \
161
- ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
162
- ndfunc_t ndf = { iter_##tDType##_mulsum, STRIDE_LOOP_NIP, 4, 1, ain, aout }; \
163
- VALUE naryv[2] = { self, argv[0] }; \
164
- VALUE reduce = na_reduce_dimension(argc - 1, argv + 1, 2, naryv, &ndf, 0); \
165
- VALUE v = na_ndloop(&ndf, 4, self, argv[0], reduce, m_mulsum_init); \
166
- \
167
- return rb_funcall(v, rb_intern("extract"), 0); \
168
- } \
169
- \
170
- static VALUE tDType##_mulsum(int argc, VALUE* argv, VALUE self) { \
171
- if (argc < 1) { \
172
- rb_raise(rb_eArgError, "wrong number of arguments (%d for >=1)", argc); \
528
+ }
529
+
530
+ #define DEF_MULSUM_DFLT_SSE2_ITER_FUNC() \
531
+ static void iter_dfloat_mulsum(na_loop_t* const lp) { \
532
+ size_t i = 0; \
533
+ size_t n; \
534
+ char* p1; \
535
+ char* p2; \
536
+ char* p3; \
537
+ ssize_t s1; \
538
+ ssize_t s2; \
539
+ ssize_t s3; \
540
+ INIT_COUNTER(lp, n); \
541
+ INIT_PTR(lp, 0, p1, s1); \
542
+ INIT_PTR(lp, 1, p2, s2); \
543
+ INIT_PTR(lp, 2, p3, s3); \
544
+ const size_t num_pack = SIMD_ALIGNMENT_SIZE / sizeof(dfloat); \
545
+ if (s3 == 0) { \
546
+ if (is_aligned(p1, sizeof(dfloat)) && is_aligned(p2, sizeof(dfloat)) && \
547
+ is_aligned(p3, sizeof(dfloat))) { \
548
+ if (s1 == sizeof(dfloat) && s2 == sizeof(dfloat)) { \
549
+ dfloat z; \
550
+ GET_DATA(p3, dfloat, z); \
551
+ if (n >= num_pack && \
552
+ is_same_aligned2(&((dfloat*)p1)[i], &((dfloat*)p2)[i], SIMD_ALIGNMENT_SIZE)) { \
553
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
554
+ &((dfloat*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dfloat) \
555
+ ); \
556
+ for (; i < cnt; i++) { \
557
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], z); \
558
+ } \
559
+ size_t cnt_simd_loop = (n - i) % num_pack; \
560
+ __m128d acc = _mm_setzero_pd(); \
561
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
562
+ __m128d a = _mm_load_pd(&((dfloat*)p1)[i]); \
563
+ __m128d b = _mm_load_pd(&((dfloat*)p2)[i]); \
564
+ acc = _mm_add_pd(acc, _mm_mul_pd(a, b)); \
565
+ } \
566
+ __m128d shuf = _mm_shuffle_pd(acc, acc, 1); \
567
+ acc = _mm_add_pd(acc, shuf); \
568
+ z += _mm_cvtsd_f64(acc); \
569
+ } \
570
+ for (; i < n; i++) { \
571
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], z); \
572
+ } \
573
+ SET_DATA(p3, dfloat, z); \
574
+ return; \
575
+ } \
576
+ if ((s1 == sizeof(dfloat) || s2 == sizeof(dfloat)) && \
577
+ is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat))) { \
578
+ const dfloat* q_contig; \
579
+ const char* q_strided; \
580
+ ssize_t stride; \
581
+ if (s1 == sizeof(dfloat)) { \
582
+ q_contig = (const dfloat*)p1; \
583
+ q_strided = p2; \
584
+ stride = s2; \
585
+ } else { \
586
+ q_contig = (const dfloat*)p2; \
587
+ q_strided = p1; \
588
+ stride = s1; \
589
+ } \
590
+ dfloat z; \
591
+ GET_DATA(p3, dfloat, z); \
592
+ size_t j = 0; \
593
+ if (n >= num_pack) { \
594
+ size_t cnt_simd_loop = n % num_pack; \
595
+ __m128d acc = _mm_setzero_pd(); \
596
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
597
+ __m128d a = _mm_loadu_pd(&q_contig[j]); \
598
+ __m128d b = _mm_set_pd( \
599
+ *(const dfloat*)(q_strided + (ssize_t)(j + 1) * stride), \
600
+ *(const dfloat*)(q_strided + (ssize_t)(j + 0) * stride) \
601
+ ); \
602
+ acc = _mm_add_pd(acc, _mm_mul_pd(a, b)); \
603
+ } \
604
+ __m128d shuf = _mm_shuffle_pd(acc, acc, 1); \
605
+ acc = _mm_add_pd(acc, shuf); \
606
+ z += _mm_cvtsd_f64(acc); \
607
+ } \
608
+ for (; j < n; j++) { \
609
+ m_mulsum(q_contig[j], *(const dfloat*)(q_strided + (ssize_t)j * stride), z); \
610
+ } \
611
+ SET_DATA(p3, dfloat, z); \
612
+ return; \
613
+ } \
614
+ if (is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat))) { \
615
+ for (size_t i = 0; i < n; i++) { \
616
+ m_mulsum(*(dfloat*)p1, *(dfloat*)p2, *(dfloat*)p3); \
617
+ p1 += s1; \
618
+ p2 += s2; \
619
+ } \
620
+ return; \
621
+ } \
622
+ } \
623
+ dfloat z; \
624
+ GET_DATA(p3, dfloat, z); \
625
+ for (size_t i = 0; i < n; i++) { \
626
+ dfloat x; \
627
+ dfloat y; \
628
+ GET_DATA_STRIDE(p1, s1, dfloat, x); \
629
+ GET_DATA_STRIDE(p2, s2, dfloat, y); \
630
+ m_mulsum(x, y, z); \
631
+ } \
632
+ SET_DATA(p3, dfloat, z); \
633
+ } else { \
634
+ if (is_aligned(p1, sizeof(dfloat)) && is_aligned(p2, sizeof(dfloat)) && \
635
+ is_aligned(p3, sizeof(dfloat))) { \
636
+ if (s1 == sizeof(dfloat) && s2 == sizeof(dfloat) && s3 == sizeof(dfloat)) { \
637
+ if (n >= num_pack && \
638
+ is_same_aligned3( \
639
+ &((dfloat*)p1)[i], &((dfloat*)p2)[i], &((dfloat*)p3)[i], SIMD_ALIGNMENT_SIZE \
640
+ )) { \
641
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
642
+ &((dfloat*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dfloat) \
643
+ ); \
644
+ for (; i < cnt; i++) { \
645
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], ((dfloat*)p3)[i]); \
646
+ } \
647
+ size_t cnt_simd_loop = (n - i) % num_pack; \
648
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
649
+ __m128d a = _mm_load_pd(&((dfloat*)p1)[i]); \
650
+ __m128d b = _mm_load_pd(&((dfloat*)p2)[i]); \
651
+ __m128d c = _mm_load_pd(&((dfloat*)p3)[i]); \
652
+ _mm_store_pd(&((dfloat*)p3)[i], _mm_add_pd(_mm_mul_pd(a, b), c)); \
653
+ } \
654
+ } \
655
+ for (; i < n; i++) { \
656
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], ((dfloat*)p3)[i]); \
657
+ } \
658
+ return; \
659
+ } \
660
+ if (((s1 == 0 && s2 == sizeof(dfloat)) || (s1 == sizeof(dfloat) && s2 == 0)) && \
661
+ s3 == sizeof(dfloat)) { \
662
+ const dfloat* q_vec; \
663
+ dfloat scalar; \
664
+ if (s1 == 0) { \
665
+ scalar = *(const dfloat*)p1; \
666
+ q_vec = (const dfloat*)p2; \
667
+ } else { \
668
+ scalar = *(const dfloat*)p2; \
669
+ q_vec = (const dfloat*)p1; \
670
+ } \
671
+ dfloat* q_out = (dfloat*)p3; \
672
+ size_t j = 0; \
673
+ if (n >= num_pack) { \
674
+ size_t cnt_simd_loop = n % num_pack; \
675
+ __m128d va = _mm_set1_pd(scalar); \
676
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
677
+ __m128d vb = _mm_loadu_pd(&q_vec[j]); \
678
+ __m128d vc = _mm_loadu_pd(&q_out[j]); \
679
+ _mm_storeu_pd(&q_out[j], _mm_add_pd(_mm_mul_pd(va, vb), vc)); \
680
+ } \
681
+ } \
682
+ for (; j < n; j++) { \
683
+ m_mulsum(scalar, q_vec[j], q_out[j]); \
684
+ } \
685
+ return; \
686
+ } \
687
+ if (is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat)) && \
688
+ is_aligned_step(s3, sizeof(dfloat))) { \
689
+ for (size_t i = 0; i < n; i++) { \
690
+ m_mulsum(*(dfloat*)p1, *(dfloat*)p2, *(dfloat*)p3); \
691
+ p1 += s1; \
692
+ p2 += s2; \
693
+ p3 += s3; \
694
+ } \
695
+ return; \
696
+ } \
697
+ } \
698
+ for (size_t i = 0; i < n; i++) { \
699
+ dfloat x; \
700
+ dfloat y; \
701
+ dfloat z; \
702
+ GET_DATA_STRIDE(p1, s1, dfloat, x); \
703
+ GET_DATA_STRIDE(p2, s2, dfloat, y); \
704
+ GET_DATA(p3, dfloat, z); \
705
+ m_mulsum(x, y, z); \
706
+ SET_DATA_STRIDE(p3, s3, dfloat, z); \
707
+ } \
173
708
  } \
174
- \
175
- VALUE klass = na_upcast(rb_obj_class(self), rb_obj_class(argv[0])); \
176
- if (klass == tNAryClass) { \
177
- return tDType##_mulsum_self(argc, argv, self); \
709
+ }
710
+
711
+ #define DEF_NARRAY_SFLT_MULSUM_SSE2_METHOD_FUNC() /* single-precision mulsum bundle with the SSE2 kernel */ \
712
+ DEF_MULSUM_SFLT_SSE2_ITER_FUNC() /* iter_sfloat_mulsum, SSE2 implementation */ \
713
+ DEF_FLT_MULSUM_NAN_ITER_FUNC(sfloat) /* iter_sfloat_mulsum_nan, scalar implementation */ \
714
+ DEF_FLT_MULSUM_RUBY_FUNCS(sfloat, numo_cSFloat) /* Ruby entry points bound to numo_cSFloat */
715
+
716
+ #define DEF_NARRAY_DFLT_MULSUM_SSE2_METHOD_FUNC() /* double-precision mulsum bundle with the SSE2 kernel */ \
717
+ DEF_MULSUM_DFLT_SSE2_ITER_FUNC() /* iter_dfloat_mulsum, SSE2 implementation */ \
718
+ DEF_FLT_MULSUM_NAN_ITER_FUNC(dfloat) /* iter_dfloat_mulsum_nan, scalar implementation */ \
719
+ DEF_FLT_MULSUM_RUBY_FUNCS(dfloat, numo_cDFloat) /* Ruby entry points bound to numo_cDFloat */
720
+
721
+ #define DEF_MULSUM_SFLT_AVX_ITER_FUNC() /* AVX (256-bit) implementation of the single-precision mulsum kernel */ \
722
+ static void iter_sfloat_mulsum(na_loop_t* const lp) { /* p1 and p2 are the inputs, p3 the accumulator/output */ \
723
+ size_t i = 0; /* index shared by the contiguous SIMD paths; the strided and fallback loops declare their own i, which shadows this one */ \
724
+ size_t n; \
725
+ char* p1; \
726
+ char* p2; \
727
+ char* p3; \
728
+ ssize_t s1; \
729
+ ssize_t s2; \
730
+ ssize_t s3; \
731
+ INIT_COUNTER(lp, n); /* n bounds every loop below */ \
732
+ INIT_PTR(lp, 0, p1, s1); /* pN: start pointer, sN: stride in bytes */ \
733
+ INIT_PTR(lp, 1, p2, s2); \
734
+ INIT_PTR(lp, 2, p3, s3); \
735
+ const size_t num_pack = AVX_ALIGNMENT_SIZE / sizeof(sfloat); /* floats consumed per vector step; the loops use __m256, i.e. 8 floats, so AVX_ALIGNMENT_SIZE is presumably 32 */ \
736
+ if (s3 == 0) { /* output does not advance: reduction into a single element */ \
737
+ if (is_aligned(p1, sizeof(sfloat)) && is_aligned(p2, sizeof(sfloat)) && \
738
+ is_aligned(p3, sizeof(sfloat))) { /* typed access only when all three pointers pass is_aligned for sfloat */ \
739
+ if (s1 == sizeof(sfloat) && s2 == sizeof(sfloat)) { /* both inputs contiguous */ \
740
+ sfloat z; \
741
+ GET_DATA(p3, sfloat, z); \
742
+ if (n >= num_pack && \
743
+ is_same_aligned2(&((sfloat*)p1)[i], &((sfloat*)p2)[i], AVX_ALIGNMENT_SIZE)) { /* the aligned loads below need p1 and p2 to reach an AVX boundary at the same index - presumably what is_same_aligned2 tests */ \
744
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
745
+ &((sfloat*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(sfloat) \
746
+ ); /* presumably the number of leading elements before the first aligned address */ \
747
+ for (; i < cnt; i++) { /* scalar head */ \
748
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], z); \
749
+ } \
750
+ size_t cnt_simd_loop = (n - i) % num_pack; /* elements left over after the last full vector */ \
751
+ __m256 acc = _mm256_setzero_ps(); /* vector partial sums start at zero and are added to z afterwards, so the summation order differs from the scalar loop */ \
752
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
753
+ __m256 a = _mm256_load_ps(&((sfloat*)p1)[i]); \
754
+ __m256 b = _mm256_load_ps(&((sfloat*)p2)[i]); \
755
+ acc = _mm256_add_ps(acc, _mm256_mul_ps(a, b)); \
756
+ } \
757
+ __m128 lo = _mm256_castps256_ps128(acc); /* horizontal sum: fold the upper 128-bit half onto the lower ... */ \
758
+ __m128 hi = _mm256_extractf128_ps(acc, 1); \
759
+ __m128 sum128 = _mm_add_ps(lo, hi); \
760
+ __m128 shuf = _mm_shuffle_ps(sum128, sum128, _MM_SHUFFLE(2, 3, 0, 1)); /* ... swap neighbouring lanes and add ... */ \
761
+ sum128 = _mm_add_ps(sum128, shuf); \
762
+ shuf = _mm_shuffle_ps(sum128, sum128, _MM_SHUFFLE(0, 1, 2, 3)); /* ... then reverse the lanes and add: every lane holds the total */ \
763
+ sum128 = _mm_add_ps(sum128, shuf); \
764
+ z += _mm_cvtss_f32(sum128); /* take lane 0 */ \
765
+ } \
766
+ for (; i < n; i++) { /* scalar tail, or the whole range when the SIMD branch was skipped */ \
767
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], z); \
768
+ } \
769
+ SET_DATA(p3, sfloat, z); \
770
+ return; \
771
+ } \
772
+ if ((s1 == sizeof(sfloat) || s2 == sizeof(sfloat)) && \
773
+ is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat))) { /* one input contiguous, the other strided: vector load on one side, lanes gathered one by one on the other */ \
774
+ const sfloat* q_contig; \
775
+ const char* q_strided; \
776
+ ssize_t stride; \
777
+ if (s1 == sizeof(sfloat)) { \
778
+ q_contig = (const sfloat*)p1; \
779
+ q_strided = p2; \
780
+ stride = s2; \
781
+ } else { \
782
+ q_contig = (const sfloat*)p2; \
783
+ q_strided = p1; \
784
+ stride = s1; \
785
+ } \
786
+ sfloat z; \
787
+ GET_DATA(p3, sfloat, z); \
788
+ size_t j = 0; \
789
+ if (n >= num_pack) { \
790
+ size_t cnt_simd_loop = n % num_pack; /* no head loop here: unaligned loads are used */ \
791
+ __m256 acc = _mm256_setzero_ps(); \
792
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
793
+ __m256 a = _mm256_loadu_ps(&q_contig[j]); \
794
+ __m256 b = _mm256_set_ps( /* _mm256_set_ps takes its arguments highest lane first, hence j+7 down to j+0 */ \
795
+ *(const sfloat*)(q_strided + (ssize_t)(j + 7) * stride), \
796
+ *(const sfloat*)(q_strided + (ssize_t)(j + 6) * stride), \
797
+ *(const sfloat*)(q_strided + (ssize_t)(j + 5) * stride), \
798
+ *(const sfloat*)(q_strided + (ssize_t)(j + 4) * stride), \
799
+ *(const sfloat*)(q_strided + (ssize_t)(j + 3) * stride), \
800
+ *(const sfloat*)(q_strided + (ssize_t)(j + 2) * stride), \
801
+ *(const sfloat*)(q_strided + (ssize_t)(j + 1) * stride), \
802
+ *(const sfloat*)(q_strided + (ssize_t)(j + 0) * stride) \
803
+ ); \
804
+ acc = _mm256_add_ps(acc, _mm256_mul_ps(a, b)); \
805
+ } \
806
+ __m128 lo = _mm256_castps256_ps128(acc); /* same horizontal sum as in the contiguous path */ \
807
+ __m128 hi = _mm256_extractf128_ps(acc, 1); \
808
+ __m128 sum128 = _mm_add_ps(lo, hi); \
809
+ __m128 shuf = _mm_shuffle_ps(sum128, sum128, _MM_SHUFFLE(2, 3, 0, 1)); \
810
+ sum128 = _mm_add_ps(sum128, shuf); \
811
+ shuf = _mm_shuffle_ps(sum128, sum128, _MM_SHUFFLE(0, 1, 2, 3)); \
812
+ sum128 = _mm_add_ps(sum128, shuf); \
813
+ z += _mm_cvtss_f32(sum128); \
814
+ } \
815
+ for (; j < n; j++) { /* scalar tail */ \
816
+ m_mulsum(q_contig[j], *(const sfloat*)(q_strided + (ssize_t)j * stride), z); \
817
+ } \
818
+ SET_DATA(p3, sfloat, z); \
819
+ return; \
820
+ } \
821
+ if (is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat))) { /* neither input contiguous: scalar loop with typed access */ \
822
+ for (size_t i = 0; i < n; i++) { \
823
+ m_mulsum(*(sfloat*)p1, *(sfloat*)p2, *(sfloat*)p3); \
824
+ p1 += s1; \
825
+ p2 += s2; \
826
+ } \
827
+ return; \
828
+ } \
829
+ } \
830
+ sfloat z; /* fallback: accumulate in a local through the GET_DATA macros and store once */ \
831
+ GET_DATA(p3, sfloat, z); \
832
+ for (size_t i = 0; i < n; i++) { \
833
+ sfloat x; \
834
+ sfloat y; \
835
+ GET_DATA_STRIDE(p1, s1, sfloat, x); \
836
+ GET_DATA_STRIDE(p2, s2, sfloat, y); \
837
+ m_mulsum(x, y, z); \
838
+ } \
839
+ SET_DATA(p3, sfloat, z); \
840
+ } else { /* output advances: each step updates its own p3 element */ \
841
+ if (is_aligned(p1, sizeof(sfloat)) && is_aligned(p2, sizeof(sfloat)) && \
842
+ is_aligned(p3, sizeof(sfloat))) { \
843
+ if (s1 == sizeof(sfloat) && s2 == sizeof(sfloat) && s3 == sizeof(sfloat)) { /* all three contiguous */ \
844
+ if (n >= num_pack && \
845
+ is_same_aligned3( \
846
+ &((sfloat*)p1)[i], &((sfloat*)p2)[i], &((sfloat*)p3)[i], AVX_ALIGNMENT_SIZE \
847
+ )) { /* aligned loads and stores are used on all three arrays below */ \
848
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
849
+ &((sfloat*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(sfloat) \
850
+ ); \
851
+ for (; i < cnt; i++) { /* scalar head */ \
852
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], ((sfloat*)p3)[i]); \
853
+ } \
854
+ size_t cnt_simd_loop = (n - i) % num_pack; \
855
+ for (; i < n - cnt_simd_loop; i += num_pack) { /* p3[i..] = p1[i..] * p2[i..] + p3[i..], eight lanes at a time */ \
856
+ __m256 a = _mm256_load_ps(&((sfloat*)p1)[i]); \
857
+ __m256 b = _mm256_load_ps(&((sfloat*)p2)[i]); \
858
+ __m256 c = _mm256_load_ps(&((sfloat*)p3)[i]); \
859
+ _mm256_store_ps(&((sfloat*)p3)[i], _mm256_add_ps(_mm256_mul_ps(a, b), c)); \
860
+ } \
861
+ } \
862
+ for (; i < n; i++) { /* scalar tail, or the whole range when the SIMD branch was skipped */ \
863
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], ((sfloat*)p3)[i]); \
864
+ } \
865
+ return; \
866
+ } \
867
+ if (((s1 == 0 && s2 == sizeof(sfloat)) || (s1 == sizeof(sfloat) && s2 == 0)) && \
868
+ s3 == sizeof(sfloat)) { /* one input has stride 0 (a single repeated value); the other input and the output are contiguous */ \
869
+ const sfloat* q_vec; \
870
+ sfloat scalar; \
871
+ if (s1 == 0) { \
872
+ scalar = *(const sfloat*)p1; \
873
+ q_vec = (const sfloat*)p2; \
874
+ } else { \
875
+ scalar = *(const sfloat*)p2; \
876
+ q_vec = (const sfloat*)p1; \
877
+ } \
878
+ sfloat* q_out = (sfloat*)p3; \
879
+ size_t j = 0; \
880
+ if (n >= num_pack) { \
881
+ size_t cnt_simd_loop = n % num_pack; \
882
+ __m256 va = _mm256_set1_ps(scalar); /* the repeated value in every lane */ \
883
+ for (; j < n - cnt_simd_loop; j += num_pack) { /* unaligned loads and stores: no alignment precondition */ \
884
+ __m256 vb = _mm256_loadu_ps(&q_vec[j]); \
885
+ __m256 vc = _mm256_loadu_ps(&q_out[j]); \
886
+ _mm256_storeu_ps(&q_out[j], _mm256_add_ps(_mm256_mul_ps(va, vb), vc)); \
887
+ } \
888
+ } \
889
+ for (; j < n; j++) { /* scalar tail */ \
890
+ m_mulsum(scalar, q_vec[j], q_out[j]); \
891
+ } \
892
+ return; \
893
+ } \
894
+ if (is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat)) && \
895
+ is_aligned_step(s3, sizeof(sfloat))) { /* general strided case with typed access */ \
896
+ for (size_t i = 0; i < n; i++) { \
897
+ m_mulsum(*(sfloat*)p1, *(sfloat*)p2, *(sfloat*)p3); \
898
+ p1 += s1; \
899
+ p2 += s2; \
900
+ p3 += s3; \
901
+ } \
902
+ return; \
903
+ } \
904
+ } \
905
+ for (size_t i = 0; i < n; i++) { /* fallback through the GET_DATA/SET_DATA macros */ \
906
+ sfloat x; \
907
+ sfloat y; \
908
+ sfloat z; \
909
+ GET_DATA_STRIDE(p1, s1, sfloat, x); \
910
+ GET_DATA_STRIDE(p2, s2, sfloat, y); \
911
+ GET_DATA(p3, sfloat, z); \
912
+ m_mulsum(x, y, z); \
913
+ SET_DATA_STRIDE(p3, s3, sfloat, z); \
914
+ } \
915
+ } \
916
+ }
917
+
918
// Defines iter_dfloat_mulsum(), the AVX inner-loop kernel for dfloat mulsum.
// The loop count n and three pointer/stride pairs are read from lp with
// INIT_COUNTER / INIT_PTR: operands 0 and 1 (p1, p2) are only read, operand 2
// (p3) is read and written. m_mulsum(x, y, z) is defined elsewhere; the vector
// paths below treat it as z += x * y.
//
// s3 == 0: every iteration accumulates into the single element at p3.
//   1. both inputs contiguous  -> scalar head, aligned 256-bit loads, scalar tail
//   2. one input contiguous    -> unaligned load + 4 strided scalars packed by hand
//   3. is_aligned_step strides -> scalar loop updating *p3 in place
//   4. otherwise               -> GET_DATA_STRIDE loop into a local z
// s3 != 0: one output element is updated per iteration.
//   1. all three contiguous    -> aligned load / multiply / add / store
//   2. one input has stride 0  -> broadcast it, unaligned loads and stores
//   3. is_aligned_step strides -> scalar loop through typed pointers
//   4. otherwise               -> GET_DATA / SET_DATA_STRIDE loop
// Paths 1-3 are only tried when p1, p2 and p3 all pass
// is_aligned(..., sizeof(dfloat)).
//
// The vector reductions add lane-wise partial sums and fold them at the end,
// so rounding can differ from the strictly sequential scalar paths.
// NOTE(review): the inner `for (size_t i = 0; ...)` loops shadow the outer i.
+ #define DEF_MULSUM_DFLT_AVX_ITER_FUNC() \
919
+ static void iter_dfloat_mulsum(na_loop_t* const lp) { \
920
+ size_t i = 0; \
921
+ size_t n; \
922
+ char* p1; \
923
+ char* p2; \
924
+ char* p3; \
925
+ ssize_t s1; \
926
+ ssize_t s2; \
927
+ ssize_t s3; \
928
+ INIT_COUNTER(lp, n); \
929
+ INIT_PTR(lp, 0, p1, s1); \
930
+ INIT_PTR(lp, 1, p2, s2); \
931
+ INIT_PTR(lp, 2, p3, s3); \
932
+ const size_t num_pack = AVX_ALIGNMENT_SIZE / sizeof(dfloat); /* must be 4: the strided path below packs exactly 4 lanes by hand */ \
933
+ if (s3 == 0) { /* output stride 0: all n products are summed into *p3 */ \
934
+ if (is_aligned(p1, sizeof(dfloat)) && is_aligned(p2, sizeof(dfloat)) && \
935
+ is_aligned(p3, sizeof(dfloat))) { \
936
+ if (s1 == sizeof(dfloat) && s2 == sizeof(dfloat)) { /* both inputs contiguous */ \
937
+ dfloat z; \
938
+ GET_DATA(p3, dfloat, z); \
939
+ if (n >= num_pack && \
940
+ is_same_aligned2(&((dfloat*)p1)[i], &((dfloat*)p2)[i], AVX_ALIGNMENT_SIZE)) { \
941
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
942
+ &((dfloat*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(dfloat) \
943
+ ); \
944
+ for (; i < cnt; i++) { /* scalar head; presumably brings p1/p2 to the boundary _mm256_load_pd needs -- confirm helper */ \
945
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], z); \
946
+ } \
947
+ size_t cnt_simd_loop = (n - i) % num_pack; /* leftover elements for the scalar tail */ \
948
+ __m256d acc = _mm256_setzero_pd(); \
949
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
950
+ __m256d a = _mm256_load_pd(&((dfloat*)p1)[i]); \
951
+ __m256d b = _mm256_load_pd(&((dfloat*)p2)[i]); \
952
+ acc = _mm256_add_pd(acc, _mm256_mul_pd(a, b)); \
953
+ } \
954
+ __m128d lo = _mm256_castpd256_pd128(acc); /* horizontal sum: fold 4 lanes to 2 ... */ \
955
+ __m128d hi = _mm256_extractf128_pd(acc, 1); \
956
+ __m128d sum128 = _mm_add_pd(lo, hi); \
957
+ __m128d shuf = _mm_shuffle_pd(sum128, sum128, 1); /* ... then 2 lanes to 1 */ \
958
+ sum128 = _mm_add_pd(sum128, shuf); \
959
+ z += _mm_cvtsd_f64(sum128); \
960
+ } \
961
+ for (; i < n; i++) { /* scalar tail, or the whole range when the vector path was skipped */ \
962
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], z); \
963
+ } \
964
+ SET_DATA(p3, dfloat, z); \
965
+ return; \
966
+ } \
967
+ if ((s1 == sizeof(dfloat) || s2 == sizeof(dfloat)) && /* exactly one input contiguous (both-contiguous returned above) */ \
968
+ is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat))) { \
969
+ const dfloat* q_contig; \
970
+ const char* q_strided; \
971
+ ssize_t stride; \
972
+ if (s1 == sizeof(dfloat)) { \
973
+ q_contig = (const dfloat*)p1; \
974
+ q_strided = p2; \
975
+ stride = s2; \
976
+ } else { \
977
+ q_contig = (const dfloat*)p2; \
978
+ q_strided = p1; \
979
+ stride = s1; \
980
+ } \
981
+ dfloat z; \
982
+ GET_DATA(p3, dfloat, z); \
983
+ size_t j = 0; \
984
+ if (n >= num_pack) { \
985
+ size_t cnt_simd_loop = n % num_pack; \
986
+ __m256d acc = _mm256_setzero_pd(); \
987
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
988
+ __m256d a = _mm256_loadu_pd(&q_contig[j]); \
989
+ __m256d b = _mm256_set_pd( /* arguments run from the highest lane down, hence j+3 first */ \
990
+ *(const dfloat*)(q_strided + (ssize_t)(j + 3) * stride), \
991
+ *(const dfloat*)(q_strided + (ssize_t)(j + 2) * stride), \
992
+ *(const dfloat*)(q_strided + (ssize_t)(j + 1) * stride), \
993
+ *(const dfloat*)(q_strided + (ssize_t)(j + 0) * stride) \
994
+ ); \
995
+ acc = _mm256_add_pd(acc, _mm256_mul_pd(a, b)); \
996
+ } \
997
+ __m128d lo = _mm256_castpd256_pd128(acc); /* same horizontal sum as the contiguous path */ \
998
+ __m128d hi = _mm256_extractf128_pd(acc, 1); \
999
+ __m128d sum128 = _mm_add_pd(lo, hi); \
1000
+ __m128d shuf = _mm_shuffle_pd(sum128, sum128, 1); \
1001
+ sum128 = _mm_add_pd(sum128, shuf); \
1002
+ z += _mm_cvtsd_f64(sum128); \
1003
+ } \
1004
+ for (; j < n; j++) { \
1005
+ m_mulsum(q_contig[j], *(const dfloat*)(q_strided + (ssize_t)j * stride), z); \
1006
+ } \
1007
+ SET_DATA(p3, dfloat, z); \
1008
+ return; \
1009
+ } \
1010
+ if (is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat))) { /* neither input contiguous */ \
1011
+ for (size_t i = 0; i < n; i++) { \
1012
+ m_mulsum(*(dfloat*)p1, *(dfloat*)p2, *(dfloat*)p3); /* accumulates straight into *p3; p3 is not advanced */ \
1013
+ p1 += s1; \
1014
+ p2 += s2; \
1015
+ } \
1016
+ return; \
1017
+ } \
1018
+ } \
1019
+ dfloat z; /* fallback for pointers or strides that failed the alignment checks */ \
1020
+ GET_DATA(p3, dfloat, z); \
1021
+ for (size_t i = 0; i < n; i++) { \
1022
+ dfloat x; \
1023
+ dfloat y; \
1024
+ GET_DATA_STRIDE(p1, s1, dfloat, x); \
1025
+ GET_DATA_STRIDE(p2, s2, dfloat, y); \
1026
+ m_mulsum(x, y, z); \
1027
+ } \
1028
+ SET_DATA(p3, dfloat, z); \
1029
+ } else { /* s3 != 0: each iteration updates its own output element */ \
1030
+ if (is_aligned(p1, sizeof(dfloat)) && is_aligned(p2, sizeof(dfloat)) && \
1031
+ is_aligned(p3, sizeof(dfloat))) { \
1032
+ if (s1 == sizeof(dfloat) && s2 == sizeof(dfloat) && s3 == sizeof(dfloat)) { /* inputs and output all contiguous */ \
1033
+ if (n >= num_pack && \
1034
+ is_same_aligned3( \
1035
+ &((dfloat*)p1)[i], &((dfloat*)p2)[i], &((dfloat*)p3)[i], AVX_ALIGNMENT_SIZE \
1036
+ )) { \
1037
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
1038
+ &((dfloat*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(dfloat) \
1039
+ ); \
1040
+ for (; i < cnt; i++) { /* scalar head before the aligned load/store loop */ \
1041
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], ((dfloat*)p3)[i]); \
1042
+ } \
1043
+ size_t cnt_simd_loop = (n - i) % num_pack; \
1044
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
1045
+ __m256d a = _mm256_load_pd(&((dfloat*)p1)[i]); \
1046
+ __m256d b = _mm256_load_pd(&((dfloat*)p2)[i]); \
1047
+ __m256d c = _mm256_load_pd(&((dfloat*)p3)[i]); \
1048
+ _mm256_store_pd(&((dfloat*)p3)[i], _mm256_add_pd(_mm256_mul_pd(a, b), c)); \
1049
+ } \
1050
+ } \
1051
+ for (; i < n; i++) { /* scalar tail, or the whole range when the vector path was skipped */ \
1052
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], ((dfloat*)p3)[i]); \
1053
+ } \
1054
+ return; \
1055
+ } \
1056
+ if (((s1 == 0 && s2 == sizeof(dfloat)) || (s1 == sizeof(dfloat) && s2 == 0)) && /* one input is a repeated scalar */ \
1057
+ s3 == sizeof(dfloat)) { \
1058
+ const dfloat* q_vec; \
1059
+ dfloat scalar; \
1060
+ if (s1 == 0) { \
1061
+ scalar = *(const dfloat*)p1; \
1062
+ q_vec = (const dfloat*)p2; \
1063
+ } else { \
1064
+ scalar = *(const dfloat*)p2; \
1065
+ q_vec = (const dfloat*)p1; \
1066
+ } \
1067
+ dfloat* q_out = (dfloat*)p3; \
1068
+ size_t j = 0; \
1069
+ if (n >= num_pack) { \
1070
+ size_t cnt_simd_loop = n % num_pack; \
1071
+ __m256d va = _mm256_set1_pd(scalar); /* broadcast the scalar to all lanes */ \
1072
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
1073
+ __m256d vb = _mm256_loadu_pd(&q_vec[j]); \
1074
+ __m256d vc = _mm256_loadu_pd(&q_out[j]); \
1075
+ _mm256_storeu_pd(&q_out[j], _mm256_add_pd(_mm256_mul_pd(va, vb), vc)); \
1076
+ } \
1077
+ } \
1078
+ for (; j < n; j++) { \
1079
+ m_mulsum(scalar, q_vec[j], q_out[j]); \
1080
+ } \
1081
+ return; \
1082
+ } \
1083
+ if (is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat)) && \
1084
+ is_aligned_step(s3, sizeof(dfloat))) { \
1085
+ for (size_t i = 0; i < n; i++) { \
1086
+ m_mulsum(*(dfloat*)p1, *(dfloat*)p2, *(dfloat*)p3); \
1087
+ p1 += s1; \
1088
+ p2 += s2; \
1089
+ p3 += s3; \
1090
+ } \
1091
+ return; \
1092
+ } \
1093
+ } \
1094
+ for (size_t i = 0; i < n; i++) { /* fallback for pointers or strides that failed the alignment checks */ \
1095
+ dfloat x; \
1096
+ dfloat y; \
1097
+ dfloat z; \
1098
+ GET_DATA_STRIDE(p1, s1, dfloat, x); \
1099
+ GET_DATA_STRIDE(p2, s2, dfloat, y); \
1100
+ GET_DATA(p3, dfloat, z); \
1101
+ m_mulsum(x, y, z); \
1102
+ SET_DATA_STRIDE(p3, s3, dfloat, z); \
1103
+ } \
178
1104
  } \
179
- \
180
- VALUE v = rb_funcall(klass, id_cast, 1, self); \
181
- \
182
- return rb_funcallv_kw(v, rb_intern("mulsum"), argc, argv, RB_PASS_CALLED_KEYWORDS); \
183
1105
  }
184
1106
 
1107
// Expands, in this order, DEF_MULSUM_SFLT_AVX_ITER_FUNC,
// DEF_FLT_MULSUM_NAN_ITER_FUNC(sfloat) and
// DEF_FLT_MULSUM_RUBY_FUNCS(sfloat, numo_cSFloat), giving one entry point that
// emits the sfloat mulsum definitions for AVX builds.
// NOTE(review): the last two macros are defined outside this excerpt;
// presumably the NaN-aware iterator and the Ruby-facing wrappers -- confirm.
+ #define DEF_NARRAY_SFLT_MULSUM_AVX_METHOD_FUNC() \
1108
+ DEF_MULSUM_SFLT_AVX_ITER_FUNC() \
1109
+ DEF_FLT_MULSUM_NAN_ITER_FUNC(sfloat) \
1110
+ DEF_FLT_MULSUM_RUBY_FUNCS(sfloat, numo_cSFloat)
1111
+
1112
// Expands, in this order, DEF_MULSUM_DFLT_AVX_ITER_FUNC (which defines
// iter_dfloat_mulsum), DEF_FLT_MULSUM_NAN_ITER_FUNC(dfloat) and
// DEF_FLT_MULSUM_RUBY_FUNCS(dfloat, numo_cDFloat), giving one entry point that
// emits the dfloat mulsum definitions for AVX builds.
// NOTE(review): the last two macros are defined outside this excerpt;
// presumably the NaN-aware iterator and the Ruby-facing wrappers -- confirm.
+ #define DEF_NARRAY_DFLT_MULSUM_AVX_METHOD_FUNC() \
1113
+ DEF_MULSUM_DFLT_AVX_ITER_FUNC() \
1114
+ DEF_FLT_MULSUM_NAN_ITER_FUNC(dfloat) \
1115
+ DEF_FLT_MULSUM_RUBY_FUNCS(dfloat, numo_cDFloat)
1116
+
1117
// Defines iter_sfloat_mulsum(), the NEON inner-loop kernel for sfloat mulsum.
// The loop count n and three pointer/stride pairs are read from lp with
// INIT_COUNTER / INIT_PTR: operands 0 and 1 (p1, p2) are only read, operand 2
// (p3) is read and written. m_mulsum(x, y, z) is defined elsewhere; the vector
// paths below treat it as z += x * y.
//
// s3 == 0: every iteration accumulates into the single element at p3.
//   1. both inputs contiguous  -> scalar head, 128-bit loads, scalar tail
//   2. one input contiguous    -> vector load + 4 strided scalars set lane by lane
//   3. otherwise               -> GET_DATA_STRIDE loop into a local z
//      (this branch has no is_aligned_step scalar path, unlike s3 != 0)
// s3 != 0: one output element is updated per iteration.
//   1. all three contiguous    -> load / multiply / add / store
//   2. one input has stride 0  -> broadcast it with vdupq_n_f32
//   3. is_aligned_step strides -> scalar loop through typed pointers
//   4. otherwise               -> GET_DATA / SET_DATA_STRIDE loop
// The vector and typed-pointer paths are only tried when p1, p2 and p3 all
// pass is_aligned(..., sizeof(sfloat)).
//
// The vector reductions add lane-wise partial sums and fold them with
// vaddvq_f32, so rounding can differ from the sequential scalar paths.
// NOTE(review): vaddvq_f32 is an A64 intrinsic; confirm the site that expands
// this macro is guarded for AArch64 rather than for NEON in general.
// NOTE(review): the inner `for (size_t i = 0; ...)` loops shadow the outer i.
+ #define DEF_MULSUM_SFLT_NEON_ITER_FUNC() \
1118
+ static void iter_sfloat_mulsum(na_loop_t* const lp) { \
1119
+ size_t i = 0; \
1120
+ size_t n; \
1121
+ char* p1; \
1122
+ char* p2; \
1123
+ char* p3; \
1124
+ ssize_t s1; \
1125
+ ssize_t s2; \
1126
+ ssize_t s3; \
1127
+ INIT_COUNTER(lp, n); \
1128
+ INIT_PTR(lp, 0, p1, s1); \
1129
+ INIT_PTR(lp, 1, p2, s2); \
1130
+ INIT_PTR(lp, 2, p3, s3); \
1131
+ const size_t num_pack = NEON_ALIGNMENT_SIZE / sizeof(sfloat); /* must be 4: the strided path below sets exactly lanes 0..3 */ \
1132
+ if (s3 == 0) { /* output stride 0: all n products are summed into *p3 */ \
1133
+ if (is_aligned(p1, sizeof(sfloat)) && is_aligned(p2, sizeof(sfloat)) && \
1134
+ is_aligned(p3, sizeof(sfloat))) { \
1135
+ if (s1 == sizeof(sfloat) && s2 == sizeof(sfloat)) { /* both inputs contiguous */ \
1136
+ sfloat z; \
1137
+ GET_DATA(p3, sfloat, z); \
1138
+ if (n >= num_pack && \
1139
+ is_same_aligned2(&((sfloat*)p1)[i], &((sfloat*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
1140
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
1141
+ &((sfloat*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(sfloat) \
1142
+ ); \
1143
+ for (; i < cnt; i++) { /* scalar head over the cnt leading elements */ \
1144
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], z); \
1145
+ } \
1146
+ size_t cnt_simd_loop = (n - i) % num_pack; /* leftover elements for the scalar tail */ \
1147
+ float32x4_t acc = vdupq_n_f32(0.0f); \
1148
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
1149
+ float32x4_t a = vld1q_f32(&((sfloat*)p1)[i]); \
1150
+ float32x4_t b = vld1q_f32(&((sfloat*)p2)[i]); \
1151
+ acc = vaddq_f32(acc, vmulq_f32(a, b)); \
1152
+ } \
1153
+ z += vaddvq_f32(acc); /* horizontal sum of the 4 lanes */ \
1154
+ } \
1155
+ for (; i < n; i++) { /* scalar tail, or the whole range when the vector path was skipped */ \
1156
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], z); \
1157
+ } \
1158
+ SET_DATA(p3, sfloat, z); \
1159
+ return; \
1160
+ } \
1161
+ if ((s1 == sizeof(sfloat) || s2 == sizeof(sfloat)) && /* exactly one input contiguous (both-contiguous returned above) */ \
1162
+ is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat))) { \
1163
+ const sfloat* q_contig; \
1164
+ const char* q_strided; \
1165
+ ssize_t stride; \
1166
+ if (s1 == sizeof(sfloat)) { \
1167
+ q_contig = (const sfloat*)p1; \
1168
+ q_strided = p2; \
1169
+ stride = s2; \
1170
+ } else { \
1171
+ q_contig = (const sfloat*)p2; \
1172
+ q_strided = p1; \
1173
+ stride = s1; \
1174
+ } \
1175
+ sfloat z; \
1176
+ GET_DATA(p3, sfloat, z); \
1177
+ size_t j = 0; \
1178
+ if (n >= num_pack) { \
1179
+ size_t cnt_simd_loop = n % num_pack; \
1180
+ float32x4_t acc = vdupq_n_f32(0.0f); \
1181
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
1182
+ float32x4_t a = vld1q_f32(&q_contig[j]); \
1183
+ float32x4_t b = vdupq_n_f32(0.0f); /* all four lanes are overwritten below */ \
1184
+ b = \
1185
+ vsetq_lane_f32(*(const sfloat*)(q_strided + (ssize_t)(j + 0) * stride), b, 0); \
1186
+ b = \
1187
+ vsetq_lane_f32(*(const sfloat*)(q_strided + (ssize_t)(j + 1) * stride), b, 1); \
1188
+ b = \
1189
+ vsetq_lane_f32(*(const sfloat*)(q_strided + (ssize_t)(j + 2) * stride), b, 2); \
1190
+ b = \
1191
+ vsetq_lane_f32(*(const sfloat*)(q_strided + (ssize_t)(j + 3) * stride), b, 3); \
1192
+ acc = vaddq_f32(acc, vmulq_f32(a, b)); \
1193
+ } \
1194
+ z += vaddvq_f32(acc); \
1195
+ } \
1196
+ for (; j < n; j++) { \
1197
+ m_mulsum(q_contig[j], *(const sfloat*)(q_strided + (ssize_t)j * stride), z); \
1198
+ } \
1199
+ SET_DATA(p3, sfloat, z); \
1200
+ return; \
1201
+ } \
1202
+ } \
1203
+ sfloat z; /* fallback for every layout not handled above */ \
1204
+ GET_DATA(p3, sfloat, z); \
1205
+ for (size_t i = 0; i < n; i++) { \
1206
+ sfloat x; \
1207
+ sfloat y; \
1208
+ GET_DATA_STRIDE(p1, s1, sfloat, x); \
1209
+ GET_DATA_STRIDE(p2, s2, sfloat, y); \
1210
+ m_mulsum(x, y, z); \
1211
+ } \
1212
+ SET_DATA(p3, sfloat, z); \
1213
+ } else { /* s3 != 0: each iteration updates its own output element */ \
1214
+ if (is_aligned(p1, sizeof(sfloat)) && is_aligned(p2, sizeof(sfloat)) && \
1215
+ is_aligned(p3, sizeof(sfloat))) { \
1216
+ if (s1 == sizeof(sfloat) && s2 == sizeof(sfloat) && s3 == sizeof(sfloat)) { /* inputs and output all contiguous */ \
1217
+ if (n >= num_pack && \
1218
+ is_same_aligned3( \
1219
+ &((sfloat*)p1)[i], &((sfloat*)p2)[i], &((sfloat*)p3)[i], NEON_ALIGNMENT_SIZE \
1220
+ )) { \
1221
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
1222
+ &((sfloat*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(sfloat) \
1223
+ ); \
1224
+ for (; i < cnt; i++) { /* scalar head over the cnt leading elements */ \
1225
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], ((sfloat*)p3)[i]); \
1226
+ } \
1227
+ size_t cnt_simd_loop = (n - i) % num_pack; \
1228
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
1229
+ float32x4_t a = vld1q_f32(&((sfloat*)p1)[i]); \
1230
+ float32x4_t b = vld1q_f32(&((sfloat*)p2)[i]); \
1231
+ float32x4_t c = vld1q_f32(&((sfloat*)p3)[i]); \
1232
+ vst1q_f32(&((sfloat*)p3)[i], vaddq_f32(vmulq_f32(a, b), c)); \
1233
+ } \
1234
+ } \
1235
+ for (; i < n; i++) { /* scalar tail, or the whole range when the vector path was skipped */ \
1236
+ m_mulsum(((sfloat*)p1)[i], ((sfloat*)p2)[i], ((sfloat*)p3)[i]); \
1237
+ } \
1238
+ return; \
1239
+ } \
1240
+ if (((s1 == 0 && s2 == sizeof(sfloat)) || (s1 == sizeof(sfloat) && s2 == 0)) && /* one input is a repeated scalar */ \
1241
+ s3 == sizeof(sfloat)) { \
1242
+ const sfloat* q_vec; \
1243
+ sfloat scalar; \
1244
+ if (s1 == 0) { \
1245
+ scalar = *(const sfloat*)p1; \
1246
+ q_vec = (const sfloat*)p2; \
1247
+ } else { \
1248
+ scalar = *(const sfloat*)p2; \
1249
+ q_vec = (const sfloat*)p1; \
1250
+ } \
1251
+ sfloat* q_out = (sfloat*)p3; \
1252
+ size_t j = 0; \
1253
+ if (n >= num_pack) { \
1254
+ size_t cnt_simd_loop = n % num_pack; \
1255
+ float32x4_t va = vdupq_n_f32(scalar); /* broadcast the scalar to all lanes */ \
1256
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
1257
+ float32x4_t vb = vld1q_f32(&q_vec[j]); \
1258
+ float32x4_t vc = vld1q_f32(&q_out[j]); \
1259
+ vst1q_f32(&q_out[j], vaddq_f32(vmulq_f32(va, vb), vc)); \
1260
+ } \
1261
+ } \
1262
+ for (; j < n; j++) { \
1263
+ m_mulsum(scalar, q_vec[j], q_out[j]); \
1264
+ } \
1265
+ return; \
1266
+ } \
1267
+ if (is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat)) && \
1268
+ is_aligned_step(s3, sizeof(sfloat))) { \
1269
+ for (size_t i = 0; i < n; i++) { \
1270
+ m_mulsum(*(sfloat*)p1, *(sfloat*)p2, *(sfloat*)p3); \
1271
+ p1 += s1; \
1272
+ p2 += s2; \
1273
+ p3 += s3; \
1274
+ } \
1275
+ return; \
1276
+ } \
1277
+ } \
1278
+ for (size_t i = 0; i < n; i++) { /* fallback for pointers or strides that failed the alignment checks */ \
1279
+ sfloat x; \
1280
+ sfloat y; \
1281
+ sfloat z; \
1282
+ GET_DATA_STRIDE(p1, s1, sfloat, x); \
1283
+ GET_DATA_STRIDE(p2, s2, sfloat, y); \
1284
+ GET_DATA(p3, sfloat, z); \
1285
+ m_mulsum(x, y, z); \
1286
+ SET_DATA_STRIDE(p3, s3, sfloat, z); \
1287
+ } \
1288
+ } \
1289
+ }
1290
+
1291
// Defines iter_dfloat_mulsum(), the NEON inner-loop kernel for dfloat mulsum.
// The loop count n and three pointer/stride pairs are read from lp with
// INIT_COUNTER / INIT_PTR: operands 0 and 1 (p1, p2) are only read, operand 2
// (p3) is read and written. m_mulsum(x, y, z) is defined elsewhere; the vector
// paths below treat it as z += x * y.
//
// s3 == 0: every iteration accumulates into the single element at p3.
//   1. both inputs contiguous  -> scalar head, 128-bit loads, scalar tail
//   2. one input contiguous    -> vector load + 2 strided scalars set lane by lane
//   3. otherwise               -> GET_DATA_STRIDE loop into a local z
//      (this branch has no is_aligned_step scalar path, unlike s3 != 0)
// s3 != 0: one output element is updated per iteration.
//   1. all three contiguous    -> load / multiply / add / store
//   2. one input has stride 0  -> broadcast it with vdupq_n_f64
//   3. is_aligned_step strides -> scalar loop through typed pointers
//   4. otherwise               -> GET_DATA / SET_DATA_STRIDE loop
// The vector and typed-pointer paths are only tried when p1, p2 and p3 all
// pass is_aligned(..., sizeof(dfloat)).
//
// The vector reductions add lane-wise partial sums and fold them with
// vaddvq_f64, so rounding can differ from the sequential scalar paths.
// NOTE(review): float64x2_t and vaddvq_f64 are A64-only; confirm the site that
// expands this macro is guarded for AArch64 rather than for NEON in general.
// NOTE(review): the inner `for (size_t i = 0; ...)` loops shadow the outer i.
+ #define DEF_MULSUM_DFLT_NEON_ITER_FUNC() \
1292
+ static void iter_dfloat_mulsum(na_loop_t* const lp) { \
1293
+ size_t i = 0; \
1294
+ size_t n; \
1295
+ char* p1; \
1296
+ char* p2; \
1297
+ char* p3; \
1298
+ ssize_t s1; \
1299
+ ssize_t s2; \
1300
+ ssize_t s3; \
1301
+ INIT_COUNTER(lp, n); \
1302
+ INIT_PTR(lp, 0, p1, s1); \
1303
+ INIT_PTR(lp, 1, p2, s2); \
1304
+ INIT_PTR(lp, 2, p3, s3); \
1305
+ const size_t num_pack = NEON_ALIGNMENT_SIZE / sizeof(dfloat); /* must be 2: the strided path below sets exactly lanes 0 and 1 */ \
1306
+ if (s3 == 0) { /* output stride 0: all n products are summed into *p3 */ \
1307
+ if (is_aligned(p1, sizeof(dfloat)) && is_aligned(p2, sizeof(dfloat)) && \
1308
+ is_aligned(p3, sizeof(dfloat))) { \
1309
+ if (s1 == sizeof(dfloat) && s2 == sizeof(dfloat)) { /* both inputs contiguous */ \
1310
+ dfloat z; \
1311
+ GET_DATA(p3, dfloat, z); \
1312
+ if (n >= num_pack && \
1313
+ is_same_aligned2(&((dfloat*)p1)[i], &((dfloat*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
1314
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
1315
+ &((dfloat*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(dfloat) \
1316
+ ); \
1317
+ for (; i < cnt; i++) { /* scalar head over the cnt leading elements */ \
1318
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], z); \
1319
+ } \
1320
+ size_t cnt_simd_loop = (n - i) % num_pack; /* leftover elements for the scalar tail */ \
1321
+ float64x2_t acc = vdupq_n_f64(0.0); \
1322
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
1323
+ float64x2_t a = vld1q_f64(&((dfloat*)p1)[i]); \
1324
+ float64x2_t b = vld1q_f64(&((dfloat*)p2)[i]); \
1325
+ acc = vaddq_f64(acc, vmulq_f64(a, b)); \
1326
+ } \
1327
+ z += vaddvq_f64(acc); /* horizontal sum of the 2 lanes */ \
1328
+ } \
1329
+ for (; i < n; i++) { /* scalar tail, or the whole range when the vector path was skipped */ \
1330
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], z); \
1331
+ } \
1332
+ SET_DATA(p3, dfloat, z); \
1333
+ return; \
1334
+ } \
1335
+ if ((s1 == sizeof(dfloat) || s2 == sizeof(dfloat)) && /* exactly one input contiguous (both-contiguous returned above) */ \
1336
+ is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat))) { \
1337
+ const dfloat* q_contig; \
1338
+ const char* q_strided; \
1339
+ ssize_t stride; \
1340
+ if (s1 == sizeof(dfloat)) { \
1341
+ q_contig = (const dfloat*)p1; \
1342
+ q_strided = p2; \
1343
+ stride = s2; \
1344
+ } else { \
1345
+ q_contig = (const dfloat*)p2; \
1346
+ q_strided = p1; \
1347
+ stride = s1; \
1348
+ } \
1349
+ dfloat z; \
1350
+ GET_DATA(p3, dfloat, z); \
1351
+ size_t j = 0; \
1352
+ if (n >= num_pack) { \
1353
+ size_t cnt_simd_loop = n % num_pack; \
1354
+ float64x2_t acc = vdupq_n_f64(0.0); \
1355
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
1356
+ float64x2_t a = vld1q_f64(&q_contig[j]); \
1357
+ float64x2_t b = vdupq_n_f64(0.0); /* both lanes are overwritten below */ \
1358
+ b = \
1359
+ vsetq_lane_f64(*(const dfloat*)(q_strided + (ssize_t)(j + 0) * stride), b, 0); \
1360
+ b = \
1361
+ vsetq_lane_f64(*(const dfloat*)(q_strided + (ssize_t)(j + 1) * stride), b, 1); \
1362
+ acc = vaddq_f64(acc, vmulq_f64(a, b)); \
1363
+ } \
1364
+ z += vaddvq_f64(acc); \
1365
+ } \
1366
+ for (; j < n; j++) { \
1367
+ m_mulsum(q_contig[j], *(const dfloat*)(q_strided + (ssize_t)j * stride), z); \
1368
+ } \
1369
+ SET_DATA(p3, dfloat, z); \
1370
+ return; \
1371
+ } \
1372
+ } \
1373
+ dfloat z; /* fallback for every layout not handled above */ \
1374
+ GET_DATA(p3, dfloat, z); \
1375
+ for (size_t i = 0; i < n; i++) { \
1376
+ dfloat x; \
1377
+ dfloat y; \
1378
+ GET_DATA_STRIDE(p1, s1, dfloat, x); \
1379
+ GET_DATA_STRIDE(p2, s2, dfloat, y); \
1380
+ m_mulsum(x, y, z); \
1381
+ } \
1382
+ SET_DATA(p3, dfloat, z); \
1383
+ } else { /* s3 != 0: each iteration updates its own output element */ \
1384
+ if (is_aligned(p1, sizeof(dfloat)) && is_aligned(p2, sizeof(dfloat)) && \
1385
+ is_aligned(p3, sizeof(dfloat))) { \
1386
+ if (s1 == sizeof(dfloat) && s2 == sizeof(dfloat) && s3 == sizeof(dfloat)) { /* inputs and output all contiguous */ \
1387
+ if (n >= num_pack && \
1388
+ is_same_aligned3( \
1389
+ &((dfloat*)p1)[i], &((dfloat*)p2)[i], &((dfloat*)p3)[i], NEON_ALIGNMENT_SIZE \
1390
+ )) { \
1391
+ size_t cnt = (size_t)get_count_of_elements_not_aligned_to_simd_size( \
1392
+ &((dfloat*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(dfloat) \
1393
+ ); \
1394
+ for (; i < cnt; i++) { /* scalar head over the cnt leading elements */ \
1395
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], ((dfloat*)p3)[i]); \
1396
+ } \
1397
+ size_t cnt_simd_loop = (n - i) % num_pack; \
1398
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
1399
+ float64x2_t a = vld1q_f64(&((dfloat*)p1)[i]); \
1400
+ float64x2_t b = vld1q_f64(&((dfloat*)p2)[i]); \
1401
+ float64x2_t c = vld1q_f64(&((dfloat*)p3)[i]); \
1402
+ vst1q_f64(&((dfloat*)p3)[i], vaddq_f64(vmulq_f64(a, b), c)); \
1403
+ } \
1404
+ } \
1405
+ for (; i < n; i++) { /* scalar tail, or the whole range when the vector path was skipped */ \
1406
+ m_mulsum(((dfloat*)p1)[i], ((dfloat*)p2)[i], ((dfloat*)p3)[i]); \
1407
+ } \
1408
+ return; \
1409
+ } \
1410
+ if (((s1 == 0 && s2 == sizeof(dfloat)) || (s1 == sizeof(dfloat) && s2 == 0)) && /* one input is a repeated scalar */ \
1411
+ s3 == sizeof(dfloat)) { \
1412
+ const dfloat* q_vec; \
1413
+ dfloat scalar; \
1414
+ if (s1 == 0) { \
1415
+ scalar = *(const dfloat*)p1; \
1416
+ q_vec = (const dfloat*)p2; \
1417
+ } else { \
1418
+ scalar = *(const dfloat*)p2; \
1419
+ q_vec = (const dfloat*)p1; \
1420
+ } \
1421
+ dfloat* q_out = (dfloat*)p3; \
1422
+ size_t j = 0; \
1423
+ if (n >= num_pack) { \
1424
+ size_t cnt_simd_loop = n % num_pack; \
1425
+ float64x2_t va = vdupq_n_f64(scalar); /* broadcast the scalar to both lanes */ \
1426
+ for (; j < n - cnt_simd_loop; j += num_pack) { \
1427
+ float64x2_t vb = vld1q_f64(&q_vec[j]); \
1428
+ float64x2_t vc = vld1q_f64(&q_out[j]); \
1429
+ vst1q_f64(&q_out[j], vaddq_f64(vmulq_f64(va, vb), vc)); \
1430
+ } \
1431
+ } \
1432
+ for (; j < n; j++) { \
1433
+ m_mulsum(scalar, q_vec[j], q_out[j]); \
1434
+ } \
1435
+ return; \
1436
+ } \
1437
+ if (is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat)) && \
1438
+ is_aligned_step(s3, sizeof(dfloat))) { \
1439
+ for (size_t i = 0; i < n; i++) { \
1440
+ m_mulsum(*(dfloat*)p1, *(dfloat*)p2, *(dfloat*)p3); \
1441
+ p1 += s1; \
1442
+ p2 += s2; \
1443
+ p3 += s3; \
1444
+ } \
1445
+ return; \
1446
+ } \
1447
+ } \
1448
+ for (size_t i = 0; i < n; i++) { /* fallback for pointers or strides that failed the alignment checks */ \
1449
+ dfloat x; \
1450
+ dfloat y; \
1451
+ dfloat z; \
1452
+ GET_DATA_STRIDE(p1, s1, dfloat, x); \
1453
+ GET_DATA_STRIDE(p2, s2, dfloat, y); \
1454
+ GET_DATA(p3, dfloat, z); \
1455
+ m_mulsum(x, y, z); \
1456
+ SET_DATA_STRIDE(p3, s3, dfloat, z); \
1457
+ } \
1458
+ } \
1459
+ }
1460
+
1461
// Expands, in this order, DEF_MULSUM_SFLT_NEON_ITER_FUNC (which defines
// iter_sfloat_mulsum), DEF_FLT_MULSUM_NAN_ITER_FUNC(sfloat) and
// DEF_FLT_MULSUM_RUBY_FUNCS(sfloat, numo_cSFloat), giving one entry point that
// emits the sfloat mulsum definitions for NEON builds.
// NOTE(review): the last two macros are defined outside this excerpt;
// presumably the NaN-aware iterator and the Ruby-facing wrappers -- confirm.
+ #define DEF_NARRAY_SFLT_MULSUM_NEON_METHOD_FUNC() \
1462
+ DEF_MULSUM_SFLT_NEON_ITER_FUNC() \
1463
+ DEF_FLT_MULSUM_NAN_ITER_FUNC(sfloat) \
1464
+ DEF_FLT_MULSUM_RUBY_FUNCS(sfloat, numo_cSFloat)
1465
+
1466
// Expands, in this order, DEF_MULSUM_DFLT_NEON_ITER_FUNC (which defines
// iter_dfloat_mulsum), DEF_FLT_MULSUM_NAN_ITER_FUNC(dfloat) and
// DEF_FLT_MULSUM_RUBY_FUNCS(dfloat, numo_cDFloat), giving one entry point that
// emits the dfloat mulsum definitions for NEON builds.
// NOTE(review): the last two macros are defined outside this excerpt;
// presumably the NaN-aware iterator and the Ruby-facing wrappers -- confirm.
+ #define DEF_NARRAY_DFLT_MULSUM_NEON_METHOD_FUNC() \
1467
+ DEF_MULSUM_DFLT_NEON_ITER_FUNC() \
1468
+ DEF_FLT_MULSUM_NAN_ITER_FUNC(dfloat) \
1469
+ DEF_FLT_MULSUM_RUBY_FUNCS(dfloat, numo_cDFloat)
1470
+
185
1471
  #endif /* NUMO_NARRAY_MH_MULSUM_H */