numo-narray-alt 0.10.5 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -0
- data/ext/numo/narray/extconf.rb +2 -0
- data/ext/numo/narray/numo/narray.h +3 -3
- data/ext/numo/narray/src/mh/math/sqrt.h +372 -0
- data/ext/numo/narray/src/mh/mulsum.h +1360 -74
- data/ext/numo/narray/src/mh/op/add.h +20 -0
- data/ext/numo/narray/src/mh/op/binary_func.h +542 -0
- data/ext/numo/narray/src/mh/op/div.h +20 -0
- data/ext/numo/narray/src/mh/op/mul.h +20 -0
- data/ext/numo/narray/src/mh/op/sub.h +20 -0
- data/ext/numo/narray/src/mh/sort.h +4 -4
- data/ext/numo/narray/src/t_bit.c +0 -5
- data/ext/numo/narray/src/t_dcomplex.c +0 -5
- data/ext/numo/narray/src/t_dfloat.c +35 -3
- data/ext/numo/narray/src/t_int16.c +0 -5
- data/ext/numo/narray/src/t_int32.c +0 -5
- data/ext/numo/narray/src/t_int64.c +0 -5
- data/ext/numo/narray/src/t_int8.c +0 -5
- data/ext/numo/narray/src/t_robject.c +0 -5
- data/ext/numo/narray/src/t_scomplex.c +0 -5
- data/ext/numo/narray/src/t_sfloat.c +35 -3
- data/ext/numo/narray/src/t_uint16.c +0 -5
- data/ext/numo/narray/src/t_uint32.c +0 -5
- data/ext/numo/narray/src/t_uint64.c +0 -5
- data/ext/numo/narray/src/t_uint8.c +0 -5
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9b2e2bc7cc99b7ef2b868b588360ff66299a6570b4d6ea98c45bbb357eed661b
|
|
4
|
+
data.tar.gz: '0887ab3061ad5add393ecd99921ca88e3a06545c9617ffc9686550cb87b3e843'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '09f47d21922f7e24e7915222eb9ddacadf4392156e53a15aee05d4f2fc6ebdafce244c3624de375b10411768d7f1a38de7d081504ef0fe21b6b1b82156ef9410'
|
|
7
|
+
data.tar.gz: 4cc3900b5631971fc76195eee5ecf66a6c1647a604d19997b0bcece1d1341f9245ace1c405c3f5e9dbc493389f456c4c9c817ea145a1c3a47f2180edea52ae36
|
data/README.md
CHANGED
|
@@ -24,6 +24,15 @@ This project is in no way intended to adversely affect the development of the or
|
|
|
24
24
|
$ gem install numo-narray-alt
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
+
### Build options
|
|
28
|
+
|
|
29
|
+
By default, the floating-point classes (`Numo::SFloat` / `Numo::DFloat`) use SIMD instructions
|
|
30
|
+
(SSE2 / AVX / NEON) when the target CPU supports them. To build without SIMD, pass `--with-no-simd`:
|
|
31
|
+
|
|
32
|
+
```shell
|
|
33
|
+
$ gem install numo-narray-alt -- --with-no-simd
|
|
34
|
+
```
|
|
35
|
+
|
|
27
36
|
## Usage
|
|
28
37
|
|
|
29
38
|
The usage is exactly the same as Numo::NArray.
|
data/ext/numo/narray/extconf.rb
CHANGED
|
data/ext/numo/narray/numo/narray.h
CHANGED
|
@@ -13,10 +13,10 @@ extern "C" {
|
|
|
13
13
|
#endif
|
|
14
14
|
#endif
|
|
15
15
|
|
|
16
|
-
#define NARRAY_VERSION "0.10.5"
|
|
16
|
+
#define NARRAY_VERSION "0.11.0"
|
|
17
17
|
#define NARRAY_VERSION_MAJOR 0
|
|
18
|
-
#define NARRAY_VERSION_MINOR 10
|
|
19
|
-
#define NARRAY_VERSION_PATCH 5
|
|
18
|
+
#define NARRAY_VERSION_MINOR 11
|
|
19
|
+
#define NARRAY_VERSION_PATCH 0
|
|
20
20
|
#define NARRAY_VERSION_CODE \
|
|
21
21
|
(NARRAY_VERSION_MAJOR * 10000 + NARRAY_VERSION_MINOR * 100 + NARRAY_VERSION_PATCH)
|
|
22
22
|
|
|
data/ext/numo/narray/src/mh/math/sqrt.h
CHANGED
|
@@ -200,4 +200,376 @@
|
|
|
200
200
|
return na_ndloop(&ndf, 1, a1); \
|
|
201
201
|
}
|
|
202
202
|
|
|
203
|
+
#define DEF_NARRAY_FLT_SQRT_AVX_SGL_METHOD_FUNC(tDType, tNAryClass) \
|
|
204
|
+
static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { \
|
|
205
|
+
size_t i = 0; \
|
|
206
|
+
size_t n; \
|
|
207
|
+
char *p1, *p2; \
|
|
208
|
+
ssize_t s1, s2; \
|
|
209
|
+
size_t *idx1, *idx2; \
|
|
210
|
+
tDType x; \
|
|
211
|
+
size_t cnt; \
|
|
212
|
+
size_t cnt_simd_loop = -1; \
|
|
213
|
+
__m256 a; \
|
|
214
|
+
size_t num_pack; \
|
|
215
|
+
num_pack = AVX_ALIGNMENT_SIZE / sizeof(tDType); \
|
|
216
|
+
\
|
|
217
|
+
INIT_COUNTER(lp, n); \
|
|
218
|
+
INIT_PTR_IDX(lp, 0, p1, s1, idx1); \
|
|
219
|
+
INIT_PTR_IDX(lp, 1, p2, s2, idx2); \
|
|
220
|
+
\
|
|
221
|
+
if (idx1) { \
|
|
222
|
+
if (idx2) { \
|
|
223
|
+
for (i = 0; i < n; i++) { \
|
|
224
|
+
GET_DATA_INDEX(p1, idx1, tDType, x); \
|
|
225
|
+
x = m_sqrt(x); \
|
|
226
|
+
SET_DATA_INDEX(p2, idx2, tDType, x); \
|
|
227
|
+
} \
|
|
228
|
+
} else { \
|
|
229
|
+
for (i = 0; i < n; i++) { \
|
|
230
|
+
GET_DATA_INDEX(p1, idx1, tDType, x); \
|
|
231
|
+
x = m_sqrt(x); \
|
|
232
|
+
SET_DATA_STRIDE(p2, s2, tDType, x); \
|
|
233
|
+
} \
|
|
234
|
+
} \
|
|
235
|
+
} else { \
|
|
236
|
+
if (idx2) { \
|
|
237
|
+
for (i = 0; i < n; i++) { \
|
|
238
|
+
GET_DATA_STRIDE(p1, s1, tDType, x); \
|
|
239
|
+
x = m_sqrt(x); \
|
|
240
|
+
SET_DATA_INDEX(p2, idx2, tDType, x); \
|
|
241
|
+
} \
|
|
242
|
+
} else { \
|
|
243
|
+
if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) { \
|
|
244
|
+
if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
|
|
245
|
+
if ((n >= num_pack) && \
|
|
246
|
+
is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], AVX_ALIGNMENT_SIZE)) { \
|
|
247
|
+
cnt = get_count_of_elements_not_aligned_to_simd_size( \
|
|
248
|
+
&((tDType*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(tDType) \
|
|
249
|
+
); \
|
|
250
|
+
for (i = 0; i < cnt; i++) { \
|
|
251
|
+
((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
|
|
252
|
+
} \
|
|
253
|
+
cnt_simd_loop = (n - i) % num_pack; \
|
|
254
|
+
if (p1 == p2) { \
|
|
255
|
+
for (; i < n - cnt_simd_loop; i += num_pack) { \
|
|
256
|
+
a = _mm256_load_ps(&((tDType*)p1)[i]); \
|
|
257
|
+
a = _mm256_sqrt_ps(a); \
|
|
258
|
+
_mm256_store_ps(&((tDType*)p1)[i], a); \
|
|
259
|
+
} \
|
|
260
|
+
} else { \
|
|
261
|
+
for (; i < n - cnt_simd_loop; i += num_pack) { \
|
|
262
|
+
a = _mm256_load_ps(&((tDType*)p1)[i]); \
|
|
263
|
+
a = _mm256_sqrt_ps(a); \
|
|
264
|
+
_mm256_stream_ps(&((tDType*)p2)[i], a); \
|
|
265
|
+
} \
|
|
266
|
+
} \
|
|
267
|
+
} \
|
|
268
|
+
if (cnt_simd_loop != 0) { \
|
|
269
|
+
for (; i < n; i++) { \
|
|
270
|
+
((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
|
|
271
|
+
} \
|
|
272
|
+
} \
|
|
273
|
+
return; \
|
|
274
|
+
} \
|
|
275
|
+
if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
|
|
276
|
+
for (i = 0; i < n; i++) { \
|
|
277
|
+
*(tDType*)p2 = m_sqrt(*(tDType*)p1); \
|
|
278
|
+
p1 += s1; \
|
|
279
|
+
p2 += s2; \
|
|
280
|
+
} \
|
|
281
|
+
return; \
|
|
282
|
+
} \
|
|
283
|
+
} \
|
|
284
|
+
for (i = 0; i < n; i++) { \
|
|
285
|
+
GET_DATA_STRIDE(p1, s1, tDType, x); \
|
|
286
|
+
x = m_sqrt(x); \
|
|
287
|
+
SET_DATA_STRIDE(p2, s2, tDType, x); \
|
|
288
|
+
} \
|
|
289
|
+
} \
|
|
290
|
+
} \
|
|
291
|
+
} \
|
|
292
|
+
\
|
|
293
|
+
static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { \
|
|
294
|
+
ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; \
|
|
295
|
+
ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
|
|
296
|
+
ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout }; \
|
|
297
|
+
return na_ndloop(&ndf, 1, a1); \
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
#define DEF_NARRAY_FLT_SQRT_AVX_DBL_METHOD_FUNC(tDType, tNAryClass) \
|
|
301
|
+
static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { \
|
|
302
|
+
size_t i = 0; \
|
|
303
|
+
size_t n; \
|
|
304
|
+
char *p1, *p2; \
|
|
305
|
+
ssize_t s1, s2; \
|
|
306
|
+
size_t *idx1, *idx2; \
|
|
307
|
+
tDType x; \
|
|
308
|
+
size_t cnt; \
|
|
309
|
+
size_t cnt_simd_loop = -1; \
|
|
310
|
+
__m256d a; \
|
|
311
|
+
size_t num_pack; \
|
|
312
|
+
num_pack = AVX_ALIGNMENT_SIZE / sizeof(tDType); \
|
|
313
|
+
\
|
|
314
|
+
INIT_COUNTER(lp, n); \
|
|
315
|
+
INIT_PTR_IDX(lp, 0, p1, s1, idx1); \
|
|
316
|
+
INIT_PTR_IDX(lp, 1, p2, s2, idx2); \
|
|
317
|
+
\
|
|
318
|
+
if (idx1) { \
|
|
319
|
+
if (idx2) { \
|
|
320
|
+
for (i = 0; i < n; i++) { \
|
|
321
|
+
GET_DATA_INDEX(p1, idx1, tDType, x); \
|
|
322
|
+
x = m_sqrt(x); \
|
|
323
|
+
SET_DATA_INDEX(p2, idx2, tDType, x); \
|
|
324
|
+
} \
|
|
325
|
+
} else { \
|
|
326
|
+
for (i = 0; i < n; i++) { \
|
|
327
|
+
GET_DATA_INDEX(p1, idx1, tDType, x); \
|
|
328
|
+
x = m_sqrt(x); \
|
|
329
|
+
SET_DATA_STRIDE(p2, s2, tDType, x); \
|
|
330
|
+
} \
|
|
331
|
+
} \
|
|
332
|
+
} else { \
|
|
333
|
+
if (idx2) { \
|
|
334
|
+
for (i = 0; i < n; i++) { \
|
|
335
|
+
GET_DATA_STRIDE(p1, s1, tDType, x); \
|
|
336
|
+
x = m_sqrt(x); \
|
|
337
|
+
SET_DATA_INDEX(p2, idx2, tDType, x); \
|
|
338
|
+
} \
|
|
339
|
+
} else { \
|
|
340
|
+
if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) { \
|
|
341
|
+
if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
|
|
342
|
+
if ((n >= num_pack) && \
|
|
343
|
+
is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], AVX_ALIGNMENT_SIZE)) { \
|
|
344
|
+
cnt = get_count_of_elements_not_aligned_to_simd_size( \
|
|
345
|
+
&((tDType*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(tDType) \
|
|
346
|
+
); \
|
|
347
|
+
for (i = 0; i < cnt; i++) { \
|
|
348
|
+
((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
|
|
349
|
+
} \
|
|
350
|
+
cnt_simd_loop = (n - i) % num_pack; \
|
|
351
|
+
if (p1 == p2) { \
|
|
352
|
+
for (; i < n - cnt_simd_loop; i += num_pack) { \
|
|
353
|
+
a = _mm256_load_pd(&((tDType*)p1)[i]); \
|
|
354
|
+
a = _mm256_sqrt_pd(a); \
|
|
355
|
+
_mm256_store_pd(&((tDType*)p1)[i], a); \
|
|
356
|
+
} \
|
|
357
|
+
} else { \
|
|
358
|
+
for (; i < n - cnt_simd_loop; i += num_pack) { \
|
|
359
|
+
a = _mm256_load_pd(&((tDType*)p1)[i]); \
|
|
360
|
+
a = _mm256_sqrt_pd(a); \
|
|
361
|
+
_mm256_stream_pd(&((tDType*)p2)[i], a); \
|
|
362
|
+
} \
|
|
363
|
+
} \
|
|
364
|
+
} \
|
|
365
|
+
if (cnt_simd_loop != 0) { \
|
|
366
|
+
for (; i < n; i++) { \
|
|
367
|
+
((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
|
|
368
|
+
} \
|
|
369
|
+
} \
|
|
370
|
+
return; \
|
|
371
|
+
} \
|
|
372
|
+
if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
|
|
373
|
+
for (i = 0; i < n; i++) { \
|
|
374
|
+
*(tDType*)p2 = m_sqrt(*(tDType*)p1); \
|
|
375
|
+
p1 += s1; \
|
|
376
|
+
p2 += s2; \
|
|
377
|
+
} \
|
|
378
|
+
return; \
|
|
379
|
+
} \
|
|
380
|
+
} \
|
|
381
|
+
for (i = 0; i < n; i++) { \
|
|
382
|
+
GET_DATA_STRIDE(p1, s1, tDType, x); \
|
|
383
|
+
x = m_sqrt(x); \
|
|
384
|
+
SET_DATA_STRIDE(p2, s2, tDType, x); \
|
|
385
|
+
} \
|
|
386
|
+
} \
|
|
387
|
+
} \
|
|
388
|
+
} \
|
|
389
|
+
\
|
|
390
|
+
static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { \
|
|
391
|
+
ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; \
|
|
392
|
+
ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
|
|
393
|
+
ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout }; \
|
|
394
|
+
return na_ndloop(&ndf, 1, a1); \
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
#define DEF_NARRAY_FLT_SQRT_NEON_SGL_METHOD_FUNC(tDType, tNAryClass) \
|
|
398
|
+
static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { \
|
|
399
|
+
size_t i = 0; \
|
|
400
|
+
size_t n; \
|
|
401
|
+
char *p1, *p2; \
|
|
402
|
+
ssize_t s1, s2; \
|
|
403
|
+
size_t *idx1, *idx2; \
|
|
404
|
+
tDType x; \
|
|
405
|
+
size_t cnt; \
|
|
406
|
+
size_t cnt_simd_loop = -1; \
|
|
407
|
+
float32x4_t a; \
|
|
408
|
+
size_t num_pack; \
|
|
409
|
+
num_pack = NEON_ALIGNMENT_SIZE / sizeof(tDType); \
|
|
410
|
+
\
|
|
411
|
+
INIT_COUNTER(lp, n); \
|
|
412
|
+
INIT_PTR_IDX(lp, 0, p1, s1, idx1); \
|
|
413
|
+
INIT_PTR_IDX(lp, 1, p2, s2, idx2); \
|
|
414
|
+
\
|
|
415
|
+
if (idx1) { \
|
|
416
|
+
if (idx2) { \
|
|
417
|
+
for (i = 0; i < n; i++) { \
|
|
418
|
+
GET_DATA_INDEX(p1, idx1, tDType, x); \
|
|
419
|
+
x = m_sqrt(x); \
|
|
420
|
+
SET_DATA_INDEX(p2, idx2, tDType, x); \
|
|
421
|
+
} \
|
|
422
|
+
} else { \
|
|
423
|
+
for (i = 0; i < n; i++) { \
|
|
424
|
+
GET_DATA_INDEX(p1, idx1, tDType, x); \
|
|
425
|
+
x = m_sqrt(x); \
|
|
426
|
+
SET_DATA_STRIDE(p2, s2, tDType, x); \
|
|
427
|
+
} \
|
|
428
|
+
} \
|
|
429
|
+
} else { \
|
|
430
|
+
if (idx2) { \
|
|
431
|
+
for (i = 0; i < n; i++) { \
|
|
432
|
+
GET_DATA_STRIDE(p1, s1, tDType, x); \
|
|
433
|
+
x = m_sqrt(x); \
|
|
434
|
+
SET_DATA_INDEX(p2, idx2, tDType, x); \
|
|
435
|
+
} \
|
|
436
|
+
} else { \
|
|
437
|
+
if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) { \
|
|
438
|
+
if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
|
|
439
|
+
if ((n >= num_pack) && \
|
|
440
|
+
is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
|
|
441
|
+
cnt = get_count_of_elements_not_aligned_to_simd_size( \
|
|
442
|
+
&((tDType*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(tDType) \
|
|
443
|
+
); \
|
|
444
|
+
for (i = 0; i < cnt; i++) { \
|
|
445
|
+
((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
|
|
446
|
+
} \
|
|
447
|
+
cnt_simd_loop = (n - i) % num_pack; \
|
|
448
|
+
for (; i < n - cnt_simd_loop; i += num_pack) { \
|
|
449
|
+
a = vld1q_f32(&((tDType*)p1)[i]); \
|
|
450
|
+
a = vsqrtq_f32(a); \
|
|
451
|
+
vst1q_f32(&((tDType*)p2)[i], a); \
|
|
452
|
+
} \
|
|
453
|
+
} \
|
|
454
|
+
if (cnt_simd_loop != 0) { \
|
|
455
|
+
for (; i < n; i++) { \
|
|
456
|
+
((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
|
|
457
|
+
} \
|
|
458
|
+
} \
|
|
459
|
+
return; \
|
|
460
|
+
} \
|
|
461
|
+
if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
|
|
462
|
+
for (i = 0; i < n; i++) { \
|
|
463
|
+
*(tDType*)p2 = m_sqrt(*(tDType*)p1); \
|
|
464
|
+
p1 += s1; \
|
|
465
|
+
p2 += s2; \
|
|
466
|
+
} \
|
|
467
|
+
return; \
|
|
468
|
+
} \
|
|
469
|
+
} \
|
|
470
|
+
for (i = 0; i < n; i++) { \
|
|
471
|
+
GET_DATA_STRIDE(p1, s1, tDType, x); \
|
|
472
|
+
x = m_sqrt(x); \
|
|
473
|
+
SET_DATA_STRIDE(p2, s2, tDType, x); \
|
|
474
|
+
} \
|
|
475
|
+
} \
|
|
476
|
+
} \
|
|
477
|
+
} \
|
|
478
|
+
\
|
|
479
|
+
static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { \
|
|
480
|
+
ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; \
|
|
481
|
+
ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
|
|
482
|
+
ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout }; \
|
|
483
|
+
return na_ndloop(&ndf, 1, a1); \
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
#define DEF_NARRAY_FLT_SQRT_NEON_DBL_METHOD_FUNC(tDType, tNAryClass) \
|
|
487
|
+
static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { \
|
|
488
|
+
size_t i = 0; \
|
|
489
|
+
size_t n; \
|
|
490
|
+
char *p1, *p2; \
|
|
491
|
+
ssize_t s1, s2; \
|
|
492
|
+
size_t *idx1, *idx2; \
|
|
493
|
+
tDType x; \
|
|
494
|
+
size_t cnt; \
|
|
495
|
+
size_t cnt_simd_loop = -1; \
|
|
496
|
+
float64x2_t a; \
|
|
497
|
+
size_t num_pack; \
|
|
498
|
+
num_pack = NEON_ALIGNMENT_SIZE / sizeof(tDType); \
|
|
499
|
+
\
|
|
500
|
+
INIT_COUNTER(lp, n); \
|
|
501
|
+
INIT_PTR_IDX(lp, 0, p1, s1, idx1); \
|
|
502
|
+
INIT_PTR_IDX(lp, 1, p2, s2, idx2); \
|
|
503
|
+
\
|
|
504
|
+
if (idx1) { \
|
|
505
|
+
if (idx2) { \
|
|
506
|
+
for (i = 0; i < n; i++) { \
|
|
507
|
+
GET_DATA_INDEX(p1, idx1, tDType, x); \
|
|
508
|
+
x = m_sqrt(x); \
|
|
509
|
+
SET_DATA_INDEX(p2, idx2, tDType, x); \
|
|
510
|
+
} \
|
|
511
|
+
} else { \
|
|
512
|
+
for (i = 0; i < n; i++) { \
|
|
513
|
+
GET_DATA_INDEX(p1, idx1, tDType, x); \
|
|
514
|
+
x = m_sqrt(x); \
|
|
515
|
+
SET_DATA_STRIDE(p2, s2, tDType, x); \
|
|
516
|
+
} \
|
|
517
|
+
} \
|
|
518
|
+
} else { \
|
|
519
|
+
if (idx2) { \
|
|
520
|
+
for (i = 0; i < n; i++) { \
|
|
521
|
+
GET_DATA_STRIDE(p1, s1, tDType, x); \
|
|
522
|
+
x = m_sqrt(x); \
|
|
523
|
+
SET_DATA_INDEX(p2, idx2, tDType, x); \
|
|
524
|
+
} \
|
|
525
|
+
} else { \
|
|
526
|
+
if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) { \
|
|
527
|
+
if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
|
|
528
|
+
if ((n >= num_pack) && \
|
|
529
|
+
is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
|
|
530
|
+
cnt = get_count_of_elements_not_aligned_to_simd_size( \
|
|
531
|
+
&((tDType*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(tDType) \
|
|
532
|
+
); \
|
|
533
|
+
for (i = 0; i < cnt; i++) { \
|
|
534
|
+
((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
|
|
535
|
+
} \
|
|
536
|
+
cnt_simd_loop = (n - i) % num_pack; \
|
|
537
|
+
for (; i < n - cnt_simd_loop; i += num_pack) { \
|
|
538
|
+
a = vld1q_f64(&((tDType*)p1)[i]); \
|
|
539
|
+
a = vsqrtq_f64(a); \
|
|
540
|
+
vst1q_f64(&((tDType*)p2)[i], a); \
|
|
541
|
+
} \
|
|
542
|
+
} \
|
|
543
|
+
if (cnt_simd_loop != 0) { \
|
|
544
|
+
for (; i < n; i++) { \
|
|
545
|
+
((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
|
|
546
|
+
} \
|
|
547
|
+
} \
|
|
548
|
+
return; \
|
|
549
|
+
} \
|
|
550
|
+
if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
|
|
551
|
+
for (i = 0; i < n; i++) { \
|
|
552
|
+
*(tDType*)p2 = m_sqrt(*(tDType*)p1); \
|
|
553
|
+
p1 += s1; \
|
|
554
|
+
p2 += s2; \
|
|
555
|
+
} \
|
|
556
|
+
return; \
|
|
557
|
+
} \
|
|
558
|
+
} \
|
|
559
|
+
for (i = 0; i < n; i++) { \
|
|
560
|
+
GET_DATA_STRIDE(p1, s1, tDType, x); \
|
|
561
|
+
x = m_sqrt(x); \
|
|
562
|
+
SET_DATA_STRIDE(p2, s2, tDType, x); \
|
|
563
|
+
} \
|
|
564
|
+
} \
|
|
565
|
+
} \
|
|
566
|
+
} \
|
|
567
|
+
\
|
|
568
|
+
static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { \
|
|
569
|
+
ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; \
|
|
570
|
+
ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
|
|
571
|
+
ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout }; \
|
|
572
|
+
return na_ndloop(&ndf, 1, a1); \
|
|
573
|
+
}
|
|
574
|
+
|
|
203
575
|
#endif /* NUMO_NARRAY_MH_MATH_SQRT_H */
|