numo-narray-alt 0.10.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b00e8da08175d2c19b50dc634303140c251a2139c223b2c8f3ef515858314da0
4
- data.tar.gz: 484a78cdd0959d3d2a09e438c8eaeef953964612590c3c44050d5ca906e65a75
3
+ metadata.gz: 9b2e2bc7cc99b7ef2b868b588360ff66299a6570b4d6ea98c45bbb357eed661b
4
+ data.tar.gz: '0887ab3061ad5add393ecd99921ca88e3a06545c9617ffc9686550cb87b3e843'
5
5
  SHA512:
6
- metadata.gz: 58600b9b39c99a28f4ec6df40d1dfba22db1c1aa38dc50d61fa905f31cdb3dcd623a0b07b81519d8341d0afa644b4ec3695c5ddf9889ffbb82decca415b34959
7
- data.tar.gz: e0112d66a2bd84afb953ab911119884969fb6004815313c2d85e8898963393de4434d7ca0286bc9da84a4d880dfc16f5c609c218fcfd61fc65add01186e10d8c
6
+ metadata.gz: '09f47d21922f7e24e7915222eb9ddacadf4392156e53a15aee05d4f2fc6ebdafce244c3624de375b10411768d7f1a38de7d081504ef0fe21b6b1b82156ef9410'
7
+ data.tar.gz: 4cc3900b5631971fc76195eee5ecf66a6c1647a604d19997b0bcece1d1341f9245ace1c405c3f5e9dbc493389f456c4c9c817ea145a1c3a47f2180edea52ae36
data/README.md CHANGED
@@ -24,6 +24,15 @@ This project is in no way intended to adversely affect the development of the or
24
24
  $ gem install numo-narray-alt
25
25
  ```
26
26
 
27
+ ### Build options
28
+
29
+ By default, the floating-point classes (`Numo::SFloat` / `Numo::DFloat`) use SIMD instructions
30
+ (SSE2 / AVX / NEON) when the target CPU supports them. To build without SIMD, pass `--with-no-simd`:
31
+
32
+ ```shell
33
+ $ gem install numo-narray-alt -- --with-no-simd
34
+ ```
35
+
27
36
  ## Usage
28
37
 
29
38
  The usage is exactly the same as Numo::NArray.
@@ -66,6 +66,8 @@ have_func('RTYPEDDATA_GET_DATA')
66
66
 
67
67
  have_var('rb_cComplex')
68
68
 
69
+ $defs << '-DNUMO_NO_SIMD' if with_config('no-simd', false)
70
+
69
71
  $objs = srcs.collect { |i| "#{i}.o" }
70
72
 
71
73
  create_header d('numo/extconf.h')
@@ -13,10 +13,10 @@ extern "C" {
13
13
  #endif
14
14
  #endif
15
15
 
16
- #define NARRAY_VERSION "0.10.4"
16
+ #define NARRAY_VERSION "0.11.0"
17
17
  #define NARRAY_VERSION_MAJOR 0
18
- #define NARRAY_VERSION_MINOR 10
19
- #define NARRAY_VERSION_PATCH 4
18
+ #define NARRAY_VERSION_MINOR 11
19
+ #define NARRAY_VERSION_PATCH 0
20
20
  #define NARRAY_VERSION_CODE \
21
21
  (NARRAY_VERSION_MAJOR * 10000 + NARRAY_VERSION_MINOR * 100 + NARRAY_VERSION_PATCH)
22
22
 
@@ -12,7 +12,7 @@ extern double pow(double, double);
12
12
  #define m_zero 0.0
13
13
  #define m_one 1.0
14
14
 
15
- #define m_num_to_data(x) (NIL_P(x) ? nan("") : NUM2DBL(x))
15
+ #define m_num_to_data(x) f_num_to_data(x)
16
16
  #define m_data_to_num(x) rb_float_new(x)
17
17
 
18
18
  #define m_from_double(x) (x)
@@ -126,6 +126,10 @@ extern double pow(double, double);
126
126
  #define m_ldexp(x, y) ldexp(x, y)
127
127
  #define m_frexp(x, exp) frexp(x, exp)
128
128
 
129
/*
 * Convert a Ruby VALUE to this type's element value (dtype).
 *
 * Returns nan("") when x is nil (NIL_P), otherwise NUM2DBL(x).
 * Being a function, the argument expression is evaluated exactly once.
 */
+ static inline dtype f_num_to_data(VALUE x) {
130
+ return NIL_P(x) ? nan("") : NUM2DBL(x);
131
+ }
132
+
129
133
  static inline dtype pow_int(dtype x, int p) {
130
134
  dtype r = 1;
131
135
  switch (p) {
@@ -200,4 +200,376 @@
200
200
  return na_ndloop(&ndf, 1, a1); \
201
201
  }
202
202
 
203
/*
 * DEF_NARRAY_FLT_SQRT_AVX_SGL_METHOD_FUNC(tDType, tNAryClass)
 *
 * Expands to two static functions: the element loop
 * iter_<tDType>_math_s_sqrt and the wrapper <tDType>_math_s_sqrt, which hands
 * that loop to na_ndloop with one tNAryClass input and one tNAryClass output.
 * The wrapper's `mod` argument is not used.
 *
 * The loop applies m_sqrt to each element for every index/stride combination.
 * The AVX branch (__m256 with the packed-single *_ps intrinsics) is taken only
 * when neither operand is indexed, both pointers pass is_aligned for
 * sizeof(tDType), both strides equal sizeof(tDType), n >= num_pack and
 * is_same_aligned2 accepts the pointer pair. In that branch:
 *   1. the first cnt elements are computed with m_sqrt;
 *   2. whole packs of num_pack elements go through _mm256_load_ps and
 *      _mm256_sqrt_ps, written back with _mm256_store_ps when p1 == p2 and
 *      with _mm256_stream_ps otherwise;
 *   3. the remaining (n - cnt) % num_pack elements are computed with m_sqrt.
 *
 * cnt_simd_loop starts at -1, i.e. a non-zero size_t, so when the AVX branch
 * is skipped the trailing scalar loop still runs from i = 0 over all n
 * elements.
 *
 * NOTE(review): is_aligned, is_same_aligned2, is_aligned_step and
 * get_count_of_elements_not_aligned_to_simd_size are defined elsewhere.
 * cnt is presumably the number of leading elements before the next
 * AVX_ALIGNMENT_SIZE boundary -- confirm against their definitions.
 */
+ #define DEF_NARRAY_FLT_SQRT_AVX_SGL_METHOD_FUNC(tDType, tNAryClass) \
204
+ static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { \
205
+ size_t i = 0; \
206
+ size_t n; \
207
+ char *p1, *p2; \
208
+ ssize_t s1, s2; \
209
+ size_t *idx1, *idx2; \
210
+ tDType x; \
211
+ size_t cnt; \
212
+ size_t cnt_simd_loop = -1; \
213
+ __m256 a; \
214
+ size_t num_pack; \
215
+ num_pack = AVX_ALIGNMENT_SIZE / sizeof(tDType); \
216
+ \
217
+ INIT_COUNTER(lp, n); \
218
+ INIT_PTR_IDX(lp, 0, p1, s1, idx1); \
219
+ INIT_PTR_IDX(lp, 1, p2, s2, idx2); \
220
+ \
221
+ if (idx1) { \
222
+ if (idx2) { \
223
+ for (i = 0; i < n; i++) { \
224
+ GET_DATA_INDEX(p1, idx1, tDType, x); \
225
+ x = m_sqrt(x); \
226
+ SET_DATA_INDEX(p2, idx2, tDType, x); \
227
+ } \
228
+ } else { \
229
+ for (i = 0; i < n; i++) { \
230
+ GET_DATA_INDEX(p1, idx1, tDType, x); \
231
+ x = m_sqrt(x); \
232
+ SET_DATA_STRIDE(p2, s2, tDType, x); \
233
+ } \
234
+ } \
235
+ } else { \
236
+ if (idx2) { \
237
+ for (i = 0; i < n; i++) { \
238
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
239
+ x = m_sqrt(x); \
240
+ SET_DATA_INDEX(p2, idx2, tDType, x); \
241
+ } \
242
+ } else { \
243
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) { \
244
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
245
+ if ((n >= num_pack) && \
246
+ is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], AVX_ALIGNMENT_SIZE)) { \
247
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
248
+ &((tDType*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(tDType) \
249
+ ); \
250
+ for (i = 0; i < cnt; i++) { \
251
+ ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
252
+ } \
253
+ cnt_simd_loop = (n - i) % num_pack; \
254
+ if (p1 == p2) { \
255
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
256
+ a = _mm256_load_ps(&((tDType*)p1)[i]); \
257
+ a = _mm256_sqrt_ps(a); \
258
+ _mm256_store_ps(&((tDType*)p1)[i], a); \
259
+ } \
260
+ } else { \
261
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
262
+ a = _mm256_load_ps(&((tDType*)p1)[i]); \
263
+ a = _mm256_sqrt_ps(a); \
264
+ _mm256_stream_ps(&((tDType*)p2)[i], a); \
265
+ } \
266
+ } \
267
+ } \
268
+ if (cnt_simd_loop != 0) { \
269
+ for (; i < n; i++) { \
270
+ ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
271
+ } \
272
+ } \
273
+ return; \
274
+ } \
275
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
276
+ for (i = 0; i < n; i++) { \
277
+ *(tDType*)p2 = m_sqrt(*(tDType*)p1); \
278
+ p1 += s1; \
279
+ p2 += s2; \
280
+ } \
281
+ return; \
282
+ } \
283
+ } \
284
+ for (i = 0; i < n; i++) { \
285
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
286
+ x = m_sqrt(x); \
287
+ SET_DATA_STRIDE(p2, s2, tDType, x); \
288
+ } \
289
+ } \
290
+ } \
291
+ } \
292
+ \
293
+ static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { \
294
+ ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; \
295
+ ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
296
+ ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout }; \
297
+ return na_ndloop(&ndf, 1, a1); \
298
+ }
299
+
300
/*
 * DEF_NARRAY_FLT_SQRT_AVX_DBL_METHOD_FUNC(tDType, tNAryClass)
 *
 * Expands to two static functions: the element loop
 * iter_<tDType>_math_s_sqrt and the wrapper <tDType>_math_s_sqrt, which hands
 * that loop to na_ndloop with one tNAryClass input and one tNAryClass output.
 * The wrapper's `mod` argument is not used.
 *
 * The loop applies m_sqrt to each element for every index/stride combination.
 * The AVX branch (__m256d with the packed-double *_pd intrinsics) is taken
 * only when neither operand is indexed, both pointers pass is_aligned for
 * sizeof(tDType), both strides equal sizeof(tDType), n >= num_pack and
 * is_same_aligned2 accepts the pointer pair. In that branch:
 *   1. the first cnt elements are computed with m_sqrt;
 *   2. whole packs of num_pack elements go through _mm256_load_pd and
 *      _mm256_sqrt_pd, written back with _mm256_store_pd when p1 == p2 and
 *      with _mm256_stream_pd otherwise;
 *   3. the remaining (n - cnt) % num_pack elements are computed with m_sqrt.
 *
 * cnt_simd_loop starts at -1, i.e. a non-zero size_t, so when the AVX branch
 * is skipped the trailing scalar loop still runs from i = 0 over all n
 * elements.
 *
 * NOTE(review): is_aligned, is_same_aligned2, is_aligned_step and
 * get_count_of_elements_not_aligned_to_simd_size are defined elsewhere.
 * cnt is presumably the number of leading elements before the next
 * AVX_ALIGNMENT_SIZE boundary -- confirm against their definitions.
 */
+ #define DEF_NARRAY_FLT_SQRT_AVX_DBL_METHOD_FUNC(tDType, tNAryClass) \
301
+ static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { \
302
+ size_t i = 0; \
303
+ size_t n; \
304
+ char *p1, *p2; \
305
+ ssize_t s1, s2; \
306
+ size_t *idx1, *idx2; \
307
+ tDType x; \
308
+ size_t cnt; \
309
+ size_t cnt_simd_loop = -1; \
310
+ __m256d a; \
311
+ size_t num_pack; \
312
+ num_pack = AVX_ALIGNMENT_SIZE / sizeof(tDType); \
313
+ \
314
+ INIT_COUNTER(lp, n); \
315
+ INIT_PTR_IDX(lp, 0, p1, s1, idx1); \
316
+ INIT_PTR_IDX(lp, 1, p2, s2, idx2); \
317
+ \
318
+ if (idx1) { \
319
+ if (idx2) { \
320
+ for (i = 0; i < n; i++) { \
321
+ GET_DATA_INDEX(p1, idx1, tDType, x); \
322
+ x = m_sqrt(x); \
323
+ SET_DATA_INDEX(p2, idx2, tDType, x); \
324
+ } \
325
+ } else { \
326
+ for (i = 0; i < n; i++) { \
327
+ GET_DATA_INDEX(p1, idx1, tDType, x); \
328
+ x = m_sqrt(x); \
329
+ SET_DATA_STRIDE(p2, s2, tDType, x); \
330
+ } \
331
+ } \
332
+ } else { \
333
+ if (idx2) { \
334
+ for (i = 0; i < n; i++) { \
335
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
336
+ x = m_sqrt(x); \
337
+ SET_DATA_INDEX(p2, idx2, tDType, x); \
338
+ } \
339
+ } else { \
340
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) { \
341
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
342
+ if ((n >= num_pack) && \
343
+ is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], AVX_ALIGNMENT_SIZE)) { \
344
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
345
+ &((tDType*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(tDType) \
346
+ ); \
347
+ for (i = 0; i < cnt; i++) { \
348
+ ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
349
+ } \
350
+ cnt_simd_loop = (n - i) % num_pack; \
351
+ if (p1 == p2) { \
352
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
353
+ a = _mm256_load_pd(&((tDType*)p1)[i]); \
354
+ a = _mm256_sqrt_pd(a); \
355
+ _mm256_store_pd(&((tDType*)p1)[i], a); \
356
+ } \
357
+ } else { \
358
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
359
+ a = _mm256_load_pd(&((tDType*)p1)[i]); \
360
+ a = _mm256_sqrt_pd(a); \
361
+ _mm256_stream_pd(&((tDType*)p2)[i], a); \
362
+ } \
363
+ } \
364
+ } \
365
+ if (cnt_simd_loop != 0) { \
366
+ for (; i < n; i++) { \
367
+ ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
368
+ } \
369
+ } \
370
+ return; \
371
+ } \
372
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
373
+ for (i = 0; i < n; i++) { \
374
+ *(tDType*)p2 = m_sqrt(*(tDType*)p1); \
375
+ p1 += s1; \
376
+ p2 += s2; \
377
+ } \
378
+ return; \
379
+ } \
380
+ } \
381
+ for (i = 0; i < n; i++) { \
382
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
383
+ x = m_sqrt(x); \
384
+ SET_DATA_STRIDE(p2, s2, tDType, x); \
385
+ } \
386
+ } \
387
+ } \
388
+ } \
389
+ \
390
+ static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { \
391
+ ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; \
392
+ ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
393
+ ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout }; \
394
+ return na_ndloop(&ndf, 1, a1); \
395
+ }
396
+
397
/*
 * DEF_NARRAY_FLT_SQRT_NEON_SGL_METHOD_FUNC(tDType, tNAryClass)
 *
 * Expands to two static functions: the element loop
 * iter_<tDType>_math_s_sqrt and the wrapper <tDType>_math_s_sqrt, which hands
 * that loop to na_ndloop with one tNAryClass input and one tNAryClass output.
 * The wrapper's `mod` argument is not used.
 *
 * The loop applies m_sqrt to each element for every index/stride combination.
 * The NEON branch (float32x4_t with the *_f32 intrinsics) is taken only when
 * neither operand is indexed, both pointers pass is_aligned for
 * sizeof(tDType), both strides equal sizeof(tDType), n >= num_pack and
 * is_same_aligned2 accepts the pointer pair. In that branch:
 *   1. the first cnt elements are computed with m_sqrt;
 *   2. whole packs of num_pack elements go through vld1q_f32, vsqrtq_f32
 *      and vst1q_f32 (the same store is used whether or not p1 == p2);
 *   3. the remaining (n - cnt) % num_pack elements are computed with m_sqrt.
 *
 * cnt_simd_loop starts at -1, i.e. a non-zero size_t, so when the NEON branch
 * is skipped the trailing scalar loop still runs from i = 0 over all n
 * elements.
 *
 * NOTE(review): is_aligned, is_same_aligned2, is_aligned_step and
 * get_count_of_elements_not_aligned_to_simd_size are defined elsewhere.
 * cnt is presumably the number of leading elements before the next
 * NEON_ALIGNMENT_SIZE boundary -- confirm against their definitions.
 */
+ #define DEF_NARRAY_FLT_SQRT_NEON_SGL_METHOD_FUNC(tDType, tNAryClass) \
398
+ static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { \
399
+ size_t i = 0; \
400
+ size_t n; \
401
+ char *p1, *p2; \
402
+ ssize_t s1, s2; \
403
+ size_t *idx1, *idx2; \
404
+ tDType x; \
405
+ size_t cnt; \
406
+ size_t cnt_simd_loop = -1; \
407
+ float32x4_t a; \
408
+ size_t num_pack; \
409
+ num_pack = NEON_ALIGNMENT_SIZE / sizeof(tDType); \
410
+ \
411
+ INIT_COUNTER(lp, n); \
412
+ INIT_PTR_IDX(lp, 0, p1, s1, idx1); \
413
+ INIT_PTR_IDX(lp, 1, p2, s2, idx2); \
414
+ \
415
+ if (idx1) { \
416
+ if (idx2) { \
417
+ for (i = 0; i < n; i++) { \
418
+ GET_DATA_INDEX(p1, idx1, tDType, x); \
419
+ x = m_sqrt(x); \
420
+ SET_DATA_INDEX(p2, idx2, tDType, x); \
421
+ } \
422
+ } else { \
423
+ for (i = 0; i < n; i++) { \
424
+ GET_DATA_INDEX(p1, idx1, tDType, x); \
425
+ x = m_sqrt(x); \
426
+ SET_DATA_STRIDE(p2, s2, tDType, x); \
427
+ } \
428
+ } \
429
+ } else { \
430
+ if (idx2) { \
431
+ for (i = 0; i < n; i++) { \
432
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
433
+ x = m_sqrt(x); \
434
+ SET_DATA_INDEX(p2, idx2, tDType, x); \
435
+ } \
436
+ } else { \
437
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) { \
438
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
439
+ if ((n >= num_pack) && \
440
+ is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
441
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
442
+ &((tDType*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(tDType) \
443
+ ); \
444
+ for (i = 0; i < cnt; i++) { \
445
+ ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
446
+ } \
447
+ cnt_simd_loop = (n - i) % num_pack; \
448
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
449
+ a = vld1q_f32(&((tDType*)p1)[i]); \
450
+ a = vsqrtq_f32(a); \
451
+ vst1q_f32(&((tDType*)p2)[i], a); \
452
+ } \
453
+ } \
454
+ if (cnt_simd_loop != 0) { \
455
+ for (; i < n; i++) { \
456
+ ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
457
+ } \
458
+ } \
459
+ return; \
460
+ } \
461
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
462
+ for (i = 0; i < n; i++) { \
463
+ *(tDType*)p2 = m_sqrt(*(tDType*)p1); \
464
+ p1 += s1; \
465
+ p2 += s2; \
466
+ } \
467
+ return; \
468
+ } \
469
+ } \
470
+ for (i = 0; i < n; i++) { \
471
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
472
+ x = m_sqrt(x); \
473
+ SET_DATA_STRIDE(p2, s2, tDType, x); \
474
+ } \
475
+ } \
476
+ } \
477
+ } \
478
+ \
479
+ static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { \
480
+ ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; \
481
+ ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
482
+ ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout }; \
483
+ return na_ndloop(&ndf, 1, a1); \
484
+ }
485
+
486
/*
 * DEF_NARRAY_FLT_SQRT_NEON_DBL_METHOD_FUNC(tDType, tNAryClass)
 *
 * Expands to two static functions: the element loop
 * iter_<tDType>_math_s_sqrt and the wrapper <tDType>_math_s_sqrt, which hands
 * that loop to na_ndloop with one tNAryClass input and one tNAryClass output.
 * The wrapper's `mod` argument is not used.
 *
 * The loop applies m_sqrt to each element for every index/stride combination.
 * The NEON branch (float64x2_t with the *_f64 intrinsics) is taken only when
 * neither operand is indexed, both pointers pass is_aligned for
 * sizeof(tDType), both strides equal sizeof(tDType), n >= num_pack and
 * is_same_aligned2 accepts the pointer pair. In that branch:
 *   1. the first cnt elements are computed with m_sqrt;
 *   2. whole packs of num_pack elements go through vld1q_f64, vsqrtq_f64
 *      and vst1q_f64 (the same store is used whether or not p1 == p2);
 *   3. the remaining (n - cnt) % num_pack elements are computed with m_sqrt.
 *
 * cnt_simd_loop starts at -1, i.e. a non-zero size_t, so when the NEON branch
 * is skipped the trailing scalar loop still runs from i = 0 over all n
 * elements.
 *
 * NOTE(review): is_aligned, is_same_aligned2, is_aligned_step and
 * get_count_of_elements_not_aligned_to_simd_size are defined elsewhere.
 * cnt is presumably the number of leading elements before the next
 * NEON_ALIGNMENT_SIZE boundary -- confirm against their definitions.
 */
+ #define DEF_NARRAY_FLT_SQRT_NEON_DBL_METHOD_FUNC(tDType, tNAryClass) \
487
+ static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { \
488
+ size_t i = 0; \
489
+ size_t n; \
490
+ char *p1, *p2; \
491
+ ssize_t s1, s2; \
492
+ size_t *idx1, *idx2; \
493
+ tDType x; \
494
+ size_t cnt; \
495
+ size_t cnt_simd_loop = -1; \
496
+ float64x2_t a; \
497
+ size_t num_pack; \
498
+ num_pack = NEON_ALIGNMENT_SIZE / sizeof(tDType); \
499
+ \
500
+ INIT_COUNTER(lp, n); \
501
+ INIT_PTR_IDX(lp, 0, p1, s1, idx1); \
502
+ INIT_PTR_IDX(lp, 1, p2, s2, idx2); \
503
+ \
504
+ if (idx1) { \
505
+ if (idx2) { \
506
+ for (i = 0; i < n; i++) { \
507
+ GET_DATA_INDEX(p1, idx1, tDType, x); \
508
+ x = m_sqrt(x); \
509
+ SET_DATA_INDEX(p2, idx2, tDType, x); \
510
+ } \
511
+ } else { \
512
+ for (i = 0; i < n; i++) { \
513
+ GET_DATA_INDEX(p1, idx1, tDType, x); \
514
+ x = m_sqrt(x); \
515
+ SET_DATA_STRIDE(p2, s2, tDType, x); \
516
+ } \
517
+ } \
518
+ } else { \
519
+ if (idx2) { \
520
+ for (i = 0; i < n; i++) { \
521
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
522
+ x = m_sqrt(x); \
523
+ SET_DATA_INDEX(p2, idx2, tDType, x); \
524
+ } \
525
+ } else { \
526
+ if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) { \
527
+ if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { \
528
+ if ((n >= num_pack) && \
529
+ is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
530
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
531
+ &((tDType*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(tDType) \
532
+ ); \
533
+ for (i = 0; i < cnt; i++) { \
534
+ ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
535
+ } \
536
+ cnt_simd_loop = (n - i) % num_pack; \
537
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
538
+ a = vld1q_f64(&((tDType*)p1)[i]); \
539
+ a = vsqrtq_f64(a); \
540
+ vst1q_f64(&((tDType*)p2)[i], a); \
541
+ } \
542
+ } \
543
+ if (cnt_simd_loop != 0) { \
544
+ for (; i < n; i++) { \
545
+ ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]); \
546
+ } \
547
+ } \
548
+ return; \
549
+ } \
550
+ if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) { \
551
+ for (i = 0; i < n; i++) { \
552
+ *(tDType*)p2 = m_sqrt(*(tDType*)p1); \
553
+ p1 += s1; \
554
+ p2 += s2; \
555
+ } \
556
+ return; \
557
+ } \
558
+ } \
559
+ for (i = 0; i < n; i++) { \
560
+ GET_DATA_STRIDE(p1, s1, tDType, x); \
561
+ x = m_sqrt(x); \
562
+ SET_DATA_STRIDE(p2, s2, tDType, x); \
563
+ } \
564
+ } \
565
+ } \
566
+ } \
567
+ \
568
+ static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { \
569
+ ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; \
570
+ ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; \
571
+ ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout }; \
572
+ return na_ndloop(&ndf, 1, a1); \
573
+ }
574
+
203
575
  #endif /* NUMO_NARRAY_MH_MATH_SQRT_H */