numo-narray-alt 0.10.5 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,4 +75,24 @@
75
75
  DEF_BINARY_SELF_FUNC(add, dfloat, numo_cDFloat) \
76
76
  DEF_BINARY_FUNC(add, '+', dfloat, numo_cDFloat)
77
77
 
78
/*
 * DEF_NARRAY_SFLT_ADD_AVX_METHOD_FUNC()
 *
 * Expands, in order, to:
 *   - DEF_BINARY_SFLT_AVX_ITER_FUNC(add, _mm256_add_ps), which defines the static
 *     iterator iter_sfloat_add using the AVX single-precision add intrinsic;
 *   - DEF_BINARY_SELF_FUNC and DEF_BINARY_FUNC for (add, sfloat, numo_cSFloat).
 * NOTE(review): DEF_BINARY_SELF_FUNC / DEF_BINARY_FUNC are defined outside this view;
 * presumably they emit the method wrappers that drive the iterator -- confirm.
 */
+ #define DEF_NARRAY_SFLT_ADD_AVX_METHOD_FUNC() \
79
+ DEF_BINARY_SFLT_AVX_ITER_FUNC(add, _mm256_add_ps) \
80
+ DEF_BINARY_SELF_FUNC(add, sfloat, numo_cSFloat) \
81
+ DEF_BINARY_FUNC(add, '+', sfloat, numo_cSFloat)
82
+
83
/*
 * DEF_NARRAY_DFLT_ADD_AVX_METHOD_FUNC()
 *
 * Expands, in order, to:
 *   - DEF_BINARY_DFLT_AVX_ITER_FUNC(add, _mm256_add_pd), which defines the static
 *     iterator iter_dfloat_add using the AVX double-precision add intrinsic;
 *   - DEF_BINARY_SELF_FUNC and DEF_BINARY_FUNC for (add, dfloat, numo_cDFloat).
 * NOTE(review): DEF_BINARY_SELF_FUNC / DEF_BINARY_FUNC are defined outside this view;
 * presumably they emit the method wrappers that drive the iterator -- confirm.
 */
+ #define DEF_NARRAY_DFLT_ADD_AVX_METHOD_FUNC() \
84
+ DEF_BINARY_DFLT_AVX_ITER_FUNC(add, _mm256_add_pd) \
85
+ DEF_BINARY_SELF_FUNC(add, dfloat, numo_cDFloat) \
86
+ DEF_BINARY_FUNC(add, '+', dfloat, numo_cDFloat)
87
+
88
/*
 * DEF_NARRAY_SFLT_ADD_NEON_METHOD_FUNC()
 *
 * Expands, in order, to:
 *   - DEF_BINARY_SFLT_NEON_ITER_FUNC(add, vaddq_f32), which defines the static
 *     iterator iter_sfloat_add using the NEON float32x4_t add intrinsic;
 *   - DEF_BINARY_SELF_FUNC and DEF_BINARY_FUNC for (add, sfloat, numo_cSFloat).
 * NOTE(review): DEF_BINARY_SELF_FUNC / DEF_BINARY_FUNC are defined outside this view;
 * presumably they emit the method wrappers that drive the iterator -- confirm.
 */
+ #define DEF_NARRAY_SFLT_ADD_NEON_METHOD_FUNC() \
89
+ DEF_BINARY_SFLT_NEON_ITER_FUNC(add, vaddq_f32) \
90
+ DEF_BINARY_SELF_FUNC(add, sfloat, numo_cSFloat) \
91
+ DEF_BINARY_FUNC(add, '+', sfloat, numo_cSFloat)
92
+
93
/*
 * DEF_NARRAY_DFLT_ADD_NEON_METHOD_FUNC()
 *
 * Expands, in order, to:
 *   - DEF_BINARY_DFLT_NEON_ITER_FUNC(add, vaddq_f64), which defines the static
 *     iterator iter_dfloat_add using the NEON float64x2_t add intrinsic;
 *   - DEF_BINARY_SELF_FUNC and DEF_BINARY_FUNC for (add, dfloat, numo_cDFloat).
 * NOTE(review): DEF_BINARY_SELF_FUNC / DEF_BINARY_FUNC are defined outside this view;
 * presumably they emit the method wrappers that drive the iterator -- confirm.
 */
+ #define DEF_NARRAY_DFLT_ADD_NEON_METHOD_FUNC() \
94
+ DEF_BINARY_DFLT_NEON_ITER_FUNC(add, vaddq_f64) \
95
+ DEF_BINARY_SELF_FUNC(add, dfloat, numo_cDFloat) \
96
+ DEF_BINARY_FUNC(add, '+', dfloat, numo_cDFloat)
97
+
78
98
  #endif /* NUMO_NARRAY_MH_OP_ADD_H */
@@ -420,4 +420,546 @@
420
420
  } \
421
421
  }
422
422
 
423
/*
 * DEF_BINARY_SFLT_AVX_ITER_FUNC(fOpFunc, fSimdOp)
 *
 * Defines `static void iter_sfloat_<fOpFunc>(na_loop_t* const lp)`: an element-wise
 * binary iterator over sfloat operands that uses 256-bit AVX intrinsics when it can.
 *   fOpFunc - operation suffix; scalar code calls m_<fOpFunc>(x, y).
 *   fSimdOp - intrinsic applied to two __m256 values (e.g. _mm256_add_ps).
 *
 * Paths, tried in order:
 *   1. p1, p2, p3 pass is_aligned(.., sizeof(sfloat)) and s1 == s2 == s3 == sizeof(sfloat):
 *      scalar head of cnt elements, SIMD body, scalar tail.
 *   2. all strides pass is_aligned_step() and s2 == 0: the second operand is read as a
 *      single value; with s1 == s3 == sizeof(sfloat) it is broadcast into a vector and
 *      the head / SIMD body / tail scheme is used, otherwise a strided scalar loop
 *      advances only p1 and p3.
 *   3. all strides pass is_aligned_step() and s2 != 0: strided scalar loop.
 *   4. anything else: GET_DATA_STRIDE / SET_DATA_STRIDE loop.
 *
 * NOTE(review): n, p1..p3 and s1..s3 are used but not declared in this macro; presumably
 * ITER_BINARY_INIT_VARS() declares them from lp -- confirm. is_aligned, is_aligned_step,
 * is_same_aligned2/3, get_count_of_elements_not_aligned_to_simd_size, AVX_ALIGNMENT_SIZE
 * and m_<fOpFunc> are defined outside this view.
 */
+ #define DEF_BINARY_SFLT_AVX_ITER_FUNC(fOpFunc, fSimdOp) \
424
+ static void iter_sfloat_##fOpFunc(na_loop_t* const lp) { \
425
+ size_t i = 0; \
426
+ ITER_BINARY_INIT_VARS() \
427
+ \
428
+ size_t cnt; \
429
+ size_t cnt_simd_loop = -1; /* wraps to the maximum size_t: non-zero sentinel meaning the SIMD body did not run */ \
430
+ __m256 a; \
431
+ __m256 b; \
432
+ size_t num_pack; \
433
+ num_pack = AVX_ALIGNMENT_SIZE / sizeof(sfloat); /* elements handled per SIMD iteration */ \
434
+ \
435
+ if (is_aligned(p1, sizeof(sfloat)) && is_aligned(p2, sizeof(sfloat)) && \
436
+ is_aligned(p3, sizeof(sfloat))) { \
437
+ if (s1 == sizeof(sfloat) && s2 == sizeof(sfloat) && s3 == sizeof(sfloat)) { /* path 1: all operands contiguous */ \
438
+ if ((n >= num_pack) && \
439
+ is_same_aligned3( \
440
+ &((sfloat*)p1)[i], &((sfloat*)p2)[i], &((sfloat*)p3)[i], AVX_ALIGNMENT_SIZE \
441
+ )) { \
442
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
443
+ &((sfloat*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(sfloat) \
444
+ ); \
445
+ if (p1 == p3) { /* scalar head; presumably brings i to the first SIMD-aligned element -- confirm helper */ \
446
+ for (i = 0; i < cnt; i++) { \
447
+ ((sfloat*)p1)[i] = m_##fOpFunc(((sfloat*)p1)[i], ((sfloat*)p2)[i]); \
448
+ } \
449
+ } else { \
450
+ for (i = 0; i < cnt; i++) { \
451
+ ((sfloat*)p3)[i] = m_##fOpFunc(((sfloat*)p1)[i], ((sfloat*)p2)[i]); \
452
+ } \
453
+ } \
454
+ cnt_simd_loop = (n - i) % num_pack; /* elements left over after the last full vector */ \
455
+ if (p1 == p3) { \
456
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
457
+ a = _mm256_load_ps(&((sfloat*)p1)[i]); \
458
+ b = _mm256_load_ps(&((sfloat*)p2)[i]); \
459
+ a = fSimdOp(a, b); \
460
+ _mm256_store_ps(&((sfloat*)p1)[i], a); \
461
+ } \
462
+ } else { \
463
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
464
+ a = _mm256_load_ps(&((sfloat*)p1)[i]); \
465
+ b = _mm256_load_ps(&((sfloat*)p2)[i]); \
466
+ a = fSimdOp(a, b); \
467
+ _mm256_stream_ps(&((sfloat*)p3)[i], a); /* non-temporal store, used only when the output is a separate buffer */ \
468
+ } \
469
+ } \
470
+ } \
471
+ if (cnt_simd_loop != 0) { /* scalar tail, or the whole range when the SIMD body was skipped */ \
472
+ if (p1 == p3) { \
473
+ for (; i < n; i++) { \
474
+ ((sfloat*)p1)[i] = m_##fOpFunc(((sfloat*)p1)[i], ((sfloat*)p2)[i]); \
475
+ } \
476
+ } else { \
477
+ for (; i < n; i++) { \
478
+ ((sfloat*)p3)[i] = m_##fOpFunc(((sfloat*)p1)[i], ((sfloat*)p2)[i]); \
479
+ } \
480
+ } \
481
+ } \
482
+ return; \
483
+ } \
484
+ if (is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat)) && \
485
+ is_aligned_step(s3, sizeof(sfloat))) { \
486
+ if (s2 == 0) { /* path 2: second operand does not advance, so it is a single value */ \
487
+ if (s1 == sizeof(sfloat) && s3 == sizeof(sfloat)) { \
488
+ b = _mm256_broadcast_ss(&((sfloat*)p2)[0]); /* replicate the single value across the vector */ \
489
+ if ((n >= num_pack) && \
490
+ is_same_aligned2(&((sfloat*)p1)[i], &((sfloat*)p3)[i], AVX_ALIGNMENT_SIZE)) { \
491
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
492
+ &((sfloat*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(sfloat) \
493
+ ); \
494
+ if (p1 == p3) { \
495
+ for (i = 0; i < cnt; i++) { \
496
+ ((sfloat*)p1)[i] = m_##fOpFunc(((sfloat*)p1)[i], *(sfloat*)p2); \
497
+ } \
498
+ } else { \
499
+ for (i = 0; i < cnt; i++) { \
500
+ ((sfloat*)p3)[i] = m_##fOpFunc(((sfloat*)p1)[i], *(sfloat*)p2); \
501
+ } \
502
+ } \
503
+ cnt_simd_loop = (n - i) % num_pack; \
504
+ if (p1 == p3) { \
505
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
506
+ a = _mm256_load_ps(&((sfloat*)p1)[i]); \
507
+ a = fSimdOp(a, b); \
508
+ _mm256_store_ps(&((sfloat*)p1)[i], a); \
509
+ } \
510
+ } else { \
511
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
512
+ a = _mm256_load_ps(&((sfloat*)p1)[i]); \
513
+ a = fSimdOp(a, b); \
514
+ _mm256_stream_ps(&((sfloat*)p3)[i], a); \
515
+ } \
516
+ } \
517
+ } \
518
+ if (cnt_simd_loop != 0) { \
519
+ if (p1 == p3) { \
520
+ for (; i < n; i++) { \
521
+ ((sfloat*)p1)[i] = m_##fOpFunc(((sfloat*)p1)[i], *(sfloat*)p2); \
522
+ } \
523
+ } else { \
524
+ for (; i < n; i++) { \
525
+ ((sfloat*)p3)[i] = m_##fOpFunc(((sfloat*)p1)[i], *(sfloat*)p2); \
526
+ } \
527
+ } \
528
+ } \
529
+ } else { \
530
+ for (i = 0; i < n; i++) { /* strided scalar loop; p2 stays fixed because s2 == 0 */ \
531
+ *(sfloat*)p3 = m_##fOpFunc(*(sfloat*)p1, *(sfloat*)p2); \
532
+ p1 += s1; \
533
+ p3 += s3; \
534
+ } \
535
+ } \
536
+ } else { /* path 3: element-aligned strides, second operand advances */ \
537
+ if (p1 == p3) { \
538
+ for (i = 0; i < n; i++) { \
539
+ *(sfloat*)p1 = m_##fOpFunc(*(sfloat*)p1, *(sfloat*)p2); \
540
+ p1 += s1; \
541
+ p2 += s2; \
542
+ } \
543
+ } else { \
544
+ for (i = 0; i < n; i++) { \
545
+ *(sfloat*)p3 = m_##fOpFunc(*(sfloat*)p1, *(sfloat*)p2); \
546
+ p1 += s1; \
547
+ p2 += s2; \
548
+ p3 += s3; \
549
+ } \
550
+ } \
551
+ } \
552
+ return; \
553
+ } \
554
+ } \
555
+ \
556
+ for (i = 0; i < n; i++) { /* path 4: reached when a pointer or stride failed the alignment checks above */ \
557
+ sfloat x; \
558
+ sfloat y; \
559
+ sfloat z; \
560
+ GET_DATA_STRIDE(p1, s1, sfloat, x); \
561
+ GET_DATA_STRIDE(p2, s2, sfloat, y); \
562
+ z = m_##fOpFunc(x, y); \
563
+ SET_DATA_STRIDE(p3, s3, sfloat, z); \
564
+ } \
565
+ }
566
+
567
/*
 * DEF_BINARY_DFLT_AVX_ITER_FUNC(fOpFunc, fSimdOp)
 *
 * Defines `static void iter_dfloat_<fOpFunc>(na_loop_t* const lp)`: an element-wise
 * binary iterator over dfloat operands that uses 256-bit AVX intrinsics when it can.
 *   fOpFunc - operation suffix; scalar code calls m_<fOpFunc>(x, y).
 *   fSimdOp - intrinsic applied to two __m256d values (e.g. _mm256_add_pd).
 *
 * Paths, tried in order:
 *   1. p1, p2, p3 pass is_aligned(.., sizeof(dfloat)) and s1 == s2 == s3 == sizeof(dfloat):
 *      scalar head of cnt elements, SIMD body, scalar tail.
 *   2. all strides pass is_aligned_step() and s2 == 0: the second operand is read as a
 *      single value; with s1 == s3 == sizeof(dfloat) it is broadcast into a vector and
 *      the head / SIMD body / tail scheme is used, otherwise a strided scalar loop
 *      advances only p1 and p3.
 *   3. all strides pass is_aligned_step() and s2 != 0: strided scalar loop.
 *   4. anything else: GET_DATA_STRIDE / SET_DATA_STRIDE loop.
 *
 * NOTE(review): n, p1..p3 and s1..s3 are used but not declared in this macro; presumably
 * ITER_BINARY_INIT_VARS() declares them from lp -- confirm. is_aligned, is_aligned_step,
 * is_same_aligned2/3, get_count_of_elements_not_aligned_to_simd_size, AVX_ALIGNMENT_SIZE
 * and m_<fOpFunc> are defined outside this view.
 */
+ #define DEF_BINARY_DFLT_AVX_ITER_FUNC(fOpFunc, fSimdOp) \
568
+ static void iter_dfloat_##fOpFunc(na_loop_t* const lp) { \
569
+ size_t i = 0; \
570
+ ITER_BINARY_INIT_VARS() \
571
+ \
572
+ size_t cnt; \
573
+ size_t cnt_simd_loop = -1; /* wraps to the maximum size_t: non-zero sentinel meaning the SIMD body did not run */ \
574
+ __m256d a; \
575
+ __m256d b; \
576
+ size_t num_pack; \
577
+ num_pack = AVX_ALIGNMENT_SIZE / sizeof(dfloat); /* elements handled per SIMD iteration */ \
578
+ \
579
+ if (is_aligned(p1, sizeof(dfloat)) && is_aligned(p2, sizeof(dfloat)) && \
580
+ is_aligned(p3, sizeof(dfloat))) { \
581
+ if (s1 == sizeof(dfloat) && s2 == sizeof(dfloat) && s3 == sizeof(dfloat)) { /* path 1: all operands contiguous */ \
582
+ if ((n >= num_pack) && \
583
+ is_same_aligned3( \
584
+ &((dfloat*)p1)[i], &((dfloat*)p2)[i], &((dfloat*)p3)[i], AVX_ALIGNMENT_SIZE \
585
+ )) { \
586
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
587
+ &((dfloat*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(dfloat) \
588
+ ); \
589
+ if (p1 == p3) { /* scalar head; presumably brings i to the first SIMD-aligned element -- confirm helper */ \
590
+ for (i = 0; i < cnt; i++) { \
591
+ ((dfloat*)p1)[i] = m_##fOpFunc(((dfloat*)p1)[i], ((dfloat*)p2)[i]); \
592
+ } \
593
+ } else { \
594
+ for (i = 0; i < cnt; i++) { \
595
+ ((dfloat*)p3)[i] = m_##fOpFunc(((dfloat*)p1)[i], ((dfloat*)p2)[i]); \
596
+ } \
597
+ } \
598
+ cnt_simd_loop = (n - i) % num_pack; /* elements left over after the last full vector */ \
599
+ if (p1 == p3) { \
600
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
601
+ a = _mm256_load_pd(&((dfloat*)p1)[i]); \
602
+ b = _mm256_load_pd(&((dfloat*)p2)[i]); \
603
+ a = fSimdOp(a, b); \
604
+ _mm256_store_pd(&((dfloat*)p1)[i], a); \
605
+ } \
606
+ } else { \
607
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
608
+ a = _mm256_load_pd(&((dfloat*)p1)[i]); \
609
+ b = _mm256_load_pd(&((dfloat*)p2)[i]); \
610
+ a = fSimdOp(a, b); \
611
+ _mm256_stream_pd(&((dfloat*)p3)[i], a); /* non-temporal store, used only when the output is a separate buffer */ \
612
+ } \
613
+ } \
614
+ } \
615
+ if (cnt_simd_loop != 0) { /* scalar tail, or the whole range when the SIMD body was skipped */ \
616
+ if (p1 == p3) { \
617
+ for (; i < n; i++) { \
618
+ ((dfloat*)p1)[i] = m_##fOpFunc(((dfloat*)p1)[i], ((dfloat*)p2)[i]); \
619
+ } \
620
+ } else { \
621
+ for (; i < n; i++) { \
622
+ ((dfloat*)p3)[i] = m_##fOpFunc(((dfloat*)p1)[i], ((dfloat*)p2)[i]); \
623
+ } \
624
+ } \
625
+ } \
626
+ return; \
627
+ } \
628
+ if (is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat)) && \
629
+ is_aligned_step(s3, sizeof(dfloat))) { \
630
+ if (s2 == 0) { /* path 2: second operand does not advance, so it is a single value */ \
631
+ if (s1 == sizeof(dfloat) && s3 == sizeof(dfloat)) { \
632
+ b = _mm256_broadcast_sd(&((dfloat*)p2)[0]); /* replicate the single value across the vector */ \
633
+ if ((n >= num_pack) && \
634
+ is_same_aligned2(&((dfloat*)p1)[i], &((dfloat*)p3)[i], AVX_ALIGNMENT_SIZE)) { \
635
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
636
+ &((dfloat*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(dfloat) \
637
+ ); \
638
+ if (p1 == p3) { \
639
+ for (; i < cnt; i++) { /* no reset needed: i is still 0 from its initializer */ \
640
+ ((dfloat*)p1)[i] = m_##fOpFunc(((dfloat*)p1)[i], *(dfloat*)p2); \
641
+ } \
642
+ } else { \
643
+ for (; i < cnt; i++) { \
644
+ ((dfloat*)p3)[i] = m_##fOpFunc(((dfloat*)p1)[i], *(dfloat*)p2); \
645
+ } \
646
+ } \
647
+ cnt_simd_loop = (n - i) % num_pack; \
648
+ if (p1 == p3) { \
649
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
650
+ a = _mm256_load_pd(&((dfloat*)p1)[i]); \
651
+ a = fSimdOp(a, b); \
652
+ _mm256_store_pd(&((dfloat*)p1)[i], a); \
653
+ } \
654
+ } else { \
655
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
656
+ a = _mm256_load_pd(&((dfloat*)p1)[i]); \
657
+ a = fSimdOp(a, b); \
658
+ _mm256_stream_pd(&((dfloat*)p3)[i], a); \
659
+ } \
660
+ } \
661
+ } \
662
+ if (cnt_simd_loop != 0) { \
663
+ if (p1 == p3) { \
664
+ for (; i < n; i++) { \
665
+ ((dfloat*)p1)[i] = m_##fOpFunc(((dfloat*)p1)[i], *(dfloat*)p2); \
666
+ } \
667
+ } else { \
668
+ for (; i < n; i++) { \
669
+ ((dfloat*)p3)[i] = m_##fOpFunc(((dfloat*)p1)[i], *(dfloat*)p2); \
670
+ } \
671
+ } \
672
+ } \
673
+ } else { \
674
+ for (i = 0; i < n; i++) { /* strided scalar loop; p2 stays fixed because s2 == 0 */ \
675
+ *(dfloat*)p3 = m_##fOpFunc(*(dfloat*)p1, *(dfloat*)p2); \
676
+ p1 += s1; \
677
+ p3 += s3; \
678
+ } \
679
+ } \
680
+ } else { /* path 3: element-aligned strides, second operand advances */ \
681
+ if (p1 == p3) { \
682
+ for (i = 0; i < n; i++) { \
683
+ *(dfloat*)p1 = m_##fOpFunc(*(dfloat*)p1, *(dfloat*)p2); \
684
+ p1 += s1; \
685
+ p2 += s2; \
686
+ } \
687
+ } else { \
688
+ for (i = 0; i < n; i++) { \
689
+ *(dfloat*)p3 = m_##fOpFunc(*(dfloat*)p1, *(dfloat*)p2); \
690
+ p1 += s1; \
691
+ p2 += s2; \
692
+ p3 += s3; \
693
+ } \
694
+ } \
695
+ } \
696
+ return; \
697
+ } \
698
+ } \
699
+ \
700
+ for (i = 0; i < n; i++) { /* path 4: reached when a pointer or stride failed the alignment checks above */ \
701
+ dfloat x; \
702
+ dfloat y; \
703
+ dfloat z; \
704
+ GET_DATA_STRIDE(p1, s1, dfloat, x); \
705
+ GET_DATA_STRIDE(p2, s2, dfloat, y); \
706
+ z = m_##fOpFunc(x, y); \
707
+ SET_DATA_STRIDE(p3, s3, dfloat, z); \
708
+ } \
709
+ }
710
+
711
/*
 * DEF_BINARY_SFLT_NEON_ITER_FUNC(fOpFunc, fSimdOp)
 *
 * Defines `static void iter_sfloat_<fOpFunc>(na_loop_t* const lp)`: an element-wise
 * binary iterator over sfloat operands that uses NEON float32x4_t intrinsics when it can.
 *   fOpFunc - operation suffix; scalar code calls m_<fOpFunc>(x, y).
 *   fSimdOp - intrinsic applied to two float32x4_t values (e.g. vaddq_f32).
 *
 * Paths, tried in order:
 *   1. p1, p2, p3 pass is_aligned(.., sizeof(sfloat)) and s1 == s2 == s3 == sizeof(sfloat):
 *      scalar head of cnt elements, SIMD body, scalar tail.
 *   2. all strides pass is_aligned_step() and s2 == 0: the second operand is read as a
 *      single value; with s1 == s3 == sizeof(sfloat) it is duplicated into a vector and
 *      the head / SIMD body / tail scheme is used, otherwise a strided scalar loop
 *      advances only p1 and p3.
 *   3. all strides pass is_aligned_step() and s2 != 0: strided scalar loop.
 *   4. anything else: GET_DATA_STRIDE / SET_DATA_STRIDE loop.
 *
 * NOTE(review): n, p1..p3 and s1..s3 are used but not declared in this macro; presumably
 * ITER_BINARY_INIT_VARS() declares them from lp -- confirm. is_aligned, is_aligned_step,
 * is_same_aligned2/3, get_count_of_elements_not_aligned_to_simd_size, NEON_ALIGNMENT_SIZE
 * and m_<fOpFunc> are defined outside this view.
 */
+ #define DEF_BINARY_SFLT_NEON_ITER_FUNC(fOpFunc, fSimdOp) \
712
+ static void iter_sfloat_##fOpFunc(na_loop_t* const lp) { \
713
+ size_t i = 0; \
714
+ ITER_BINARY_INIT_VARS() \
715
+ \
716
+ size_t cnt; \
717
+ size_t cnt_simd_loop = -1; /* wraps to the maximum size_t: non-zero sentinel meaning the SIMD body did not run */ \
718
+ float32x4_t a; \
719
+ float32x4_t b; \
720
+ size_t num_pack; \
721
+ num_pack = NEON_ALIGNMENT_SIZE / sizeof(sfloat); /* elements handled per SIMD iteration */ \
722
+ \
723
+ if (is_aligned(p1, sizeof(sfloat)) && is_aligned(p2, sizeof(sfloat)) && \
724
+ is_aligned(p3, sizeof(sfloat))) { \
725
+ if (s1 == sizeof(sfloat) && s2 == sizeof(sfloat) && s3 == sizeof(sfloat)) { /* path 1: all operands contiguous */ \
726
+ if ((n >= num_pack) && \
727
+ is_same_aligned3( \
728
+ &((sfloat*)p1)[i], &((sfloat*)p2)[i], &((sfloat*)p3)[i], NEON_ALIGNMENT_SIZE \
729
+ )) { \
730
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
731
+ &((sfloat*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(sfloat) \
732
+ ); \
733
+ if (p1 == p3) { /* scalar head; presumably brings i to the first SIMD-aligned element -- confirm helper */ \
734
+ for (i = 0; i < cnt; i++) { \
735
+ ((sfloat*)p1)[i] = m_##fOpFunc(((sfloat*)p1)[i], ((sfloat*)p2)[i]); \
736
+ } \
737
+ } else { \
738
+ for (i = 0; i < cnt; i++) { \
739
+ ((sfloat*)p3)[i] = m_##fOpFunc(((sfloat*)p1)[i], ((sfloat*)p2)[i]); \
740
+ } \
741
+ } \
742
+ cnt_simd_loop = (n - i) % num_pack; /* elements left over after the last full vector */ \
743
+ for (; i < n - cnt_simd_loop; i += num_pack) { /* one loop for both cases: the store always targets p3 */ \
744
+ a = vld1q_f32(&((sfloat*)p1)[i]); \
745
+ b = vld1q_f32(&((sfloat*)p2)[i]); \
746
+ a = fSimdOp(a, b); \
747
+ vst1q_f32(&((sfloat*)p3)[i], a); \
748
+ } \
749
+ } \
750
+ if (cnt_simd_loop != 0) { /* scalar tail, or the whole range when the SIMD body was skipped */ \
751
+ if (p1 == p3) { \
752
+ for (; i < n; i++) { \
753
+ ((sfloat*)p1)[i] = m_##fOpFunc(((sfloat*)p1)[i], ((sfloat*)p2)[i]); \
754
+ } \
755
+ } else { \
756
+ for (; i < n; i++) { \
757
+ ((sfloat*)p3)[i] = m_##fOpFunc(((sfloat*)p1)[i], ((sfloat*)p2)[i]); \
758
+ } \
759
+ } \
760
+ } \
761
+ return; \
762
+ } \
763
+ if (is_aligned_step(s1, sizeof(sfloat)) && is_aligned_step(s2, sizeof(sfloat)) && \
764
+ is_aligned_step(s3, sizeof(sfloat))) { \
765
+ if (s2 == 0) { /* path 2: second operand does not advance, so it is a single value */ \
766
+ if (s1 == sizeof(sfloat) && s3 == sizeof(sfloat)) { \
767
+ b = vld1q_dup_f32(&((sfloat*)p2)[0]); /* replicate the single value across the vector */ \
768
+ if ((n >= num_pack) && \
769
+ is_same_aligned2(&((sfloat*)p1)[i], &((sfloat*)p3)[i], NEON_ALIGNMENT_SIZE)) { \
770
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
771
+ &((sfloat*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(sfloat) \
772
+ ); \
773
+ if (p1 == p3) { \
774
+ for (i = 0; i < cnt; i++) { \
775
+ ((sfloat*)p1)[i] = m_##fOpFunc(((sfloat*)p1)[i], *(sfloat*)p2); \
776
+ } \
777
+ } else { \
778
+ for (i = 0; i < cnt; i++) { \
779
+ ((sfloat*)p3)[i] = m_##fOpFunc(((sfloat*)p1)[i], *(sfloat*)p2); \
780
+ } \
781
+ } \
782
+ cnt_simd_loop = (n - i) % num_pack; \
783
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
784
+ a = vld1q_f32(&((sfloat*)p1)[i]); \
785
+ a = fSimdOp(a, b); \
786
+ vst1q_f32(&((sfloat*)p3)[i], a); \
787
+ } \
788
+ } \
789
+ if (cnt_simd_loop != 0) { \
790
+ if (p1 == p3) { \
791
+ for (; i < n; i++) { \
792
+ ((sfloat*)p1)[i] = m_##fOpFunc(((sfloat*)p1)[i], *(sfloat*)p2); \
793
+ } \
794
+ } else { \
795
+ for (; i < n; i++) { \
796
+ ((sfloat*)p3)[i] = m_##fOpFunc(((sfloat*)p1)[i], *(sfloat*)p2); \
797
+ } \
798
+ } \
799
+ } \
800
+ } else { \
801
+ for (i = 0; i < n; i++) { /* strided scalar loop; p2 stays fixed because s2 == 0 */ \
802
+ *(sfloat*)p3 = m_##fOpFunc(*(sfloat*)p1, *(sfloat*)p2); \
803
+ p1 += s1; \
804
+ p3 += s3; \
805
+ } \
806
+ } \
807
+ } else { /* path 3: element-aligned strides, second operand advances */ \
808
+ if (p1 == p3) { \
809
+ for (i = 0; i < n; i++) { \
810
+ *(sfloat*)p1 = m_##fOpFunc(*(sfloat*)p1, *(sfloat*)p2); \
811
+ p1 += s1; \
812
+ p2 += s2; \
813
+ } \
814
+ } else { \
815
+ for (i = 0; i < n; i++) { \
816
+ *(sfloat*)p3 = m_##fOpFunc(*(sfloat*)p1, *(sfloat*)p2); \
817
+ p1 += s1; \
818
+ p2 += s2; \
819
+ p3 += s3; \
820
+ } \
821
+ } \
822
+ } \
823
+ return; \
824
+ } \
825
+ } \
826
+ \
827
+ for (i = 0; i < n; i++) { /* path 4: reached when a pointer or stride failed the alignment checks above */ \
828
+ sfloat x; \
829
+ sfloat y; \
830
+ sfloat z; \
831
+ GET_DATA_STRIDE(p1, s1, sfloat, x); \
832
+ GET_DATA_STRIDE(p2, s2, sfloat, y); \
833
+ z = m_##fOpFunc(x, y); \
834
+ SET_DATA_STRIDE(p3, s3, sfloat, z); \
835
+ } \
836
+ }
837
+
838
/*
 * DEF_BINARY_DFLT_NEON_ITER_FUNC(fOpFunc, fSimdOp)
 *
 * Defines `static void iter_dfloat_<fOpFunc>(na_loop_t* const lp)`: an element-wise
 * binary iterator over dfloat operands that uses NEON float64x2_t intrinsics when it can.
 *   fOpFunc - operation suffix; scalar code calls m_<fOpFunc>(x, y).
 *   fSimdOp - intrinsic applied to two float64x2_t values (e.g. vaddq_f64).
 *
 * Paths, tried in order:
 *   1. p1, p2, p3 pass is_aligned(.., sizeof(dfloat)) and s1 == s2 == s3 == sizeof(dfloat):
 *      scalar head of cnt elements, SIMD body, scalar tail.
 *   2. all strides pass is_aligned_step() and s2 == 0: the second operand is read as a
 *      single value; with s1 == s3 == sizeof(dfloat) it is duplicated into a vector and
 *      the head / SIMD body / tail scheme is used, otherwise a strided scalar loop
 *      advances only p1 and p3.
 *   3. all strides pass is_aligned_step() and s2 != 0: strided scalar loop.
 *   4. anything else: GET_DATA_STRIDE / SET_DATA_STRIDE loop.
 *
 * NOTE(review): n, p1..p3 and s1..s3 are used but not declared in this macro; presumably
 * ITER_BINARY_INIT_VARS() declares them from lp -- confirm. is_aligned, is_aligned_step,
 * is_same_aligned2/3, get_count_of_elements_not_aligned_to_simd_size, NEON_ALIGNMENT_SIZE
 * and m_<fOpFunc> are defined outside this view.
 */
+ #define DEF_BINARY_DFLT_NEON_ITER_FUNC(fOpFunc, fSimdOp) \
839
+ static void iter_dfloat_##fOpFunc(na_loop_t* const lp) { \
840
+ size_t i = 0; \
841
+ ITER_BINARY_INIT_VARS() \
842
+ \
843
+ size_t cnt; \
844
+ size_t cnt_simd_loop = -1; /* wraps to the maximum size_t: non-zero sentinel meaning the SIMD body did not run */ \
845
+ float64x2_t a; \
846
+ float64x2_t b; \
847
+ size_t num_pack; \
848
+ num_pack = NEON_ALIGNMENT_SIZE / sizeof(dfloat); /* elements handled per SIMD iteration */ \
849
+ \
850
+ if (is_aligned(p1, sizeof(dfloat)) && is_aligned(p2, sizeof(dfloat)) && \
851
+ is_aligned(p3, sizeof(dfloat))) { \
852
+ if (s1 == sizeof(dfloat) && s2 == sizeof(dfloat) && s3 == sizeof(dfloat)) { /* path 1: all operands contiguous */ \
853
+ if ((n >= num_pack) && \
854
+ is_same_aligned3( \
855
+ &((dfloat*)p1)[i], &((dfloat*)p2)[i], &((dfloat*)p3)[i], NEON_ALIGNMENT_SIZE \
856
+ )) { \
857
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
858
+ &((dfloat*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(dfloat) \
859
+ ); \
860
+ if (p1 == p3) { /* scalar head; presumably brings i to the first SIMD-aligned element -- confirm helper */ \
861
+ for (i = 0; i < cnt; i++) { \
862
+ ((dfloat*)p1)[i] = m_##fOpFunc(((dfloat*)p1)[i], ((dfloat*)p2)[i]); \
863
+ } \
864
+ } else { \
865
+ for (i = 0; i < cnt; i++) { \
866
+ ((dfloat*)p3)[i] = m_##fOpFunc(((dfloat*)p1)[i], ((dfloat*)p2)[i]); \
867
+ } \
868
+ } \
869
+ cnt_simd_loop = (n - i) % num_pack; /* elements left over after the last full vector */ \
870
+ for (; i < n - cnt_simd_loop; i += num_pack) { /* one loop for both cases: the store always targets p3 */ \
871
+ a = vld1q_f64(&((dfloat*)p1)[i]); \
872
+ b = vld1q_f64(&((dfloat*)p2)[i]); \
873
+ a = fSimdOp(a, b); \
874
+ vst1q_f64(&((dfloat*)p3)[i], a); \
875
+ } \
876
+ } \
877
+ if (cnt_simd_loop != 0) { /* scalar tail, or the whole range when the SIMD body was skipped */ \
878
+ if (p1 == p3) { \
879
+ for (; i < n; i++) { \
880
+ ((dfloat*)p1)[i] = m_##fOpFunc(((dfloat*)p1)[i], ((dfloat*)p2)[i]); \
881
+ } \
882
+ } else { \
883
+ for (; i < n; i++) { \
884
+ ((dfloat*)p3)[i] = m_##fOpFunc(((dfloat*)p1)[i], ((dfloat*)p2)[i]); \
885
+ } \
886
+ } \
887
+ } \
888
+ return; \
889
+ } \
890
+ if (is_aligned_step(s1, sizeof(dfloat)) && is_aligned_step(s2, sizeof(dfloat)) && \
891
+ is_aligned_step(s3, sizeof(dfloat))) { \
892
+ if (s2 == 0) { /* path 2: second operand does not advance, so it is a single value */ \
893
+ if (s1 == sizeof(dfloat) && s3 == sizeof(dfloat)) { \
894
+ b = vld1q_dup_f64(&((dfloat*)p2)[0]); /* replicate the single value across the vector */ \
895
+ if ((n >= num_pack) && \
896
+ is_same_aligned2(&((dfloat*)p1)[i], &((dfloat*)p3)[i], NEON_ALIGNMENT_SIZE)) { \
897
+ cnt = get_count_of_elements_not_aligned_to_simd_size( \
898
+ &((dfloat*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(dfloat) \
899
+ ); \
900
+ if (p1 == p3) { \
901
+ for (; i < cnt; i++) { /* no reset needed: i is still 0 from its initializer */ \
902
+ ((dfloat*)p1)[i] = m_##fOpFunc(((dfloat*)p1)[i], *(dfloat*)p2); \
903
+ } \
904
+ } else { \
905
+ for (; i < cnt; i++) { \
906
+ ((dfloat*)p3)[i] = m_##fOpFunc(((dfloat*)p1)[i], *(dfloat*)p2); \
907
+ } \
908
+ } \
909
+ cnt_simd_loop = (n - i) % num_pack; \
910
+ for (; i < n - cnt_simd_loop; i += num_pack) { \
911
+ a = vld1q_f64(&((dfloat*)p1)[i]); \
912
+ a = fSimdOp(a, b); \
913
+ vst1q_f64(&((dfloat*)p3)[i], a); \
914
+ } \
915
+ } \
916
+ if (cnt_simd_loop != 0) { \
917
+ if (p1 == p3) { \
918
+ for (; i < n; i++) { \
919
+ ((dfloat*)p1)[i] = m_##fOpFunc(((dfloat*)p1)[i], *(dfloat*)p2); \
920
+ } \
921
+ } else { \
922
+ for (; i < n; i++) { \
923
+ ((dfloat*)p3)[i] = m_##fOpFunc(((dfloat*)p1)[i], *(dfloat*)p2); \
924
+ } \
925
+ } \
926
+ } \
927
+ } else { \
928
+ for (i = 0; i < n; i++) { /* strided scalar loop; p2 stays fixed because s2 == 0 */ \
929
+ *(dfloat*)p3 = m_##fOpFunc(*(dfloat*)p1, *(dfloat*)p2); \
930
+ p1 += s1; \
931
+ p3 += s3; \
932
+ } \
933
+ } \
934
+ } else { /* path 3: element-aligned strides, second operand advances */ \
935
+ if (p1 == p3) { \
936
+ for (i = 0; i < n; i++) { \
937
+ *(dfloat*)p1 = m_##fOpFunc(*(dfloat*)p1, *(dfloat*)p2); \
938
+ p1 += s1; \
939
+ p2 += s2; \
940
+ } \
941
+ } else { \
942
+ for (i = 0; i < n; i++) { \
943
+ *(dfloat*)p3 = m_##fOpFunc(*(dfloat*)p1, *(dfloat*)p2); \
944
+ p1 += s1; \
945
+ p2 += s2; \
946
+ p3 += s3; \
947
+ } \
948
+ } \
949
+ } \
950
+ return; \
951
+ } \
952
+ } \
953
+ \
954
+ for (i = 0; i < n; i++) { /* path 4: reached when a pointer or stride failed the alignment checks above */ \
955
+ dfloat x; \
956
+ dfloat y; \
957
+ dfloat z; \
958
+ GET_DATA_STRIDE(p1, s1, dfloat, x); \
959
+ GET_DATA_STRIDE(p2, s2, dfloat, y); \
960
+ z = m_##fOpFunc(x, y); \
961
+ SET_DATA_STRIDE(p3, s3, dfloat, z); \
962
+ } \
963
+ }
964
+
423
965
  #endif /* NUMO_NARRAY_MH_OP_BINARY_FUNC_H */
@@ -115,4 +115,24 @@
115
115
  DEF_BINARY_SELF_FUNC(div, dfloat, numo_cDFloat) \
116
116
  DEF_BINARY_FUNC(div, '/', dfloat, numo_cDFloat)
117
117
 
118
/*
 * DEF_NARRAY_SFLT_DIV_AVX_METHOD_FUNC()
 *
 * Expands, in order, to:
 *   - DEF_BINARY_SFLT_AVX_ITER_FUNC(div, _mm256_div_ps), which defines the static
 *     iterator iter_sfloat_div using the AVX single-precision divide intrinsic;
 *   - DEF_BINARY_SELF_FUNC and DEF_BINARY_FUNC for (div, sfloat, numo_cSFloat).
 * NOTE(review): DEF_BINARY_SELF_FUNC / DEF_BINARY_FUNC are defined outside this view;
 * presumably they emit the method wrappers that drive the iterator -- confirm.
 */
+ #define DEF_NARRAY_SFLT_DIV_AVX_METHOD_FUNC() \
119
+ DEF_BINARY_SFLT_AVX_ITER_FUNC(div, _mm256_div_ps) \
120
+ DEF_BINARY_SELF_FUNC(div, sfloat, numo_cSFloat) \
121
+ DEF_BINARY_FUNC(div, '/', sfloat, numo_cSFloat)
122
+
123
/*
 * DEF_NARRAY_DFLT_DIV_AVX_METHOD_FUNC()
 *
 * Expands, in order, to:
 *   - DEF_BINARY_DFLT_AVX_ITER_FUNC(div, _mm256_div_pd), which defines the static
 *     iterator iter_dfloat_div using the AVX double-precision divide intrinsic;
 *   - DEF_BINARY_SELF_FUNC and DEF_BINARY_FUNC for (div, dfloat, numo_cDFloat).
 * NOTE(review): DEF_BINARY_SELF_FUNC / DEF_BINARY_FUNC are defined outside this view;
 * presumably they emit the method wrappers that drive the iterator -- confirm.
 */
+ #define DEF_NARRAY_DFLT_DIV_AVX_METHOD_FUNC() \
124
+ DEF_BINARY_DFLT_AVX_ITER_FUNC(div, _mm256_div_pd) \
125
+ DEF_BINARY_SELF_FUNC(div, dfloat, numo_cDFloat) \
126
+ DEF_BINARY_FUNC(div, '/', dfloat, numo_cDFloat)
127
+
128
/*
 * DEF_NARRAY_SFLT_DIV_NEON_METHOD_FUNC()
 *
 * Expands, in order, to:
 *   - DEF_BINARY_SFLT_NEON_ITER_FUNC(div, vdivq_f32), which defines the static
 *     iterator iter_sfloat_div using the NEON float32x4_t divide intrinsic;
 *   - DEF_BINARY_SELF_FUNC and DEF_BINARY_FUNC for (div, sfloat, numo_cSFloat).
 * NOTE(review): DEF_BINARY_SELF_FUNC / DEF_BINARY_FUNC are defined outside this view;
 * presumably they emit the method wrappers that drive the iterator -- confirm.
 */
+ #define DEF_NARRAY_SFLT_DIV_NEON_METHOD_FUNC() \
129
+ DEF_BINARY_SFLT_NEON_ITER_FUNC(div, vdivq_f32) \
130
+ DEF_BINARY_SELF_FUNC(div, sfloat, numo_cSFloat) \
131
+ DEF_BINARY_FUNC(div, '/', sfloat, numo_cSFloat)
132
+
133
/*
 * DEF_NARRAY_DFLT_DIV_NEON_METHOD_FUNC()
 *
 * Expands, in order, to:
 *   - DEF_BINARY_DFLT_NEON_ITER_FUNC(div, vdivq_f64), which defines the static
 *     iterator iter_dfloat_div using the NEON float64x2_t divide intrinsic;
 *   - DEF_BINARY_SELF_FUNC and DEF_BINARY_FUNC for (div, dfloat, numo_cDFloat).
 * NOTE(review): DEF_BINARY_SELF_FUNC / DEF_BINARY_FUNC are defined outside this view;
 * presumably they emit the method wrappers that drive the iterator -- confirm.
 */
+ #define DEF_NARRAY_DFLT_DIV_NEON_METHOD_FUNC() \
134
+ DEF_BINARY_DFLT_NEON_ITER_FUNC(div, vdivq_f64) \
135
+ DEF_BINARY_SELF_FUNC(div, dfloat, numo_cDFloat) \
136
+ DEF_BINARY_FUNC(div, '/', dfloat, numo_cDFloat)
137
+
118
138
  #endif /* NUMO_NARRAY_MH_OP_DIV_H */