RubyGems - numo-narray-alt - Versions diffs - 0.10.4 → 0.11.0 - Mend

numo-narray-alt 0.10.4 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

checksums.yaml +4 -4
data/README.md +9 -0
data/ext/numo/narray/extconf.rb +2 -0
data/ext/numo/narray/numo/narray.h +3 -3
data/ext/numo/narray/numo/types/float_macro.h +5 -1
data/ext/numo/narray/src/mh/math/sqrt.h +372 -0
data/ext/numo/narray/src/mh/mulsum.h +1360 -74
data/ext/numo/narray/src/mh/op/add.h +20 -0
data/ext/numo/narray/src/mh/op/binary_func.h +542 -0
data/ext/numo/narray/src/mh/op/div.h +20 -0
data/ext/numo/narray/src/mh/op/mul.h +20 -0
data/ext/numo/narray/src/mh/op/sub.h +20 -0
data/ext/numo/narray/src/mh/sort.h +4 -4
data/ext/numo/narray/src/t_bit.c +0 -5
data/ext/numo/narray/src/t_dcomplex.c +0 -5
data/ext/numo/narray/src/t_dfloat.c +35 -3
data/ext/numo/narray/src/t_int16.c +0 -5
data/ext/numo/narray/src/t_int32.c +0 -5
data/ext/numo/narray/src/t_int64.c +0 -5
data/ext/numo/narray/src/t_int8.c +0 -5
data/ext/numo/narray/src/t_robject.c +0 -5
data/ext/numo/narray/src/t_scomplex.c +0 -5
data/ext/numo/narray/src/t_sfloat.c +35 -3
data/ext/numo/narray/src/t_uint16.c +0 -5
data/ext/numo/narray/src/t_uint32.c +0 -5
data/ext/numo/narray/src/t_uint64.c +0 -5
data/ext/numo/narray/src/t_uint8.c +0 -5
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b00e8da08175d2c19b50dc634303140c251a2139c223b2c8f3ef515858314da0
-  data.tar.gz: 484a78cdd0959d3d2a09e438c8eaeef953964612590c3c44050d5ca906e65a75
+  metadata.gz: 9b2e2bc7cc99b7ef2b868b588360ff66299a6570b4d6ea98c45bbb357eed661b
+  data.tar.gz: '0887ab3061ad5add393ecd99921ca88e3a06545c9617ffc9686550cb87b3e843'
 SHA512:
-  metadata.gz: 58600b9b39c99a28f4ec6df40d1dfba22db1c1aa38dc50d61fa905f31cdb3dcd623a0b07b81519d8341d0afa644b4ec3695c5ddf9889ffbb82decca415b34959
-  data.tar.gz: e0112d66a2bd84afb953ab911119884969fb6004815313c2d85e8898963393de4434d7ca0286bc9da84a4d880dfc16f5c609c218fcfd61fc65add01186e10d8c
+  metadata.gz: '09f47d21922f7e24e7915222eb9ddacadf4392156e53a15aee05d4f2fc6ebdafce244c3624de375b10411768d7f1a38de7d081504ef0fe21b6b1b82156ef9410'
+  data.tar.gz: 4cc3900b5631971fc76195eee5ecf66a6c1647a604d19997b0bcece1d1341f9245ace1c405c3f5e9dbc493389f456c4c9c817ea145a1c3a47f2180edea52ae36

data/README.md CHANGED Viewed

@@ -24,6 +24,15 @@ This project is in no way intended to adversely affect the development of the or
 $ gem install numo-narray-alt
 ```
+### Build options
+By default, the floating-point classes (`Numo::SFloat` / `Numo::DFloat`) use SIMD instructions
+(SSE2 / AVX / NEON) when the target CPU supports them. To build without SIMD, pass `--with-no-simd`:
+```shell
+$ gem install numo-narray-alt -- --with-no-simd
+```
 ## Usage
 The usage is exactly the same as Numo::NArray.

data/ext/numo/narray/extconf.rb CHANGED Viewed

@@ -66,6 +66,8 @@ have_func('RTYPEDDATA_GET_DATA')
 have_var('rb_cComplex')
+$defs << '-DNUMO_NO_SIMD' if with_config('no-simd', false)
 $objs = srcs.collect { |i| "#{i}.o" }
 create_header d('numo/extconf.h')

data/ext/numo/narray/numo/narray.h CHANGED Viewed

@@ -13,10 +13,10 @@ extern "C" {
 #endif
 #endif
-#define NARRAY_VERSION "0.10.4"
+#define NARRAY_VERSION "0.11.0"
 #define NARRAY_VERSION_MAJOR 0
-#define NARRAY_VERSION_MINOR 10
-#define NARRAY_VERSION_PATCH 4
+#define NARRAY_VERSION_MINOR 11
+#define NARRAY_VERSION_PATCH 0
 #define NARRAY_VERSION_CODE                                                                    \
   (NARRAY_VERSION_MAJOR * 10000 + NARRAY_VERSION_MINOR * 100 + NARRAY_VERSION_PATCH)

data/ext/numo/narray/numo/types/float_macro.h CHANGED Viewed

@@ -12,7 +12,7 @@ extern double pow(double, double);
 #define m_zero 0.0
 #define m_one 1.0
-#define m_num_to_data(x) (NIL_P(x) ? nan("") : NUM2DBL(x))
+#define m_num_to_data(x) f_num_to_data(x)
 #define m_data_to_num(x) rb_float_new(x)
 #define m_from_double(x) (x)
@@ -126,6 +126,10 @@ extern double pow(double, double);
 #define m_ldexp(x, y) ldexp(x, y)
 #define m_frexp(x, exp) frexp(x, exp)
+static inline dtype f_num_to_data(VALUE x) {
+  return NIL_P(x) ? nan("") : NUM2DBL(x); /* nil becomes NaN; anything else goes through NUM2DBL */
+}
 static inline dtype pow_int(dtype x, int p) {
   dtype r = 1;
   switch (p) {

data/ext/numo/narray/src/mh/math/sqrt.h CHANGED Viewed

@@ -200,4 +200,376 @@
     return na_ndloop(&ndf, 1, a1);                                                             \
   }
+#define DEF_NARRAY_FLT_SQRT_AVX_SGL_METHOD_FUNC(tDType, tNAryClass) /* AVX float sqrt */       \
+  static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { /* element-wise sqrt */       \
+    size_t i = 0;                                                                              \
+    size_t n;                                                                                  \
+    char *p1, *p2;                                                                             \
+    ssize_t s1, s2;                                                                            \
+    size_t *idx1, *idx2;                                                                       \
+    tDType x;                                                                                  \
+    size_t cnt;                                                                                \
+    size_t cnt_simd_loop = -1; /* nonzero => scalar tail loop runs */                          \
+    __m256 a;                                                                                  \
+    size_t num_pack;                                                                           \
+    num_pack = AVX_ALIGNMENT_SIZE / sizeof(tDType); /* elements per SIMD step */               \
+                                                                                               \
+    INIT_COUNTER(lp, n);                                                                       \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);                                                         \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);                                                         \
+                                                                                               \
+    if (idx1) { /* input addressed via idx1 */                                                 \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    } else { /* input addressed by stride s1 */                                                \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) {                \
+          if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* both contiguous */            \
+            if ((n >= num_pack) &&                                                             \
+                is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], AVX_ALIGNMENT_SIZE)) {  \
+              cnt = get_count_of_elements_not_aligned_to_simd_size(                            \
+                &((tDType*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(tDType)                          \
+              );                                                                               \
+              for (i = 0; i < cnt; i++) { /* scalar head before SIMD */                        \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+              cnt_simd_loop = (n - i) % num_pack;                                              \
+              if (p1 == p2) { /* in-place: plain store */                                      \
+                for (; i < n - cnt_simd_loop; i += num_pack) {                                 \
+                  a = _mm256_load_ps(&((tDType*)p1)[i]);                                       \
+                  a = _mm256_sqrt_ps(a);                                                       \
+                  _mm256_store_ps(&((tDType*)p1)[i], a);                                       \
+                }                                                                              \
+              } else { /* separate output: streaming store */                                  \
+                for (; i < n - cnt_simd_loop; i += num_pack) {                                 \
+                  a = _mm256_load_ps(&((tDType*)p1)[i]);                                       \
+                  a = _mm256_sqrt_ps(a);                                                       \
+                  _mm256_stream_ps(&((tDType*)p2)[i], a);                                      \
+                }                                                                              \
+              }                                                                                \
+            }                                                                                  \
+            if (cnt_simd_loop != 0) { /* scalar tail, or all of it */                          \
+              for (; i < n; i++) {                                                             \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+          if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) {    \
+            for (i = 0; i < n; i++) {                                                          \
+              *(tDType*)p2 = m_sqrt(*(tDType*)p1);                                             \
+              p1 += s1;                                                                        \
+              p2 += s2;                                                                        \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+        }                                                                                      \
+        for (i = 0; i < n; i++) { /* generic strided fallback */                               \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+                                                                                               \
+  static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { /* drives iter via na_ndloop */     \
+    ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } };                                            \
+    ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } };                                          \
+    ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout };                \
+    return na_ndloop(&ndf, 1, a1);                                                             \
+  }
+#define DEF_NARRAY_FLT_SQRT_AVX_DBL_METHOD_FUNC(tDType, tNAryClass) /* AVX double sqrt */      \
+  static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { /* element-wise sqrt */       \
+    size_t i = 0;                                                                              \
+    size_t n;                                                                                  \
+    char *p1, *p2;                                                                             \
+    ssize_t s1, s2;                                                                            \
+    size_t *idx1, *idx2;                                                                       \
+    tDType x;                                                                                  \
+    size_t cnt;                                                                                \
+    size_t cnt_simd_loop = -1; /* nonzero => scalar tail loop runs */                          \
+    __m256d a;                                                                                 \
+    size_t num_pack;                                                                           \
+    num_pack = AVX_ALIGNMENT_SIZE / sizeof(tDType); /* elements per SIMD step */               \
+                                                                                               \
+    INIT_COUNTER(lp, n);                                                                       \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);                                                         \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);                                                         \
+                                                                                               \
+    if (idx1) { /* input addressed via idx1 */                                                 \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    } else { /* input addressed by stride s1 */                                                \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) {                \
+          if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* both contiguous */            \
+            if ((n >= num_pack) &&                                                             \
+                is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], AVX_ALIGNMENT_SIZE)) {  \
+              cnt = get_count_of_elements_not_aligned_to_simd_size(                            \
+                &((tDType*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(tDType)                          \
+              );                                                                               \
+              for (i = 0; i < cnt; i++) { /* scalar head before SIMD */                        \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+              cnt_simd_loop = (n - i) % num_pack;                                              \
+              if (p1 == p2) { /* in-place: plain store */                                      \
+                for (; i < n - cnt_simd_loop; i += num_pack) {                                 \
+                  a = _mm256_load_pd(&((tDType*)p1)[i]);                                       \
+                  a = _mm256_sqrt_pd(a);                                                       \
+                  _mm256_store_pd(&((tDType*)p1)[i], a);                                       \
+                }                                                                              \
+              } else { /* separate output: streaming store */                                  \
+                for (; i < n - cnt_simd_loop; i += num_pack) {                                 \
+                  a = _mm256_load_pd(&((tDType*)p1)[i]);                                       \
+                  a = _mm256_sqrt_pd(a);                                                       \
+                  _mm256_stream_pd(&((tDType*)p2)[i], a);                                      \
+                }                                                                              \
+              }                                                                                \
+            }                                                                                  \
+            if (cnt_simd_loop != 0) { /* scalar tail, or all of it */                          \
+              for (; i < n; i++) {                                                             \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+          if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) {    \
+            for (i = 0; i < n; i++) {                                                          \
+              *(tDType*)p2 = m_sqrt(*(tDType*)p1);                                             \
+              p1 += s1;                                                                        \
+              p2 += s2;                                                                        \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+        }                                                                                      \
+        for (i = 0; i < n; i++) { /* generic strided fallback */                               \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+                                                                                               \
+  static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { /* drives iter via na_ndloop */     \
+    ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } };                                            \
+    ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } };                                          \
+    ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout };                \
+    return na_ndloop(&ndf, 1, a1);                                                             \
+  }
+#define DEF_NARRAY_FLT_SQRT_NEON_SGL_METHOD_FUNC(tDType, tNAryClass) /* NEON float sqrt */     \
+  static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { /* element-wise sqrt */       \
+    size_t i = 0;                                                                              \
+    size_t n;                                                                                  \
+    char *p1, *p2;                                                                             \
+    ssize_t s1, s2;                                                                            \
+    size_t *idx1, *idx2;                                                                       \
+    tDType x;                                                                                  \
+    size_t cnt;                                                                                \
+    size_t cnt_simd_loop = -1; /* nonzero => scalar tail loop runs */                          \
+    float32x4_t a;                                                                             \
+    size_t num_pack;                                                                           \
+    num_pack = NEON_ALIGNMENT_SIZE / sizeof(tDType); /* elements per SIMD step */              \
+                                                                                               \
+    INIT_COUNTER(lp, n);                                                                       \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);                                                         \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);                                                         \
+                                                                                               \
+    if (idx1) { /* input addressed via idx1 */                                                 \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    } else { /* input addressed by stride s1 */                                                \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) {                \
+          if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* both contiguous */            \
+            if ((n >= num_pack) &&                                                             \
+                is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
+              cnt = get_count_of_elements_not_aligned_to_simd_size(                            \
+                &((tDType*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(tDType)                         \
+              );                                                                               \
+              for (i = 0; i < cnt; i++) { /* scalar head before SIMD */                        \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+              cnt_simd_loop = (n - i) % num_pack;                                              \
+              for (; i < n - cnt_simd_loop; i += num_pack) { /* NEON vector body */            \
+                a = vld1q_f32(&((tDType*)p1)[i]);                                              \
+                a = vsqrtq_f32(a);                                                             \
+                vst1q_f32(&((tDType*)p2)[i], a);                                               \
+              }                                                                                \
+            }                                                                                  \
+            if (cnt_simd_loop != 0) { /* scalar tail, or all of it */                          \
+              for (; i < n; i++) {                                                             \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+          if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) {    \
+            for (i = 0; i < n; i++) {                                                          \
+              *(tDType*)p2 = m_sqrt(*(tDType*)p1);                                             \
+              p1 += s1;                                                                        \
+              p2 += s2;                                                                        \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+        }                                                                                      \
+        for (i = 0; i < n; i++) { /* generic strided fallback */                               \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+                                                                                               \
+  static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { /* drives iter via na_ndloop */     \
+    ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } };                                            \
+    ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } };                                          \
+    ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout };                \
+    return na_ndloop(&ndf, 1, a1);                                                             \
+  }
+#define DEF_NARRAY_FLT_SQRT_NEON_DBL_METHOD_FUNC(tDType, tNAryClass) /*
+ * Expands to two static functions:
+ *   iter_<tDType>_math_s_sqrt(lp)  - stores m_sqrt(x) of each of the n elements of loop
+ *                                    operand 0 into loop operand 1.
+ *   <tDType>_math_s_sqrt(mod, a1)  - builds an ndfunc_t around that iterator and runs it
+ *                                    through na_ndloop().
+ * Contiguous data that passes the alignment checks is processed a vector at a time with
+ * vld1q_f64 / vsqrtq_f64 / vst1q_f64; all other layouts go one element at a time.
+ * NOTE(review): the f64 intrinsics imply tDType is a 64-bit float type - confirm at the
+ * point of instantiation.
+ */                                                                                            \
+  static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) {                               \
+    size_t i = 0; /* element cursor shared by the prologue, SIMD and tail loops */             \
+    size_t n; /* element count, set by INIT_COUNTER */                                         \
+    char *p1, *p2; /* byte pointers: p1 = input, p2 = output */                                \
+    ssize_t s1, s2; /* strides in bytes for p1 / p2 */                                         \
+    size_t *idx1, *idx2; /* non-NULL selects the GET/SET_DATA_INDEX accessors */               \
+    tDType x; /* scratch value for the accessor macros */                                      \
+    size_t cnt; /* length of the scalar prologue */                                            \
+    size_t cnt_simd_loop = -1; /* tail length; -1 wraps to nonzero until SIMD runs */          \
+    float64x2_t a; /* NEON vector of two 64-bit floats */                                      \
+    size_t num_pack; /* elements consumed per SIMD step */                                     \
+    num_pack = NEON_ALIGNMENT_SIZE / sizeof(tDType);                                           \
+                                                                                               \
+    INIT_COUNTER(lp, n);                                                                       \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);                                                         \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);                                                         \
+                                                                                               \
+    if (idx1) { /* input addressed through idx1 */                                             \
+      if (idx2) { /* indexed input, indexed output */                                          \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else { /* indexed input, strided output */                                             \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    } else { /* input addressed by stride s1 */                                                \
+      if (idx2) { /* strided input, indexed output */                                          \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else { /* strided input, strided output */                                             \
+        if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) {                \
+          if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* contiguous elements */        \
+            if ((n >= num_pack) && /* one full vector fits; p1/p2 pass is_same_aligned2 */     \
+                is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
+              cnt = get_count_of_elements_not_aligned_to_simd_size(                            \
+                &((tDType*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(tDType)                         \
+              ); /* presumably elements before the first aligned address */                    \
+              for (i = 0; i < cnt; i++) { /* scalar prologue */                                \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+              cnt_simd_loop = (n - i) % num_pack; /* elements left for the scalar tail */      \
+              for (; i < n - cnt_simd_loop; i += num_pack) { /* one vector per iteration */    \
+                a = vld1q_f64(&((tDType*)p1)[i]);                                              \
+                a = vsqrtq_f64(a);                                                             \
+                vst1q_f64(&((tDType*)p2)[i], a);                                               \
+              }                                                                                \
+            }                                                                                  \
+            if (cnt_simd_loop != 0) { /* scalar tail; whole array if SIMD was skipped */       \
+              for (; i < n; i++) {                                                             \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+          if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) {    \
+            for (i = 0; i < n; i++) { /* typed access, pointers advanced by stride */          \
+              *(tDType*)p2 = m_sqrt(*(tDType*)p1);                                             \
+              p1 += s1;                                                                        \
+              p2 += s2;                                                                        \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+        }                                                                                      \
+        for (i = 0; i < n; i++) { /* general case via the stride accessors */                  \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+                                                                                               \
+  static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) { /* mod is not used */               \
+    ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } }; /* single input operand */                 \
+    ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } }; /* single output operand */              \
+    ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout };                \
+    return na_ndloop(&ndf, 1, a1); /* run the loop body over a1 */                             \
+  }
 #endif /* NUMO_NARRAY_MH_MATH_SQRT_H */