RubyGems - numo-narray-alt - Versions diffs - 0.10.5 → 0.11.0 - Mend

numo-narray-alt 0.10.5 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/README.md +9 -0
data/ext/numo/narray/extconf.rb +2 -0
data/ext/numo/narray/numo/narray.h +3 -3
data/ext/numo/narray/src/mh/math/sqrt.h +372 -0
data/ext/numo/narray/src/mh/mulsum.h +1360 -74
data/ext/numo/narray/src/mh/op/add.h +20 -0
data/ext/numo/narray/src/mh/op/binary_func.h +542 -0
data/ext/numo/narray/src/mh/op/div.h +20 -0
data/ext/numo/narray/src/mh/op/mul.h +20 -0
data/ext/numo/narray/src/mh/op/sub.h +20 -0
data/ext/numo/narray/src/mh/sort.h +4 -4
data/ext/numo/narray/src/t_bit.c +0 -5
data/ext/numo/narray/src/t_dcomplex.c +0 -5
data/ext/numo/narray/src/t_dfloat.c +35 -3
data/ext/numo/narray/src/t_int16.c +0 -5
data/ext/numo/narray/src/t_int32.c +0 -5
data/ext/numo/narray/src/t_int64.c +0 -5
data/ext/numo/narray/src/t_int8.c +0 -5
data/ext/numo/narray/src/t_robject.c +0 -5
data/ext/numo/narray/src/t_scomplex.c +0 -5
data/ext/numo/narray/src/t_sfloat.c +35 -3
data/ext/numo/narray/src/t_uint16.c +0 -5
data/ext/numo/narray/src/t_uint32.c +0 -5
data/ext/numo/narray/src/t_uint64.c +0 -5
data/ext/numo/narray/src/t_uint8.c +0 -5
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ab5316ed6e130742f1602baa630db5b06e9c69b8bc145299f7d68eb7e20ff24b
-  data.tar.gz: 91b41466a41c977c1949ee5047cbc2ac9aeecff793db3379f765f33c13fa82ee
+  metadata.gz: 9b2e2bc7cc99b7ef2b868b588360ff66299a6570b4d6ea98c45bbb357eed661b
+  data.tar.gz: '0887ab3061ad5add393ecd99921ca88e3a06545c9617ffc9686550cb87b3e843'
 SHA512:
-  metadata.gz: 400b230e6c1ba59aa0a43a5706d32d506b5e28cc91af7445e59cf91bc8d0c8b4db2cf44a4648a8bec7cfdd322feebfab9990940c9597139a0bf4ddf9217d4cde
-  data.tar.gz: 59de858a47cc6f1c8b4ec4dcf23108e300cfb60de8ab1ce4559586da76d7cc53410c03b159d00855a686de416577109f9045dd995a89c0566df69e13f5c6e0cb
+  metadata.gz: '09f47d21922f7e24e7915222eb9ddacadf4392156e53a15aee05d4f2fc6ebdafce244c3624de375b10411768d7f1a38de7d081504ef0fe21b6b1b82156ef9410'
+  data.tar.gz: 4cc3900b5631971fc76195eee5ecf66a6c1647a604d19997b0bcece1d1341f9245ace1c405c3f5e9dbc493389f456c4c9c817ea145a1c3a47f2180edea52ae36

data/README.md CHANGED Viewed

@@ -24,6 +24,15 @@ This project is in no way intended to adversely affect the development of the or
 $ gem install numo-narray-alt
 ```
+### Build options
+By default, the floating-point classes (`Numo::SFloat` / `Numo::DFloat`) use SIMD instructions
+(SSE2 / AVX / NEON) when the target CPU supports them. To build without SIMD, pass `--with-no-simd`:
+```shell
+$ gem install numo-narray-alt -- --with-no-simd
+```
 ## Usage
 The usage is exactly the same as Numo::NArray.

data/ext/numo/narray/extconf.rb CHANGED Viewed

@@ -66,6 +66,8 @@ have_func('RTYPEDDATA_GET_DATA')
 have_var('rb_cComplex')
+$defs << '-DNUMO_NO_SIMD' if with_config('no-simd', false)
 $objs = srcs.collect { |i| "#{i}.o" }
 create_header d('numo/extconf.h')

data/ext/numo/narray/numo/narray.h CHANGED Viewed

@@ -13,10 +13,10 @@ extern "C" {
 #endif
 #endif
-#define NARRAY_VERSION "0.10.5"
+#define NARRAY_VERSION "0.11.0"
 #define NARRAY_VERSION_MAJOR 0
-#define NARRAY_VERSION_MINOR 10
-#define NARRAY_VERSION_PATCH 5
+#define NARRAY_VERSION_MINOR 11
+#define NARRAY_VERSION_PATCH 0
 #define NARRAY_VERSION_CODE                                                                    \
   (NARRAY_VERSION_MAJOR * 10000 + NARRAY_VERSION_MINOR * 100 + NARRAY_VERSION_PATCH)

data/ext/numo/narray/src/mh/math/sqrt.h CHANGED Viewed

@@ -200,4 +200,376 @@
     return na_ndloop(&ndf, 1, a1);                                                             \
   }
+#define DEF_NARRAY_FLT_SQRT_AVX_SGL_METHOD_FUNC(tDType, tNAryClass) /* Emits iter_<tDType>_math_s_sqrt and <tDType>_math_s_sqrt: element-wise m_sqrt with a __m256 (_mm256_sqrt_ps) fast path. */ \
+  static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) {                               \
+    size_t i = 0;                                                                              \
+    size_t n;                                                                                  \
+    char *p1, *p2;                                                                             \
+    ssize_t s1, s2;                                                                            \
+    size_t *idx1, *idx2;                                                                       \
+    tDType x;                                                                                  \
+    size_t cnt;                                                                                \
+    size_t cnt_simd_loop = -1; /* wraps to SIZE_MAX; stays non-zero unless set below */        \
+    __m256 a;                                                                                  \
+    size_t num_pack;                                                                           \
+    num_pack = AVX_ALIGNMENT_SIZE / sizeof(tDType); /* elems per AVX_ALIGNMENT_SIZE bytes */   \
+    /* Operand 0 is read via p1/s1/idx1; operand 1 is written via p2/s2/idx2. */               \
+    INIT_COUNTER(lp, n);                                                                       \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);                                                         \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);                                                         \
+    /* Dispatch on index-array vs. stride access for input and output. */                      \
+    if (idx1) {                                                                                \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    } else {                                                                                   \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) {                \
+          if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* both contiguous */            \
+            if ((n >= num_pack) &&                                                             \
+                is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], AVX_ALIGNMENT_SIZE)) {  \
+              cnt = get_count_of_elements_not_aligned_to_simd_size(                            \
+                &((tDType*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(tDType)                          \
+              );                                                                               \
+              for (i = 0; i < cnt; i++) { /* scalar head before SIMD loop */                   \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+              cnt_simd_loop = (n - i) % num_pack; /* leftover after full vectors */            \
+              if (p1 == p2) { /* in-place: plain aligned store */                              \
+                for (; i < n - cnt_simd_loop; i += num_pack) {                                 \
+                  a = _mm256_load_ps(&((tDType*)p1)[i]);                                       \
+                  a = _mm256_sqrt_ps(a);                                                       \
+                  _mm256_store_ps(&((tDType*)p1)[i], a);                                       \
+                }                                                                              \
+              } else { /* distinct output: non-temporal store */                               \
+                for (; i < n - cnt_simd_loop; i += num_pack) {                                 \
+                  a = _mm256_load_ps(&((tDType*)p1)[i]);                                       \
+                  a = _mm256_sqrt_ps(a);                                                       \
+                  _mm256_stream_ps(&((tDType*)p2)[i], a);                                      \
+                }                                                                              \
+              }                                                                                \
+            }                                                                                  \
+            if (cnt_simd_loop != 0) { /* scalar tail, or all n if SIMD skipped */              \
+              for (; i < n; i++) {                                                             \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+          if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) {    \
+            for (i = 0; i < n; i++) {                                                          \
+              *(tDType*)p2 = m_sqrt(*(tDType*)p1);                                             \
+              p1 += s1;                                                                        \
+              p2 += s2;                                                                        \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+        }                                                                                      \
+        for (i = 0; i < n; i++) { /* generic strided fallback */                               \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  /* Wrapper: applies the iterator to a1 through na_ndloop and returns its result. */          \
+  static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) {                                     \
+    ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } };                                            \
+    ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } };                                          \
+    ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout };                \
+    return na_ndloop(&ndf, 1, a1);                                                             \
+  }
+#define DEF_NARRAY_FLT_SQRT_AVX_DBL_METHOD_FUNC(tDType, tNAryClass) /* Emits iter_<tDType>_math_s_sqrt and <tDType>_math_s_sqrt: element-wise m_sqrt with a __m256d (_mm256_sqrt_pd) fast path. */ \
+  static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) {                               \
+    size_t i = 0;                                                                              \
+    size_t n;                                                                                  \
+    char *p1, *p2;                                                                             \
+    ssize_t s1, s2;                                                                            \
+    size_t *idx1, *idx2;                                                                       \
+    tDType x;                                                                                  \
+    size_t cnt;                                                                                \
+    size_t cnt_simd_loop = -1; /* wraps to SIZE_MAX; stays non-zero unless set below */        \
+    __m256d a;                                                                                 \
+    size_t num_pack;                                                                           \
+    num_pack = AVX_ALIGNMENT_SIZE / sizeof(tDType); /* elems per AVX_ALIGNMENT_SIZE bytes */   \
+    /* Operand 0 is read via p1/s1/idx1; operand 1 is written via p2/s2/idx2. */               \
+    INIT_COUNTER(lp, n);                                                                       \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);                                                         \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);                                                         \
+    /* Dispatch on index-array vs. stride access for input and output. */                      \
+    if (idx1) {                                                                                \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    } else {                                                                                   \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) {                \
+          if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* both contiguous */            \
+            if ((n >= num_pack) &&                                                             \
+                is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], AVX_ALIGNMENT_SIZE)) {  \
+              cnt = get_count_of_elements_not_aligned_to_simd_size(                            \
+                &((tDType*)p1)[i], AVX_ALIGNMENT_SIZE, sizeof(tDType)                          \
+              );                                                                               \
+              for (i = 0; i < cnt; i++) { /* scalar head before SIMD loop */                   \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+              cnt_simd_loop = (n - i) % num_pack; /* leftover after full vectors */            \
+              if (p1 == p2) { /* in-place: plain aligned store */                              \
+                for (; i < n - cnt_simd_loop; i += num_pack) {                                 \
+                  a = _mm256_load_pd(&((tDType*)p1)[i]);                                       \
+                  a = _mm256_sqrt_pd(a);                                                       \
+                  _mm256_store_pd(&((tDType*)p1)[i], a);                                       \
+                }                                                                              \
+              } else { /* distinct output: non-temporal store */                               \
+                for (; i < n - cnt_simd_loop; i += num_pack) {                                 \
+                  a = _mm256_load_pd(&((tDType*)p1)[i]);                                       \
+                  a = _mm256_sqrt_pd(a);                                                       \
+                  _mm256_stream_pd(&((tDType*)p2)[i], a);                                      \
+                }                                                                              \
+              }                                                                                \
+            }                                                                                  \
+            if (cnt_simd_loop != 0) { /* scalar tail, or all n if SIMD skipped */              \
+              for (; i < n; i++) {                                                             \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+          if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) {    \
+            for (i = 0; i < n; i++) {                                                          \
+              *(tDType*)p2 = m_sqrt(*(tDType*)p1);                                             \
+              p1 += s1;                                                                        \
+              p2 += s2;                                                                        \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+        }                                                                                      \
+        for (i = 0; i < n; i++) { /* generic strided fallback */                               \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  /* Wrapper: applies the iterator to a1 through na_ndloop and returns its result. */          \
+  static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) {                                     \
+    ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } };                                            \
+    ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } };                                          \
+    ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout };                \
+    return na_ndloop(&ndf, 1, a1);                                                             \
+  }
+#define DEF_NARRAY_FLT_SQRT_NEON_SGL_METHOD_FUNC(tDType, tNAryClass) /* Emits iter_<tDType>_math_s_sqrt and <tDType>_math_s_sqrt: element-wise m_sqrt with a float32x4_t (vsqrtq_f32) fast path. */ \
+  static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) {                               \
+    size_t i = 0;                                                                              \
+    size_t n;                                                                                  \
+    char *p1, *p2;                                                                             \
+    ssize_t s1, s2;                                                                            \
+    size_t *idx1, *idx2;                                                                       \
+    tDType x;                                                                                  \
+    size_t cnt;                                                                                \
+    size_t cnt_simd_loop = -1; /* wraps to SIZE_MAX; stays non-zero unless set below */        \
+    float32x4_t a;                                                                             \
+    size_t num_pack;                                                                           \
+    num_pack = NEON_ALIGNMENT_SIZE / sizeof(tDType); /* elems per NEON_ALIGNMENT_SIZE bytes */ \
+    /* Operand 0 is read via p1/s1/idx1; operand 1 is written via p2/s2/idx2. */               \
+    INIT_COUNTER(lp, n);                                                                       \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);                                                         \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);                                                         \
+    /* Dispatch on index-array vs. stride access for input and output. */                      \
+    if (idx1) {                                                                                \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    } else {                                                                                   \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) {                \
+          if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* both contiguous */            \
+            if ((n >= num_pack) &&                                                             \
+                is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
+              cnt = get_count_of_elements_not_aligned_to_simd_size(                            \
+                &((tDType*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(tDType)                         \
+              );                                                                               \
+              for (i = 0; i < cnt; i++) { /* scalar head before SIMD loop */                   \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+              cnt_simd_loop = (n - i) % num_pack; /* leftover after full vectors */            \
+              for (; i < n - cnt_simd_loop; i += num_pack) { /* NEON vector body */            \
+                a = vld1q_f32(&((tDType*)p1)[i]);                                              \
+                a = vsqrtq_f32(a);                                                             \
+                vst1q_f32(&((tDType*)p2)[i], a);                                               \
+              }                                                                                \
+            }                                                                                  \
+            if (cnt_simd_loop != 0) { /* scalar tail, or all n if SIMD skipped */              \
+              for (; i < n; i++) {                                                             \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+          if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) {    \
+            for (i = 0; i < n; i++) {                                                          \
+              *(tDType*)p2 = m_sqrt(*(tDType*)p1);                                             \
+              p1 += s1;                                                                        \
+              p2 += s2;                                                                        \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+        }                                                                                      \
+        for (i = 0; i < n; i++) { /* generic strided fallback */                               \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  /* Wrapper: applies the iterator to a1 through na_ndloop and returns its result. */          \
+  static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) {                                     \
+    ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } };                                            \
+    ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } };                                          \
+    ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout };                \
+    return na_ndloop(&ndf, 1, a1);                                                             \
+  }
+#define DEF_NARRAY_FLT_SQRT_NEON_DBL_METHOD_FUNC(tDType, tNAryClass) /* NEON float64 sqrt */   \
+  static void iter_##tDType##_math_s_sqrt(na_loop_t* const lp) { /* kernel for na_ndloop */    \
+    size_t i = 0; /* cursor shared by the head, SIMD and tail loops */                         \
+    size_t n;                                                                                  \
+    char *p1, *p2;                                                                             \
+    ssize_t s1, s2;                                                                            \
+    size_t *idx1, *idx2;                                                                       \
+    tDType x;                                                                                  \
+    size_t cnt;                                                                                \
+    size_t cnt_simd_loop = -1; /* -1 wraps to a nonzero value; reset on the SIMD path */       \
+    float64x2_t a;                                                                             \
+    size_t num_pack;                                                                           \
+    num_pack = NEON_ALIGNMENT_SIZE / sizeof(tDType); /* elements per SIMD step */              \
+    /* Stores the square root of every element of arg 0 into arg 1. */                         \
+    INIT_COUNTER(lp, n);                                                                       \
+    INIT_PTR_IDX(lp, 0, p1, s1, idx1);                                                         \
+    INIT_PTR_IDX(lp, 1, p2, s2, idx2);                                                         \
+    /* A non-NULL idx selects indexed access for that argument, else its stride is used. */    \
+    if (idx1) {                                                                                \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_INDEX(p1, idx1, tDType, x);                                                 \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    } else {                                                                                   \
+      if (idx2) {                                                                              \
+        for (i = 0; i < n; i++) {                                                              \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_INDEX(p2, idx2, tDType, x);                                                 \
+        }                                                                                      \
+      } else {                                                                                 \
+        if (is_aligned(p1, sizeof(tDType)) && is_aligned(p2, sizeof(tDType))) {                \
+          if (s1 == sizeof(tDType) && s2 == sizeof(tDType)) { /* steps equal element size */   \
+            if ((n >= num_pack) &&                                                             \
+                is_same_aligned2(&((tDType*)p1)[i], &((tDType*)p2)[i], NEON_ALIGNMENT_SIZE)) { \
+              cnt = get_count_of_elements_not_aligned_to_simd_size(                            \
+                &((tDType*)p1)[i], NEON_ALIGNMENT_SIZE, sizeof(tDType)                         \
+              );                                                                               \
+              for (i = 0; i < cnt; i++) { /* scalar head: first cnt elements */                \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+              cnt_simd_loop = (n - i) % num_pack; /* leftover after whole SIMD steps */        \
+              for (; i < n - cnt_simd_loop; i += num_pack) {                                   \
+                a = vld1q_f64(&((tDType*)p1)[i]);                                              \
+                a = vsqrtq_f64(a); /* sqrt of both f64 lanes */                                \
+                vst1q_f64(&((tDType*)p2)[i], a);                                               \
+              }                                                                                \
+            }                                                                                  \
+            if (cnt_simd_loop != 0) { /* also true when SIMD was skipped */                    \
+              for (; i < n; i++) {                                                             \
+                ((tDType*)p2)[i] = m_sqrt(((tDType*)p1)[i]);                                   \
+              }                                                                                \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+          if (is_aligned_step(s1, sizeof(tDType)) && is_aligned_step(s2, sizeof(tDType))) {    \
+            for (i = 0; i < n; i++) { /* strided, direct typed access */                       \
+              *(tDType*)p2 = m_sqrt(*(tDType*)p1);                                             \
+              p1 += s1;                                                                        \
+              p2 += s2;                                                                        \
+            }                                                                                  \
+            return;                                                                            \
+          }                                                                                    \
+        }                                                                                      \
+        for (i = 0; i < n; i++) { /* fallback when no fast path applies */                     \
+          GET_DATA_STRIDE(p1, s1, tDType, x);                                                  \
+          x = m_sqrt(x);                                                                       \
+          SET_DATA_STRIDE(p2, s2, tDType, x);                                                  \
+        }                                                                                      \
+      }                                                                                        \
+    }                                                                                          \
+  }                                                                                            \
+  /* Entry point: applies the kernel above to a1 through na_ndloop. */                         \
+  static VALUE tDType##_math_s_sqrt(VALUE mod, VALUE a1) {                                     \
+    ndfunc_arg_in_t ain[1] = { { tNAryClass, 0 } };                                            \
+    ndfunc_arg_out_t aout[1] = { { tNAryClass, 0 } };                                          \
+    ndfunc_t ndf = { iter_##tDType##_math_s_sqrt, FULL_LOOP, 1, 1, ain, aout };                \
+    return na_ndloop(&ndf, 1, a1);                                                             \
+  }
 #endif /* NUMO_NARRAY_MH_MATH_SQRT_H */