sequenzo-0.1.17-cp39-cp39-win_amd64.whl → sequenzo-0.1.18-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic.

Files changed (101)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +156 -156
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cp39-win_amd64.pyd +0 -0
  5. sequenzo/clustering/clustering_c_code.cp39-win_amd64.pyd +0 -0
  6. sequenzo/clustering/hierarchical_clustering.py +202 -8
  7. sequenzo/define_sequence_data.py +34 -2
  8. sequenzo/dissimilarity_measures/c_code.cp39-win_amd64.pyd +0 -0
  9. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  10. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  12. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  13. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  14. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  58. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  59. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +6 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +54 -2
  61. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +8 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +11 -4
  63. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +18 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +8 -14
  65. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +216 -173
  66. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +6 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +1 -1
  68. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +7 -4
  69. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +6 -2
  70. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +32 -18
  71. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +21 -24
  72. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +69 -9
  73. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +156 -156
  74. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cp39-win_amd64.pyd +0 -0
  75. sequenzo/dissimilarity_measures/utils/seqconc.c +156 -156
  76. sequenzo/dissimilarity_measures/utils/seqconc.cp39-win_amd64.pyd +0 -0
  77. sequenzo/dissimilarity_measures/utils/seqdss.c +156 -156
  78. sequenzo/dissimilarity_measures/utils/seqdss.cp39-win_amd64.pyd +0 -0
  79. sequenzo/dissimilarity_measures/utils/seqdur.c +156 -156
  80. sequenzo/dissimilarity_measures/utils/seqdur.cp39-win_amd64.pyd +0 -0
  81. sequenzo/dissimilarity_measures/utils/seqlength.c +156 -156
  82. sequenzo/dissimilarity_measures/utils/seqlength.cp39-win_amd64.pyd +0 -0
  83. sequenzo/sequence_characteristics/__init__.py +4 -0
  84. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  85. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  86. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  87. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  88. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  89. sequenzo/sequence_characteristics/turbulence.py +47 -67
  90. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  91. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  92. sequenzo/visualization/plot_sequence_index.py +58 -35
  93. sequenzo/visualization/plot_state_distribution.py +57 -36
  94. sequenzo/with_event_history_analysis/__init__.py +35 -0
  95. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  96. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  97. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  98. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +101 -94
  99. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  100. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  101. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_NEON_HPP
 
 #include <algorithm>
+#include <array>
 #include <complex>
 #include <tuple>
 #include <type_traits>
@@ -717,16 +718,10 @@ namespace xsimd
             return vnegq_s32(rhs);
         }
 
-        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
-        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
-        {
-            return batch<T, A> { -rhs.get(0), -rhs.get(1) };
-        }
-
-        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch<T, A> { -rhs.get(0), -rhs.get(1) };
+            return 0 - rhs;
         }
 
         template <class A>
@@ -923,16 +918,28 @@ namespace xsimd
             return dispatcher.apply(register_type(lhs), register_type(rhs));
         }
 
-        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
+            auto eq32 = vceqq_u32(vreinterpretq_u32_u64(lhs.data), vreinterpretq_u32_u64(rhs.data));
+            auto rev32 = vrev64q_u32(eq32);
+            auto eq64 = vandq_u32(eq32, rev32);
+            return batch_bool<T, A>(vreinterpretq_u64_u32(eq64));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            auto eq32 = vceqq_u32(vreinterpretq_u32_s64(lhs.data), vreinterpretq_u32_s64(rhs.data));
+            auto rev32 = vrev64q_u32(eq32);
+            auto eq64 = vandq_u32(eq32, rev32);
+            return batch_bool<T, A>(vreinterpretq_u64_u32(eq64));
         }
 
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
+            return eq(batch<T, A> { lhs.data }, batch<T, A> { rhs.data }, A {});
         }
 
         /*************
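The hunk above replaces a per-lane scalar fallback with a branch-free emulation: ARMv7 NEON has no 64-bit compare, so each 64-bit lane is compared as two 32-bit halves. A minimal scalar model of the trick (illustrative names, not part of the package):

    #include <cstdint>

    // Compare one 64-bit lane as two 32-bit halves, mirroring
    // vceqq_u32 -> vrev64q_u32 -> vandq_u32 above.
    uint64_t eq64_model(uint64_t lhs, uint64_t rhs)
    {
        // vceqq_u32: per-half all-ones/all-zeros masks.
        uint32_t lo = (static_cast<uint32_t>(lhs) == static_cast<uint32_t>(rhs)) ? ~0u : 0u;
        uint32_t hi = ((lhs >> 32) == (rhs >> 32)) ? ~0u : 0u;
        // vrev64q_u32 swaps the two halves within the lane, so ANDing
        // mixes each half's verdict with its sibling's.
        uint32_t combined = lo & hi;
        // The lane is all-ones iff both halves matched, i.e. iff lhs == rhs.
        return (static_cast<uint64_t>(combined) << 32) | combined;
    }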
@@ -985,10 +992,19 @@ namespace xsimd
             return dispatcher.apply(register_type(lhs), register_type(rhs));
         }
 
-        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using register_type = typename batch<T, A>::register_type;
+            return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(register_type(lhs), register_type(rhs)), 63)));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
+            using register_type = typename batch<T, A>::register_type;
+            register_type acc = { 0x7FFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull };
+            return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(rhs), register_type(lhs)), acc)), 63)));
        }
 
         /******
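The new lt kernels replace scalar comparisons with saturating arithmetic: for signed lanes, vqsubq_s64 cannot wrap, so the sign of lhs - rhs is the comparison result, and vshrq_n_s64(..., 63) broadcasts that sign bit into an all-ones or all-zeros mask. For unsigned lanes, rhs saturating-minus lhs is nonzero exactly when lhs < rhs, and saturating-adding 2^63 - 1 turns "nonzero" into "bit 63 set". A scalar sketch of both paths (illustrative; __int128 is a GCC/Clang extension used only to model saturation):

    #include <cstdint>
    #include <limits>

    // Signed path: the saturated difference has the sign of the true
    // difference; an arithmetic shift by 63 broadcasts it (vshrq_n_s64).
    int64_t lt_s64_model(int64_t lhs, int64_t rhs)
    {
        __int128 d = static_cast<__int128>(lhs) - rhs; // wide type only to model vqsubq_s64
        int64_t sat = d > std::numeric_limits<int64_t>::max()   ? std::numeric_limits<int64_t>::max()
                      : d < std::numeric_limits<int64_t>::min() ? std::numeric_limits<int64_t>::min()
                                                                : static_cast<int64_t>(d);
        return sat >> 63; // all-ones iff lhs < rhs (arithmetic shift assumed)
    }

    // Unsigned path: diff != 0 iff lhs < rhs; adding 2^63 - 1 with
    // saturation sets bit 63 exactly in that case.
    uint64_t lt_u64_model(uint64_t lhs, uint64_t rhs)
    {
        uint64_t diff = rhs > lhs ? rhs - lhs : 0;                     // vqsubq_u64
        uint64_t sum = diff > 0x8000000000000000ull
                           ? ~0ull
                           : diff + 0x7FFFFFFFFFFFFFFFull;             // vqaddq_u64
        return static_cast<uint64_t>(static_cast<int64_t>(sum) >> 63); // broadcast bit 63
    }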
@@ -1012,12 +1028,24 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) });
+            return !(lhs > rhs);
         }
 
         /******
          * gt *
          ******/
+        namespace detail
+        {
+            XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
+            {
+                return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
+            }
+
+            XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
+            {
+                return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
+            }
+        }
 
         WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
         WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
@@ -1033,10 +1061,19 @@ namespace xsimd
             return dispatcher.apply(register_type(lhs), register_type(rhs));
         }
 
-        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) });
+            using register_type = typename batch<T, A>::register_type;
+            return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(register_type(rhs), register_type(lhs)), 63)));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using register_type = typename batch<T, A>::register_type;
+            register_type acc = { 0x7FFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull };
+            return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(lhs), register_type(rhs)), acc)), 63)));
         }
 
         /******
@@ -1060,7 +1097,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
+            return !(lhs < rhs);
         }
 
         /*******************
@@ -1212,16 +1249,6 @@ namespace xsimd
 
         namespace detail
         {
-            XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
-            {
-                return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
-            }
-
-            XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
-            {
-                return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
-            }
-
             XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
             {
                 return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
@@ -1314,7 +1341,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) };
+            return select(lhs > rhs, rhs, lhs);
         }
 
         /*******
@@ -1338,7 +1365,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) };
+            return select(lhs > rhs, lhs, rhs);
         }
 
         /*******
@@ -1678,14 +1705,21 @@ namespace xsimd
          * reduce_max *
          **************/
 
-        // Using common implementation because ARM doe snot provide intrinsics
+        // Using common implementation because ARM does not provide intrinsics
         // for this operation
 
         /**************
          * reduce_min *
          **************/
 
-        // Using common implementation because ARM doe snot provide intrinsics
+        // Using common implementation because ARM does not provide intrinsics
+        // for this operation
+
+        /**************
+         * reduce_mul *
+         **************/
+
+        // Using common implementation because ARM does not provide intrinsics
         // for this operation
 
         /**********
@@ -2280,6 +2314,55 @@ namespace xsimd
             return vshlq_s64(lhs, rhs);
         }
 
+        // immediate variant
+        template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshlq_n_u8(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshlq_n_s8(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshlq_n_u16(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshlq_n_s16(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshlq_n_u32(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshlq_n_s32(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshlq_n_u64(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshlq_n_s64(x, shift);
+        }
+
         /******************
          * bitwise_rshift *
          ******************/
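The immediate variants exist because the vshlq_n_* / vshrq_n_* intrinsics require the shift count to be a compile-time constant; carrying it as a template parameter keeps it constant all the way through dispatch. A hypothetical caller on an ARM target (illustrative sketch, not part of the package):

    #include <arm_neon.h> // compiles for ARM/NEON targets only
    #include <cstddef>

    // Shift every u32 lane left by a compile-time constant. The template
    // parameter is what lets vshlq_n_u32 see a literal immediate.
    template <size_t Shift>
    uint32x4_t lshift_lanes(uint32x4_t x)
    {
        static_assert(Shift < 32, "immediate must fit the lane width");
        return vshlq_n_u32(x, Shift);
    }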
@@ -2455,6 +2538,110 @@ namespace xsimd
             return vshlq_s32(lhs, vnegq_s32(rhs));
         }
 
+        // immediate variant
+        template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshrq_n_u8(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshrq_n_s8(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshrq_n_u16(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshrq_n_s16(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshrq_n_u32(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshrq_n_s32(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshrq_n_u64(x, shift);
+        }
+
+        template <size_t shift, class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return vshrq_n_s64(x, shift);
+        }
+
+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_f32(self, 0);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u8(val, 0);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s8(val, 0);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u16(val, 0);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s16(val, 0);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u32(val, 0);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s32(val, 0);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u64(val, 0);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE T first(batch<T, A> val, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s64(val, 0);
+        }
+
         // Overloads of bitwise shifts accepting two batches of uint64/int64 are not available with ARMv7
 
         /*******
@@ -2771,10 +2958,11 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& a, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
+            // Adding modulo to avoid warning.
             const detail::neon_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::rotate_left_u8<N>, wrap::rotate_left_s8<N>, wrap::rotate_left_u16<N>, wrap::rotate_left_s16<N>,
-                                wrap::rotate_left_u32<N>, wrap::rotate_left_s32<N>, wrap::rotate_left_u64<N>, wrap::rotate_left_s64<N>,
-                                wrap::rotate_left_f32<N>)
+                std::make_tuple(wrap::rotate_left_u8<N>, wrap::rotate_left_s8<N>, wrap::rotate_left_u16<N % 8>, wrap::rotate_left_s16<N % 8>,
+                                wrap::rotate_left_u32<N % 4>, wrap::rotate_left_s32<N % 4>, wrap::rotate_left_u64<N % 2>, wrap::rotate_left_s64<N % 2>,
+                                wrap::rotate_left_f32<N % 4>)
             };
             return dispatcher.apply(register_type(a), register_type(a));
         }
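The N % lanes change matters because building the dispatch tuple instantiates every wrapper, including the ones never called for the current element type, and each wrapper's immediate must be in range for its own lane count. A simplified model of the failure mode (illustrative names; static_assert stands in for the intrinsics' immediate-range diagnostics):

    #include <cstddef>

    // Stand-ins for two wrappers with different lane counts.
    template <size_t N>
    int rotate_u8(int x) { static_assert(N < 16, "u8: 16 lanes"); return x; }

    template <size_t N>
    int rotate_u16(int x) { static_assert(N < 8, "u16: 8 lanes"); return x; }

    template <size_t N>
    void build_dispatch_table()
    {
        // Both wrappers are instantiated to build the table, even though a
        // caller only ever invokes one of them. Without the modulo, N == 10
        // (valid for 16 u8 lanes) would trip the u16 check.
        auto u8_entry = &rotate_u8<N>;
        auto u16_entry = &rotate_u16<N % 8>; // the fix from the hunk above
        (void)u8_entry;
        (void)u16_entry;
    }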
@@ -2799,6 +2987,147 @@ namespace xsimd
             self.store_aligned(data.data());
             return set(batch<T, A>(), A(), data[idx]...);
         }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
+                                                batch_constant<uint64_t, A, V0, V1>,
+                                                requires_arch<neon>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 0)
+            {
+                auto lo = vget_low_u64(self);
+                return vcombine_u64(lo, lo);
+            }
+            XSIMD_IF_CONSTEXPR(V0 == 1 && V1 == 1)
+            {
+                auto hi = vget_high_u64(self);
+                return vcombine_u64(hi, hi);
+            }
+            XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 1)
+            {
+                return self;
+            }
+            else
+            {
+                return vextq_u64(self, self, 1);
+            }
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
+                                               batch_constant<int64_t, A, V0, V1> mask,
+                                               requires_arch<neon>) noexcept
+        {
+            return vreinterpretq_s64_u64(swizzle(vreinterpretq_u64_s64(self), mask, A {}));
+        }
+
+        namespace detail
+        {
+            template <uint32_t Va, uint32_t Vb>
+            XSIMD_INLINE uint8x8_t make_mask()
+            {
+                uint8x8_t res = {
+                    static_cast<uint8_t>((Va % 2) * 4 + 0),
+                    static_cast<uint8_t>((Va % 2) * 4 + 1),
+                    static_cast<uint8_t>((Va % 2) * 4 + 2),
+                    static_cast<uint8_t>((Va % 2) * 4 + 3),
+                    static_cast<uint8_t>((Vb % 2) * 4 + 0),
+                    static_cast<uint8_t>((Vb % 2) * 4 + 1),
+                    static_cast<uint8_t>((Vb % 2) * 4 + 2),
+                    static_cast<uint8_t>((Vb % 2) * 4 + 3),
+                };
+                return res;
+            }
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
+                                                batch_constant<uint32_t, A, V0, V1, V2, V3> mask,
+                                                requires_arch<neon>) noexcept
+        {
+            constexpr bool is_identity = detail::is_identity(mask);
+            constexpr bool is_dup_lo = detail::is_dup_lo(mask);
+            constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+
+            XSIMD_IF_CONSTEXPR(is_identity)
+            {
+                return self;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup_lo)
+            {
+                XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 1)
+                {
+                    return vreinterpretq_u32_u64(vdupq_lane_u64(vget_low_u64(vreinterpretq_u64_u32(self)), 0));
+                }
+                XSIMD_IF_CONSTEXPR(V0 == 1 && V1 == 0)
+                {
+                    return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_low_u32(self))), 0));
+                }
+                return vdupq_n_u32(vgetq_lane_u32(self, V0));
+            }
+            XSIMD_IF_CONSTEXPR(is_dup_hi)
+            {
+                XSIMD_IF_CONSTEXPR(V0 == 2 && V1 == 3)
+                {
+                    return vreinterpretq_u32_u64(vdupq_lane_u64(vget_high_u64(vreinterpretq_u64_u32(self)), 0));
+                }
+                XSIMD_IF_CONSTEXPR(V0 == 3 && V1 == 2)
+                {
+                    return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_high_u32(self))), 0));
+                }
+                return vdupq_n_u32(vgetq_lane_u32(self, V0));
+            }
+            XSIMD_IF_CONSTEXPR(V0 < 2 && V1 < 2 && V2 < 2 && V3 < 2)
+            {
+                uint8x8_t low = vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u32(self)));
+                uint8x8_t mask_lo = detail::make_mask<V0, V1>();
+                uint8x8_t mask_hi = detail::make_mask<V2, V3>();
+                uint8x8_t lo = vtbl1_u8(low, mask_lo);
+                uint8x8_t hi = vtbl1_u8(low, mask_hi);
+                return vreinterpretq_u32_u8(vcombine_u8(lo, hi));
+            }
+            XSIMD_IF_CONSTEXPR(V0 >= 2 && V1 >= 2 && V2 >= 2 && V3 >= 2)
+            {
+                uint8x8_t high = vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u32(self)));
+                uint8x8_t mask_lo = detail::make_mask<V0, V1>();
+                uint8x8_t mask_hi = detail::make_mask<V2, V3>();
+                uint8x8_t lo = vtbl1_u8(high, mask_lo);
+                uint8x8_t hi = vtbl1_u8(high, mask_hi);
+                return vreinterpretq_u32_u8(vcombine_u8(lo, hi));
+            }
+
+            uint8x8_t mask_lo = detail::make_mask<V0, V1>();
+            uint8x8_t mask_hi = detail::make_mask<V2, V3>();
+
+            uint8x8_t low = vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u32(self)));
+            uint8x8_t lol = vtbl1_u8(low, mask_lo);
+            uint8x8_t loh = vtbl1_u8(low, mask_hi);
+            uint32x4_t true_br = vreinterpretq_u32_u8(vcombine_u8(lol, loh));
+
+            uint8x8_t high = vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u32(self)));
+            uint8x8_t hil = vtbl1_u8(high, mask_lo);
+            uint8x8_t hih = vtbl1_u8(high, mask_hi);
+            uint32x4_t false_br = vreinterpretq_u32_u8(vcombine_u8(hil, hih));
+
+            batch_bool_constant<uint32_t, A, (V0 < 2), (V1 < 2), (V2 < 2), (V3 < 2)> blend_mask;
+            return select(blend_mask, batch<uint32_t, A>(true_br), batch<uint32_t, A>(false_br), A {});
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
+                                               batch_constant<int32_t, A, V0, V1, V2, V3> mask,
+                                               requires_arch<neon>) noexcept
+        {
+            return vreinterpretq_s32_u32(swizzle(vreinterpretq_u32_s32(self), mask, A {}));
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+                                             batch_constant<uint32_t, A, V0, V1, V2, V3> mask,
+                                             requires_arch<neon>) noexcept
+        {
+            return vreinterpretq_f32_u32(swizzle(batch<uint32_t, A>(vreinterpretq_u32_f32(self)), mask, A {}));
+        }
     }
 
 }
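The generic 32-bit swizzle above falls back on vtbl1_u8, a byte-granular table lookup, so make_mask converts 32-bit lane indices into byte indices: (V % 2) * 4 + k picks byte k of word V within a 64-bit half. A scalar model of one half (illustrative, not the package's code):

    #include <cstdint>
    #include <cstring>

    // Permute the two 32-bit words of a 64-bit value: pick word Va, then Vb.
    template <uint32_t Va, uint32_t Vb>
    uint64_t swizzle_half_model(uint64_t half)
    {
        uint8_t bytes[8], out[8];
        std::memcpy(bytes, &half, 8);
        // Same index arithmetic as make_mask: (V % 2) * 4 + k selects
        // byte k of word V; vtbl1_u8 gathers these bytes in one step.
        for (int k = 0; k < 4; ++k)
        {
            out[k] = bytes[(Va % 2) * 4 + k];
            out[4 + k] = bytes[(Vb % 2) * 4 + k];
        }
        uint64_t r;
        std::memcpy(&r, out, 8);
        return r;
    }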
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp
@@ -28,6 +28,13 @@ namespace xsimd
     {
         using namespace types;
 
+        // first
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<neon64>) noexcept
+        {
+            return vgetq_lane_f64(self, 0);
+        }
+
         /*******
          * all *
          *******/
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp
@@ -1335,6 +1335,19 @@ namespace xsimd
             return result;
         }
 
+        // first
+        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        XSIMD_INLINE T first(batch<T, A> const& arg, requires_arch<rvv>) noexcept
+        {
+            return detail::rvvmv_lane0(arg);
+        }
+
+        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        XSIMD_INLINE std::complex<T> first(batch<std::complex<T>, A> const& arg, requires_arch<rvv>) noexcept
+        {
+            return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
+        }
+
         // insert
         template <class A, class T, size_t I, detail::rvv_enable_all_t<T> = 0>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<rvv>) noexcept
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp
@@ -300,12 +300,29 @@ namespace xsimd
         return x << shift;
    }
 
+    template <size_t shift, class T>
+    XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
+    bitwise_lshift(T x) noexcept
+    {
+        constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+        static_assert(shift < bits, "Count must be less than the number of bits in T");
+        return x << shift;
+    }
+
     template <class T0, class T1>
     XSIMD_INLINE typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
     bitwise_rshift(T0 x, T1 shift) noexcept
     {
         return x >> shift;
     }
+    template <size_t shift, class T>
+    XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
+    bitwise_rshift(T x) noexcept
+    {
+        constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+        static_assert(shift < bits, "Count must be less than the number of bits in T");
+        return x >> shift;
+    }
 
     template <class T>
     XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
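Hypothetical usage of the new compile-time scalar shifts (assuming the overloads above are reachable through the public xsimd header): the template form turns an out-of-range shift from undefined behavior at runtime into a compile-time error.

    #include <cstdint>
    #include <xsimd/xsimd.hpp> // assumed include path for the overloads above

    int demo()
    {
        uint32_t x = 0x0F0Fu;
        auto a = xsimd::bitwise_lshift<4>(x);  // OK: 4 < 32, compiles
        auto b = xsimd::bitwise_rshift<8>(x);  // OK: 8 < 32
        // xsimd::bitwise_lshift<32>(x);       // would not compile: static_assert fires
        return static_cast<int>(a + b);
    }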
@@ -447,16 +464,32 @@ namespace xsimd
     XSIMD_INLINE typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
     rotl(T0 x, T1 shift) noexcept
     {
-        constexpr auto N = std::numeric_limits<T0>::digits;
-        return (x << shift) | (x >> (N - shift));
+        constexpr auto bits = std::numeric_limits<T0>::digits + std::numeric_limits<T0>::is_signed;
+        return (x << shift) | (x >> (bits - shift));
+    }
+    template <size_t count, class T>
+    XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
+    rotl(T x) noexcept
+    {
+        constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+        static_assert(count < bits, "Count must be less than the number of bits in T");
+        return (x << count) | (x >> (bits - count));
     }
 
     template <class T0, class T1>
     XSIMD_INLINE typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
     rotr(T0 x, T1 shift) noexcept
     {
-        constexpr auto N = std::numeric_limits<T0>::digits;
-        return (x >> shift) | (x << (N - shift));
+        constexpr auto bits = std::numeric_limits<T0>::digits + std::numeric_limits<T0>::is_signed;
+        return (x >> shift) | (x << (bits - shift));
+    }
+    template <size_t count, class T>
+    XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type
+    rotr(T x) noexcept
+    {
+        constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+        static_assert(count < bits, "Count must be less than the number of bits in T");
+        return (x >> count) | (x << (bits - count));
     }
 
     template <class T>
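The digits + is_signed change fixes the rotation width for signed types: std::numeric_limits<T>::digits excludes the sign bit, so the old code rotated int32_t values across 31 bits instead of 32. A few static_asserts make the arithmetic concrete:

    #include <cstdint>
    #include <limits>

    static_assert(std::numeric_limits<int32_t>::digits == 31,
                  "digits excludes the sign bit for signed types");
    static_assert(std::numeric_limits<int32_t>::digits
                      + std::numeric_limits<int32_t>::is_signed == 32,
                  "adding is_signed restores the full bit width");
    static_assert(std::numeric_limits<uint32_t>::digits == 32,
                  "unchanged for unsigned types");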
@@ -510,7 +543,11 @@ namespace xsimd
     template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
     XSIMD_INLINE bool is_flint(const T& x) noexcept
     {
+#ifdef __FAST_MATH__
+        return (x - std::trunc(x)) == T(0);
+#else
         return std::isnan(x - x) ? false : (x - std::trunc(x)) == T(0);
+#endif
     }
 
     template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
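Context for the __FAST_MATH__ branch: the std::isnan(x - x) guard exists because x - x is NaN when x is NaN or infinite, rejecting non-finite inputs before the trunc comparison. Under -ffast-math, GCC and Clang define __FAST_MATH__ and may assume NaNs never occur, so the guard can be folded away regardless; the new branch simply drops it explicitly. A standalone model of the non-fast-math path (illustrative):

    #include <cmath>
    #include <limits>

    // Without fast-math: inf - inf and NaN - NaN are both NaN, so the
    // guard catches non-finite inputs before comparing against trunc.
    bool is_flint_model(double x)
    {
        return std::isnan(x - x) ? false : (x - std::trunc(x)) == 0.0;
    }

    // is_flint_model(2.0) -> true   (integral value)
    // is_flint_model(2.5) -> false
    // is_flint_model(std::numeric_limits<double>::infinity()) -> false (guard)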