sequenzo-0.1.17-cp39-cp39-win_amd64.whl → sequenzo-0.1.18-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic.

Files changed (101)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +156 -156
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cp39-win_amd64.pyd +0 -0
  5. sequenzo/clustering/clustering_c_code.cp39-win_amd64.pyd +0 -0
  6. sequenzo/clustering/hierarchical_clustering.py +202 -8
  7. sequenzo/define_sequence_data.py +34 -2
  8. sequenzo/dissimilarity_measures/c_code.cp39-win_amd64.pyd +0 -0
  9. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  10. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  12. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  13. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  14. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  58. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  59. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +6 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +54 -2
  61. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +8 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +11 -4
  63. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +18 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +8 -14
  65. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +216 -173
  66. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +6 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +1 -1
  68. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +7 -4
  69. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +6 -2
  70. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +32 -18
  71. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +21 -24
  72. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +69 -9
  73. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +156 -156
  74. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cp39-win_amd64.pyd +0 -0
  75. sequenzo/dissimilarity_measures/utils/seqconc.c +156 -156
  76. sequenzo/dissimilarity_measures/utils/seqconc.cp39-win_amd64.pyd +0 -0
  77. sequenzo/dissimilarity_measures/utils/seqdss.c +156 -156
  78. sequenzo/dissimilarity_measures/utils/seqdss.cp39-win_amd64.pyd +0 -0
  79. sequenzo/dissimilarity_measures/utils/seqdur.c +156 -156
  80. sequenzo/dissimilarity_measures/utils/seqdur.cp39-win_amd64.pyd +0 -0
  81. sequenzo/dissimilarity_measures/utils/seqlength.c +156 -156
  82. sequenzo/dissimilarity_measures/utils/seqlength.cp39-win_amd64.pyd +0 -0
  83. sequenzo/sequence_characteristics/__init__.py +4 -0
  84. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  85. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  86. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  87. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  88. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  89. sequenzo/sequence_characteristics/turbulence.py +47 -67
  90. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  91. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  92. sequenzo/visualization/plot_sequence_index.py +58 -35
  93. sequenzo/visualization/plot_state_distribution.py +57 -36
  94. sequenzo/with_event_history_analysis/__init__.py +35 -0
  95. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  96. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  97. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  98. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +101 -94
  99. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  100. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  101. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
@@ -3,6 +3,7 @@
  * Martin Renou *
  * Copyright (c) QuantStack *
  * Copyright (c) Serge Guelton *
+ * Copyright (c) Marco Barbone *
  * *
  * Distributed under the terms of the BSD 3-Clause License. *
  * *
@@ -20,7 +21,6 @@
 
 namespace xsimd
 {
-
     namespace kernel
     {
         using namespace types;
@@ -925,12 +925,12 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
         {
-            return _mm256_max_ps(self, other);
+            return _mm256_max_ps(other, self);
         }
         template <class A>
         XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
         {
-            return _mm256_max_pd(self, other);
+            return _mm256_max_pd(other, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
@@ -942,12 +942,12 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
         {
-            return _mm256_min_ps(self, other);
+            return _mm256_min_ps(other, self);
         }
         template <class A>
         XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
         {
-            return _mm256_min_pd(self, other);
+            return _mm256_min_pd(other, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
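The operand swap in the four kernels above is not cosmetic: per the documented behaviour of MAXPS/MINPS, `_mm256_max_ps(a, b)` and friends return the second operand whenever the lanes compare unordered (NaN) or both are zero, so reversing the arguments changes which input wins in those cases. A minimal scalar sketch of one lane, assuming the documented intrinsic semantics (illustration only, not library code):

    // One lane of _mm256_max_ps(a, b): b is returned on ties, signed zeros
    // and NaNs, because the hardware effectively computes "a > b ? a : b".
    float maxps_lane(float a, float b)
    {
        return a > b ? a : b;
    }
    // Old code: max(self, other) -> maxps(self, other): a NaN input yields `other`.
    // New code: max(self, other) -> maxps(other, self): a NaN input yields `self`.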
@@ -1046,7 +1046,7 @@ namespace xsimd
         }
 
         // reduce_add
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
         {
             typename batch<T, sse4_2>::register_type low, high;
@@ -1077,6 +1077,16 @@ namespace xsimd
             return reduce_min(batch<T, sse4_2>(low));
         }
 
+        // reduce_mul
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            typename batch<T, sse4_2>::register_type low, high;
+            detail::split_avx(self, low, high);
+            batch<T, sse4_2> blow(low), bhigh(high);
+            return reduce_mul(blow * bhigh);
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
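The new `reduce_mul` kernel follows the same split-and-recurse pattern as the existing `reduce_add`/`reduce_min` above: the 256-bit batch is split into its two 128-bit halves, the halves are multiplied lane-wise, and the half-width product is reduced by the SSE kernel. A scalar sketch of that strategy for an 8-lane float batch (illustration only, not the library API):

    #include <array>

    // Model an 8-lane AVX float batch as a plain array and reduce it the way
    // the kernel above does: multiply the two halves, then reduce 4 lanes.
    float reduce_mul_model(const std::array<float, 8>& v)
    {
        float half[4];
        for (int i = 0; i < 4; ++i)
            half[i] = v[i] * v[i + 4];                    // blow * bhigh
        return half[0] * half[1] * half[2] * half[3];     // SSE-level reduction
    }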
@@ -1418,23 +1428,19 @@ namespace xsimd
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
-            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
-            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
-
-            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
-            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+            // Duplicate lanes separately
+            // 1) duplicate low and high lanes
+            __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+            __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
 
             // normalize mask
             batch<uint32_t, A> half_mask = mask % 4;
 
             // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask);
-            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask);
+            __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
+            __m256 r1 = _mm256_permutevar_ps(hi, half_mask);
 
-            // mask to choose the right lane
             batch_bool<uint32_t, A> blend_mask = mask >= 4;
-
-            // blend the two permutes
             return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
         }
 
@@ -1442,18 +1448,15 @@ namespace xsimd
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
-            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
-            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
-
-            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
-            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
 
             // normalize mask
             batch<uint64_t, A> half_mask = -(mask & 1);
 
             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask);
-            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask);
+            __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
+            __m256d r1 = _mm256_permutevar_pd(hi, half_mask);
 
             // mask to choose the right lane
             batch_bool<uint64_t, A> blend_mask = mask >= 2;
@@ -1479,53 +1482,67 @@ namespace xsimd
 
         // swizzle (constant mask)
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
-            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
-
-            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
-            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+            constexpr bool is_identity = detail::is_identity(mask);
+            constexpr bool is_dup_low = detail::is_dup_lo(mask);
+            constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+            constexpr bool is_dup = is_dup_low || is_dup_hi;
+            XSIMD_IF_CONSTEXPR(is_identity)
+            {
+                return self;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup)
+            {
+                constexpr auto control = is_dup_low ? 0x00 : 0x11;
+                constexpr auto is_dup_identity = is_dup_low ? detail::is_identity<uint32_t, V0, V1, V2, V3>() : detail::is_identity<int64_t, V4 - 4, V5 - 4, V6 - 4, V7 - 4>();
+                auto split = _mm256_permute2f128_ps(self, self, control);
+                XSIMD_IF_CONSTEXPR(!is_dup_identity)
+                {
+                    constexpr auto shuffle_mask = is_dup_low ? detail::mod_shuffle(V0, V1, V2, V3) : detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4);
+                    split = _mm256_permute_ps(split, shuffle_mask);
+                }
+                return split;
+            }
+            // Duplicate lanes separately
+            // 1) duplicate low and high lanes
+            __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+            __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
 
-            // normalize mask
-            batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+            // 2) build lane-local index vector (each element = source_index & 3)
+            constexpr batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
 
-            // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
-            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());
+            __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
+            __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane
 
-            // mask to choose the right lane
-            batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+            constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};
 
-            // blend the two permutes
-            constexpr auto mask = blend_mask.mask();
-            return _mm256_blend_ps(r0, r1, mask);
+            return _mm256_blend_ps(r0, r1, lane_mask.mask());
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
         {
+            // cannot use detail::mod_shuffle as the mod and shift are different in this case
+            constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                return _mm256_permute_pd(self, imm);
+            }
             // duplicate low and high part of input
-            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
-            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
-
-            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
-            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
-
-            // normalize mask
-            batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
 
             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
-            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());
+            __m256d r0 = _mm256_permute_pd(lo, imm);
+            __m256d r1 = _mm256_permute_pd(hi, imm);
 
             // mask to choose the right lane
-            batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
 
             // blend the two permutes
-            constexpr auto mask = blend_mask.mask();
-            return _mm256_blend_pd(r0, r1, mask);
+            return _mm256_blend_pd(r0, r1, blend_mask.mask());
         }
         template <class A,
                   typename T,
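All of the swizzle rewrites in this hunk implement the same contract: element i of the result is element mask[i] of the input, and the new constant-mask fast paths (identity, lane duplication, non-cross-lane permutes) only change how that gather is realised in registers. A scalar sketch of the contract, assuming in-range indices (illustration only):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Reference semantics of an 8-lane float swizzle: gather input elements by index.
    std::array<float, 8> swizzle_model(const std::array<float, 8>& self,
                                       const std::array<std::uint32_t, 8>& mask)
    {
        std::array<float, 8> r{};
        for (std::size_t i = 0; i < 8; ++i)
            r[i] = self[mask[i]]; // fast paths above only change *how* this is computed
        return r;
    }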
@@ -1861,6 +1878,46 @@ namespace xsimd
             auto hi = _mm256_unpackhi_pd(self, other);
             return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
         }
+
+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm256_cvtsi256_si32(self));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, sse4_2> low = _mm256_castsi256_si128(self);
+                return first(low, sse4_2 {});
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
     }
 }
 
@@ -17,6 +17,8 @@
 
 #include "../types/xsimd_avx2_register.hpp"
 
+#include <limits>
+
 namespace xsimd
 {
 
@@ -172,6 +174,29 @@ namespace xsimd
             }
         }
 
+        template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(shift < bits, "Shift must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_slli_epi16(self, shift);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_slli_epi32(self, shift);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_slli_epi64(self, shift);
+            }
+            else
+            {
+                return bitwise_lshift<shift>(self, avx {});
+            }
+        }
+
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
         {
@@ -252,6 +277,65 @@ namespace xsimd
             }
         }
 
+        template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(shift < bits, "Shift amount must be less than the number of bits in T");
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> shift) & 0x00FF);
+                    __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
+                    __m256i res = _mm256_srai_epi16(self, shift);
+                    return _mm256_or_si256(
+                        detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                           { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+                                           sign_mask, cmp_is_negative),
+                        _mm256_andnot_si256(sign_mask, res));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_srai_epi16(self, shift);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srai_epi32(self, shift);
+                }
+                else
+                {
+                    return bitwise_rshift<shift>(self, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    const __m256i byte_mask = _mm256_set1_epi16(0x00FF);
+                    __m256i u16 = _mm256_and_si256(self, byte_mask);
+                    __m256i r16 = _mm256_srli_epi16(u16, shift);
+                    return _mm256_and_si256(r16, byte_mask);
+                }
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_srli_epi16(self, shift);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srli_epi32(self, shift);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_srli_epi64(self, shift);
+                }
+                else
+                {
+                    return bitwise_rshift<shift>(self, avx {});
+                }
+            }
+        }
+
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
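The `sizeof(T) == 1` branches above exist because AVX2 has no 8-bit shift instructions: the signed path emulates a per-byte arithmetic shift with a 16-bit `srai` plus a mask that re-injects the sign bits of negative bytes, and the unsigned path emulates a logical shift with a 16-bit `srli` plus byte masking. What each byte must end up as, stated as a scalar reference and assuming the usual sign-propagating behaviour of right shift on negative values (guaranteed since C++20):

    #include <cstdint>

    // Per-byte behaviour the vector emulation has to reproduce.
    std::int8_t sra8_model(std::int8_t v, unsigned s)
    {
        return static_cast<std::int8_t>(v >> s);  // arithmetic: sign bit is replicated
    }
    std::uint8_t srl8_model(std::uint8_t v, unsigned s)
    {
        return static_cast<std::uint8_t>(v >> s); // logical: zeros shifted in
    }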
@@ -657,9 +741,35 @@ namespace xsimd
 
         // rotate_left
         template <size_t N, class A>
+        XSIMD_INLINE batch<uint8_t, A> rotate_left(batch<uint8_t, A> const& self, requires_arch<avx2>) noexcept
+        {
+            auto other = _mm256_permute2x128_si256(self, self, 0x1);
+            if (N < 16)
+            {
+                return _mm256_alignr_epi8(other, self, N);
+            }
+            else
+            {
+                return _mm256_alignr_epi8(self, other, N - 16);
+            }
+        }
+        template <size_t N, class A>
+        XSIMD_INLINE batch<int8_t, A> rotate_left(batch<int8_t, A> const& self, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int8_t>(rotate_left<N, A>(bitwise_cast<uint8_t>(self), avx2 {}));
+        }
+        template <size_t N, class A>
         XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
         {
-            return _mm256_alignr_epi8(self, self, N);
+            auto other = _mm256_permute2x128_si256(self, self, 0x1);
+            if (N < 8)
+            {
+                return _mm256_alignr_epi8(other, self, 2 * N);
+            }
+            else
+            {
+                return _mm256_alignr_epi8(self, other, 2 * (N - 8));
+            }
         }
         template <size_t N, class A>
         XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
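The old one-liner `_mm256_alignr_epi8(self, self, N)` only rotated within each 128-bit lane, because the AVX2 `vpalignr` concatenates and shifts the two lanes independently. The rewrite first builds `other` with the lanes swapped (`_mm256_permute2x128_si256(..., 0x1)`) and then picks the concatenation order from N, so the whole 256-bit register rotates. What the new byte version computes, written out as a scalar reference (illustration only):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // rotate_left<N> over a 32-byte register, as implemented above:
    // result byte i comes from input byte (i + N) % 32.
    template <std::size_t N>
    std::array<std::uint8_t, 32> rotate_left_model(const std::array<std::uint8_t, 32>& x)
    {
        std::array<std::uint8_t, 32> r{};
        for (std::size_t i = 0; i < 32; ++i)
            r[i] = x[(i + N) % 32];
        return r;
    }

The uint16 overload is the same rotation counted in 16-bit elements, hence the doubled byte offsets (2 * N and 2 * (N - 8)).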
@@ -879,9 +989,8 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_ps(self, mask);
+            return swizzle(self, mask, avx {});
         }
-
         template <class A>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
         {
@@ -903,7 +1012,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_epi32(self, mask);
+            return swizzle(self, mask, avx {});
         }
         template <class A>
         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
@@ -915,20 +1024,33 @@ namespace xsimd
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_ps(self, mask.as_batch());
+            XSIMD_IF_CONSTEXPR(detail::is_all_different(mask) && !detail::is_identity(mask))
+            {
+                // The intrinsic does NOT allow to copy the same element of the source vector to more than one element of the destination vector.
+                // one-shot 8-lane permute
+                return _mm256_permutevar8x32_ps(self, mask.as_batch());
+            }
+            return swizzle(self, mask, avx {});
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
         {
-            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
-            return _mm256_permute4x64_pd(self, mask);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+                return _mm256_permute_pd(self, imm);
+            }
+            constexpr auto imm = detail::mod_shuffle(V0, V1, V2, V3);
+            // fallback to full 4-element permute
+            return _mm256_permute4x64_pd(self, imm);
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
         {
-            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+            constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
             return _mm256_permute4x64_epi64(self, mask);
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
@@ -429,18 +429,6 @@ namespace xsimd
             return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
         }
 
-        // rotate_left
-        template <size_t N, class A>
-        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx512bw>) noexcept
-        {
-            return _mm512_alignr_epi8(self, self, N);
-        }
-        template <size_t N, class A>
-        XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx512bw>) noexcept
-        {
-            return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), avx512bw {}));
-        }
-
         // sadd
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
@@ -496,115 +484,25 @@ namespace xsimd
         }
 
         // slide_left
-        namespace detail
-        {
-            template <size_t... Is>
-            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is == 0 ? 8 : Is - 1)... };
-            }
-
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is >= N ? Is - N : 0)... };
-            }
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is >= N ? 0xFFFF : 0x0000)... };
-            }
-        }
-
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) == 2 && (N < 64)>::type>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            constexpr unsigned BitCount = N * 8;
-            if (BitCount == 0)
-            {
-                return x;
-            }
-            if (BitCount >= 512)
-            {
-                return batch<T, A>(T(0));
-            }
-            batch<T, A> xx;
-            if (N & 1)
-            {
-                alignas(A::alignment()) uint64_t buffer[8];
-                _mm512_store_epi64(&buffer[0], x);
-                for (int i = 7; i > 0; --i)
-                    buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56);
-                buffer[0] = buffer[0] << 8;
-                xx = _mm512_load_epi64(&buffer[0]);
+            static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency.");
 
-                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
-                __m512i xl = _mm512_slli_epi64(x, 8);
-                __m512i xr = _mm512_srli_epi64(x, 56);
-                xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
-                xx = _mm512_or_si512(xr, xl);
-                if (N == 1)
-                    return xx;
-            }
-            else
-            {
-                xx = x;
-            }
-            alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+            __mmask32 mask = 0xFFFFFFFFu << ((N / 2) & 31);
+            auto slide_pattern = make_batch_constant<uint16_t, detail::make_slide_left_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x);
         }
 
         // slide_right
-        namespace detail
-        {
-            template <size_t... Is>
-            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is + 1)... };
-            }
-
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is < (32 - N) ? Is + N : 0)... };
-            }
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
-            }
-        }
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) == 2 && (N < 64)>::type>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            constexpr unsigned BitCount = N * 8;
-            if (BitCount == 0)
-            {
-                return x;
-            }
-            if (BitCount >= 512)
-            {
-                return batch<T, A>(T(0));
-            }
-            batch<T, A> xx;
-            if (N & 1)
-            {
-                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
-                __m512i xr = _mm512_srli_epi64(x, 8);
-                __m512i xl = _mm512_slli_epi64(x, 56);
-                xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
-                xx = _mm512_or_si512(xr, xl);
-                if (N == 1)
-                    return xx;
-            }
-            else
-            {
-                xx = x;
-            }
-            alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+            static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency.");
+
+            __mmask32 mask = 0xFFFFFFFFu >> ((N / 2) & 31);
+            auto slide_pattern = make_batch_constant<uint16_t, detail::make_slide_right_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x);
         }
 
         // ssub
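The rewritten `slide_left`/`slide_right` kernels drop the scalar stack round-trip of the old code: a single masked 16-bit cross-register permute (`_mm512_maskz_permutexvar_epi16`) both moves the elements and zeroes the vacated ones. The overloads are now constrained to byte counts N with N % 4 == 2 (the remaining values are presumably served by other specializations, per the static_assert message pointing at the AVX512F path). The byte-level behaviour both the old and new versions implement, as a scalar reference (illustration only):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // slide_left<N> on a 64-byte AVX-512 register: bytes move towards higher
    // indices and the vacated low N bytes become zero; slide_right mirrors it.
    template <std::size_t N>
    std::array<std::uint8_t, 64> slide_left_model(const std::array<std::uint8_t, 64>& x)
    {
        std::array<std::uint8_t, 64> r{}; // zero-filled
        for (std::size_t i = N; i < 64; ++i)
            r[i] = x[i - N];
        return r;
    }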