sequenzo-0.1.17-cp39-cp39-macosx_10_9_universal2.whl → sequenzo-0.1.18-cp39-cp39-macosx_10_9_universal2.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.

Potentially problematic release: this version of sequenzo might be problematic.

Files changed (86)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +157 -157
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
  5. sequenzo/clustering/hierarchical_clustering.py +202 -8
  6. sequenzo/define_sequence_data.py +34 -2
  7. sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
  8. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  9. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  10. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  12. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  13. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  14. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  58. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +157 -157
  59. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
  60. sequenzo/dissimilarity_measures/utils/seqconc.c +157 -157
  61. sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
  62. sequenzo/dissimilarity_measures/utils/seqdss.c +157 -157
  63. sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
  64. sequenzo/dissimilarity_measures/utils/seqdur.c +157 -157
  65. sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
  66. sequenzo/dissimilarity_measures/utils/seqlength.c +157 -157
  67. sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
  68. sequenzo/sequence_characteristics/__init__.py +4 -0
  69. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  70. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  71. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  72. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  73. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  74. sequenzo/sequence_characteristics/turbulence.py +47 -67
  75. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  76. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  77. sequenzo/visualization/plot_sequence_index.py +58 -35
  78. sequenzo/visualization/plot_state_distribution.py +57 -36
  79. sequenzo/with_event_history_analysis/__init__.py +35 -0
  80. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  81. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  82. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  83. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +86 -79
  84. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  85. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  86. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -9,8 +9,8 @@
  * The full license is in the file LICENSE, distributed with this software. *
  ****************************************************************************/

- #ifndef XSIMD_AVX512_DQHPP
- #define XSIMD_AVX512_D_HPP
+ #ifndef XSIMD_AVX512DQ_HPP
+ #define XSIMD_AVX512DQ_HPP

  #include "../types/xsimd_avx512dq_register.hpp"

@@ -47,12 +47,12 @@ namespace xsimd

  // bitwise_not
  template <class A>
- XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
  {
  return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
  }
  template <class A>
- XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
  {
  return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
  }
@@ -96,7 +96,7 @@ namespace xsimd
  // tmp1 = [a0..8, b0..8]
  // tmp2 = [a8..f, b8..f]
  #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
- batch<float, avx512f> res##I; \
+ batch<float, avx512dq> res##I; \
  { \
  auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
  auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
@@ -180,7 +180,7 @@ namespace xsimd

  // reduce_add
  template <class A>
- XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
  {
  __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
  __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
@@ -188,11 +188,43 @@ namespace xsimd
  return reduce_add(batch<float, avx2>(res1), avx2 {});
  }

+ // reduce_mul
+ template <class A>
+ XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
+ {
+ __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
+ __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
+ __m256 res1 = _mm256_mul_ps(tmp1, tmp2);
+ return reduce_mul(batch<float, avx2>(res1), avx2 {});
+ }
+
+ // swizzle constant mask
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+ uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask,
+ requires_arch<avx512dq>) noexcept
+ {
+ constexpr bool dup_lo = detail::is_dup_lo(mask);
+ constexpr bool dup_hi = detail::is_dup_hi(mask);
+
+ XSIMD_IF_CONSTEXPR(dup_lo || dup_hi)
+ {
+ const batch<float, avx2> half = _mm512_extractf32x8_ps(self, dup_lo ? 0 : 1);
+ constexpr typename std::conditional<dup_lo, batch_constant<uint32_t, avx2, V0 % 8, V1 % 8, V2 % 8, V3 % 8, V4 % 8, V5 % 8, V6 % 8, V7 % 8>,
+ batch_constant<uint32_t, avx2, V8 % 8, V9 % 8, V10 % 8, V11 % 8, V12 % 8, V13 % 8, V14 % 8, V15 % 8>>::type half_mask {};
+ auto permuted = swizzle(half, half_mask, avx2 {});
+ // merge the two slices into an AVX512F register:
+ return _mm512_broadcast_f32x8(permuted); // duplicates the 256-bit perm into both halves
+ }
+ return swizzle(self, mask, avx512f {});
+ }
+
  // convert
  namespace detail
  {
  template <class A>
- XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<avx512dq>) noexcept
  {
  return _mm512_cvtepi64_pd(self);
  }
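Note: the new avx512dq swizzle above takes a shortcut when the constant mask only draws from one 256-bit half. A minimal scalar sketch of that idea, with illustrative names (lanes, mask, low_perm) that are not part of xsimd:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of a 16-lane swizzle whose mask repeats and only references
    // lanes 0..7 (the "dup_lo" case): permute the low half, then duplicate it.
    int main()
    {
        std::array<float, 16> lanes {};
        for (int i = 0; i < 16; ++i) lanes[i] = float(i);

        const std::array<uint32_t, 16> mask { 3, 2, 1, 0, 7, 6, 5, 4,
                                              3, 2, 1, 0, 7, 6, 5, 4 }; // all < 8

        std::array<float, 8> low_perm {};
        for (int i = 0; i < 8; ++i) low_perm[i] = lanes[mask[i] % 8]; // permute the low half

        std::array<float, 16> result {};
        for (int i = 0; i < 16; ++i) result[i] = low_perm[i % 8];     // broadcast into both halves

        for (float v : result) std::printf("%g ", v);
        std::printf("\n");
    }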
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -32,6 +32,8 @@ namespace xsimd
  XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, Mask const& mask, requires_arch<common>) noexcept;
  template <class A, class T, size_t I>
  XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<common>) noexcept;
+ template <class A, class T, class ITy, ITy... Is>
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...>, requires_arch<common>) noexcept;
  template <class A>
  XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<common>) noexcept;
  template <class A>
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
@@ -562,6 +564,100 @@ namespace xsimd
  }
  }

+ // rotl
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_rolv_epi32(self, other);
+ }
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_rolv_epi64(self, other);
+ }
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return rotl(batch<T, avx2>(s), batch<T, avx2>(o), avx2 {}); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+ {
+ return rotl(self, batch<T, A>(other), A {});
+ }
+ template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+ static_assert(count < bits, "Count must be less than the number of bits in T");
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_rol_epi32(self, count);
+ }
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_rol_epi64(self, count);
+ }
+
+ return detail::fwd_to_avx([](__m256i s) noexcept
+ { return rotl<count>(batch<T, avx2>(s), avx2 {}); },
+ self);
+ }
+
+ // rotr
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) < 4)
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return rotr(batch<T, avx2>(s), batch<T, avx2>(o), avx2 {}); },
+ self, other);
+ }
+ XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_rorv_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_rorv_epi64(self, other);
+ }
+ }
+ return rotr(self, other, common {});
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+ {
+ return rotr(self, batch<T, A>(other), A {});
+ }
+
+ template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+ static_assert(count < bits, "Count must be less than the number of bits in T");
+ XSIMD_IF_CONSTEXPR(sizeof(T) < 4)
+ {
+ return detail::fwd_to_avx([](__m256i s) noexcept
+ { return rotr<count>(batch<T, avx2>(s), avx2 {}); },
+ self);
+ }
+ XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_ror_epi32(self, count);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_ror_epi64(self, count);
+ }
+ }
+ return rotr<count>(self, common {});
+ }
+
  // bitwise_xor
  template <class A>
  XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
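Note: rotl and rotr are new kernels in this release. For reference, a scalar sketch of the per-lane semantics that _mm512_rolv_epi32 and friends vectorize; rotl_scalar and rotr_scalar are illustrative helpers, not xsimd API:

    #include <cstdint>
    #include <limits>

    // Rotate left / right on an unsigned scalar: bits shifted out of one end
    // re-enter at the other.
    template <class T>
    constexpr T rotl_scalar(T x, unsigned n)
    {
        constexpr unsigned bits = std::numeric_limits<T>::digits;
        n %= bits;
        return n == 0 ? x : T((x << n) | (x >> (bits - n)));
    }

    template <class T>
    constexpr T rotr_scalar(T x, unsigned n)
    {
        constexpr unsigned bits = std::numeric_limits<T>::digits;
        n %= bits;
        return n == 0 ? x : T((x >> n) | (x << (bits - n)));
    }

    static_assert(rotl_scalar<uint8_t>(0x81u, 1) == 0x03, "the high bit wraps around to bit 0");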
@@ -578,7 +674,7 @@
  XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
  {
  using register_type = typename batch_bool<T, A>::register_type;
- return register_type(self.data | other.data);
+ return register_type(self.data ^ other.data);
  }

  template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
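Note: the one-character change above is a genuine bug fix. A small illustration in plain C++ (not xsimd code) of why OR is not a substitute for XOR on mask registers:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // Two 8-lane boolean masks, one bit per lane, as in an AVX-512 __mmask8.
        const uint8_t a = 0xCA; // 1100'1010
        const uint8_t b = 0xA6; // 1010'0110

        const uint8_t with_or  = uint8_t(a | b); // what the old code computed
        const uint8_t with_xor = uint8_t(a ^ b); // what bitwise_xor must compute

        // Lanes that are set in both masks differ: OR keeps them set, XOR clears them.
        assert(with_or  == 0xEE); // 1110'1110
        assert(with_xor == 0x6C); // 0110'1100
        return 0;
    }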
@@ -900,6 +996,18 @@
  {
  return _mm512_fmsub_pd(x, y, z);
  }
+ // fmas
+ template <class A>
+ XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fmaddsub_ps(x, y, z);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fmaddsub_pd(x, y, z);
+ }

  // from bool
  template <class A, class T>
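Note: the new fmas kernel maps to _mm512_fmaddsub_ps/_pd, which computes x*y - z in even lanes and x*y + z in odd lanes (per the documented behaviour of the intrinsic). A scalar sketch of that contract, using an illustrative helper name:

    #include <cstddef>

    // Scalar model of fmaddsub: even lanes get x*y - z, odd lanes get x*y + z.
    void fmaddsub_scalar(const float* x, const float* y, const float* z,
                         float* out, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i)
            out[i] = (i % 2 == 0) ? x[i] * y[i] - z[i]
                                  : x[i] * y[i] + z[i];
    }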
@@ -1312,12 +1420,12 @@
  template <class A>
  XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
  {
- return _mm512_max_ps(self, other);
+ return _mm512_max_ps(other, self);
  }
  template <class A>
  XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
  {
- return _mm512_max_pd(self, other);
+ return _mm512_max_pd(other, self);
  }
  template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
  XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
@@ -1362,12 +1470,12 @@
  template <class A>
  XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
  {
- return _mm512_min_ps(self, other);
+ return _mm512_min_ps(other, self);
  }
  template <class A>
  XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
  {
- return _mm512_min_pd(self, other);
+ return _mm512_min_pd(other, self);
  }
  template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
  XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
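Note: swapping the operands of _mm512_max_ps/_mm512_min_ps only changes behaviour for special values. The hardware instruction acts like "a > b ? a : b" per lane, so whenever a NaN is involved the second operand is returned; which batch sits in that position therefore decides whether NaN propagates. A scalar sketch of that asymmetry (hw_max_like is an illustrative helper):

    #include <cmath>
    #include <cstdio>

    // Scalar model of the hardware max: "a > b ? a : b" per lane.
    // With a NaN on either side the comparison is false, so b is returned.
    float hw_max_like(float a, float b) { return a > b ? a : b; }

    int main()
    {
        const float nan = std::nanf("");
        std::printf("%f\n", hw_max_like(1.0f, nan)); // prints nan: comparison is false, b wins
        std::printf("%f\n", hw_max_like(nan, 1.0f)); // prints 1.0: swapping operands changes the result
    }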
@@ -1544,6 +1652,37 @@
  return reduce_min(batch<T, avx2>(low));
  }

+ // reduce_mul
+ template <class A>
+ XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_reduce_mul_ps(rhs);
+ }
+ template <class A>
+ XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_reduce_mul_pd(rhs);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_reduce_mul_epi32(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_reduce_mul_epi64(self);
+ }
+ else
+ {
+ __m256i low, high;
+ detail::split_avx512(self, low, high);
+ batch<T, avx2> blow(low), bhigh(high);
+ return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
+ }
+ }
+

  // rsqrt
  XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
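Note: reduce_mul is the horizontal product of all lanes. A scalar statement of the contract (illustrative helper, not xsimd API):

    #include <numeric>
    #include <vector>

    // Horizontal product over all lanes; the AVX-512 kernel computes the same
    // value with _mm512_reduce_mul_* or by splitting into two AVX2 halves.
    double reduce_mul_scalar(const std::vector<double>& lanes)
    {
        return std::accumulate(lanes.begin(), lanes.end(), 1.0,
                               [](double acc, double v) { return acc * v; });
    }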
@@ -1726,8 +1865,8 @@
  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
  };
  #else
- return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
- v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
  #endif
  }

@@ -1743,8 +1882,8 @@
  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
  };
  #else
- return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
- v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
  #endif
  }

@@ -1767,10 +1906,10 @@
  v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
  };
  #else
- return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
- v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
- v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
- v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63);
+ return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+ v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+ v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
  #endif
  }
  template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
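Note: this and the neighbouring set_* hunks reverse the argument order because the _mm*_set_epi* intrinsics take operands from the highest lane down to lane 0, while the values here are supplied in memory order. A minimal SSE2 illustration of the same convention (assumes an x86 toolchain), shown before the last of these hunks:

    #include <cassert>
    #include <emmintrin.h> // SSE2

    int main()
    {
        // Arguments are listed high lane first: e3, e2, e1, e0.
        const __m128i v = _mm_set_epi32(3, 2, 1, 0);

        // Lane 0 is the *last* argument, so passing values supplied in memory
        // order (v0, v1, ..., vN) requires reversing them, as the patch does.
        assert(_mm_cvtsi128_si32(v) == 0);
        return 0;
    }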
@@ -1792,10 +1931,10 @@
  v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
  };
  #else
- return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
- v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
- v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
- v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63);
+ return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+ v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+ v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
  #endif
  }

@@ -1845,19 +1984,110 @@
  }

  // slide_left
+ namespace detail
+ {
+ template <size_t N>
+ struct make_slide_left_pattern
+ {
+ static constexpr size_t get(size_t i, size_t)
+ {
+ return i >= N ? i - N : 0;
+ }
+ };
+
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_left_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+ {
+ static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+ if (N == 0)
+ {
+ return x;
+ }
+ if (N >= 64)
+ {
+ return batch<T, A>(T(0));
+ }
+
+ __mmask16 mask = uint16_t(0xFFFFu << (N / 4));
+
+ if ((N & 15) == 0)
+ {
+ const uint8_t imm8 = uint8_t(0xe4 << (2 * (N / 16)));
+ return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+ }
+
+ auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_left_pattern<N / 4>, A>();
+ return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+ }
+ }
+
  template <size_t N, class A, class T>
- XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512f>) noexcept
  {
- static_assert(N == 0xDEAD, "not implemented yet");
- return {};
+ constexpr size_t NN = N & ~3;
+ if (N == NN || NN >= 64)
+ {
+ // Call fast path
+ return detail::slide_left_aligned_u32<NN>(x, A {});
+ }
+
+ __m512i xl = detail::slide_left_aligned_u32<NN, A, T>(_mm512_slli_epi32(x, 8 * (N - NN)), A {});
+ __m512i xr = detail::slide_left_aligned_u32<NN + 4, A, T>(_mm512_srli_epi32(x, 32 - 8 * (N - NN)), A {});
+ return _mm512_or_epi32(xl, xr);
  }

  // slide_right
+ namespace detail
+ {
+ template <size_t N>
+ struct make_slide_right_pattern
+ {
+ static constexpr size_t get(size_t i, size_t n)
+ {
+ return i < (n - N) ? i + N : 0;
+ }
+ };
+
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_right_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+ {
+ static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+ if (N == 0)
+ {
+ return x;
+ }
+ if (N >= 64)
+ {
+ return batch<T, A>(T(0));
+ }
+
+ __mmask16 mask = 0xFFFFu >> (N / 4);
+
+ if ((N & 15) == 0)
+ {
+ const uint8_t imm8 = 0xe4 >> (2 * (N / 16));
+ return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+ }
+
+ auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_right_pattern<N / 4>, A>();
+ return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+ }
+ }
  template <size_t N, class A, class T>
- XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512f>) noexcept
  {
- static_assert(N == 0xDEAD, "not implemented yet");
- return {};
+ constexpr size_t NN = N & ~3;
+ if (N == NN || NN >= 64)
+ {
+ // Call fast path
+ return detail::slide_right_aligned_u32<NN>(x, A {});
+ }
+
+ __m512i xl = detail::slide_right_aligned_u32<NN + 4, A, T>(_mm512_slli_epi32(x, 32 - 8 * (N - NN)), A {});
+ __m512i xr = detail::slide_right_aligned_u32<NN, A, T>(_mm512_srli_epi32(x, 8 * (N - NN)), A {});
+ return _mm512_or_epi32(xl, xr);
  }

  // sqrt
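Note: the slide_left/slide_right kernels above replace the old "not implemented yet" stubs. Their contract is a whole-register byte shift with zero fill, which the "i >= N ? i - N : 0" pattern plus the lane mask implements. A scalar sketch (illustrative helper, not xsimd API):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Scalar model of slide_left<N> over a 64-byte (512-bit) register:
    // byte i of the result is byte i - N of the input, or 0 if i < N.
    template <std::size_t N>
    std::array<uint8_t, 64> slide_left_scalar(const std::array<uint8_t, 64>& in)
    {
        std::array<uint8_t, 64> out {};
        for (std::size_t i = N; i < 64; ++i)
            out[i] = in[i - N];
        return out;
    }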
@@ -2019,16 +2249,53 @@
  return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
  }

- // swizzle (constant version)
- template <class A, uint32_t... Vs>
- XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+ uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask,
+ requires_arch<avx512f>) noexcept
  {
+ XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+ {
+ constexpr int imm0 = detail::mod_shuffle(V0, V1, V2, V3);
+ constexpr int imm1 = detail::mod_shuffle(V4, V5, V6, V7);
+ constexpr int imm2 = detail::mod_shuffle(V8, V9, V10, V11);
+ constexpr int imm3 = detail::mod_shuffle(V12, V13, V14, V15);
+ XSIMD_IF_CONSTEXPR(imm0 == imm1 && imm0 == imm2 && imm0 == imm3)
+ {
+ return _mm512_permute_ps(self, imm0);
+ }
+ }
  return swizzle(self, mask.as_batch(), avx512f {});
  }
-
- template <class A, uint64_t... Vs>
- XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, uint64_t V4, uint64_t V5, uint64_t V6, uint64_t V7>
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
+ batch_constant<uint64_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask,
+ requires_arch<avx512f>) noexcept
  {
+ XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+ {
+ constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3) | ((V4 & 1) << 4) | ((V5 & 1) << 5) | ((V6 & 1) << 6) | ((V7 & 1) << 7);
+ return _mm512_permute_pd(self, imm);
+ }
+ constexpr bool dup_lo = detail::is_dup_lo(mask);
+ constexpr bool dup_hi = detail::is_dup_hi(mask);
+ XSIMD_IF_CONSTEXPR(dup_lo || dup_hi)
+ {
+ const batch<double, avx2> half = _mm512_extractf64x4_pd(self, dup_lo ? 0 : 1);
+ constexpr typename std::conditional<dup_lo, batch_constant<uint64_t, avx2, V0 % 4, V1 % 4, V2 % 4, V3 % 4>,
+ batch_constant<uint64_t, avx2, V4 % 4, V5 % 4, V6 % 4, V7 % 4>>::type half_mask {};
+ return _mm512_broadcast_f64x4(swizzle(half, half_mask, avx2 {}));
+ }
+ // General case
  return swizzle(self, mask.as_batch(), avx512f {});
  }

@@ -2337,8 +2604,47 @@
  2));
  }

- }
+ // first
+ template <class A>
+ XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtss_f32(self);
+ }
+
+ template <class A>
+ XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtsd_f64(self);
+ }

+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFFFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ batch<T, sse4_2> low = _mm512_castsi512_si128(self);
+ return first(low, sse4_2 {});
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ }
  }

  #endif
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp
@@ -24,54 +24,26 @@ namespace xsimd
  {
  using namespace types;

- namespace detail
- {
- template <size_t N, size_t... Is>
- constexpr std::array<uint8_t, sizeof...(Is)> make_slide_left_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
- {
- return { (Is >= N ? Is - N : 0)... };
- }
-
- template <size_t N, size_t... Is>
- constexpr std::array<uint8_t, sizeof...(Is)> make_slide_right_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
- {
- return { (Is < (64 - N) ? Is + N : 0)... };
- }
- }
-
  // slide_left
- template <size_t N, class A, class T>
+ template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && (N < 64)>::type>
  XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
  {
- if (N == 0)
- {
- return x;
- }
- if (N >= 64)
- {
- return batch<T, A>(T(0));
- }
+ static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");

  __mmask64 mask = 0xFFFFFFFFFFFFFFFFull << (N & 63);
- alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_bytes_pattern<N>(::xsimd::detail::make_index_sequence<512 / 8>());
- return _mm512_maskz_permutexvar_epi8(mask, _mm512_load_epi32(slide_pattern.data()), x);
+ auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_left_pattern<N>, A>();
+ return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
  }

  // slide_right
- template <size_t N, class A, class T>
+ template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && (N < 64)>::type>
  XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
  {
- if (N == 0)
- {
- return x;
- }
- if (N >= 64)
- {
- return batch<T, A>(T(0));
- }
+ static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");
+
  __mmask64 mask = 0xFFFFFFFFFFFFFFFFull >> (N & 63);
- alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_bytes_pattern<N>(::xsimd::detail::make_index_sequence<512 / 8>());
- return _mm512_maskz_permutexvar_epi8(mask, _mm512_load_epi32(slide_pattern.data()), x);
+ auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_right_pattern<N>, A>();
+ return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
  }

  // swizzle (dynamic version)