sequenzo-0.1.17-cp39-cp39-macosx_10_9_universal2.whl → sequenzo-0.1.18-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (86)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +157 -157
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
  5. sequenzo/clustering/hierarchical_clustering.py +202 -8
  6. sequenzo/define_sequence_data.py +34 -2
  7. sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
  8. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  9. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  10. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  12. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  13. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  14. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  58. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +157 -157
  59. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
  60. sequenzo/dissimilarity_measures/utils/seqconc.c +157 -157
  61. sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
  62. sequenzo/dissimilarity_measures/utils/seqdss.c +157 -157
  63. sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
  64. sequenzo/dissimilarity_measures/utils/seqdur.c +157 -157
  65. sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
  66. sequenzo/dissimilarity_measures/utils/seqlength.c +157 -157
  67. sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
  68. sequenzo/sequence_characteristics/__init__.py +4 -0
  69. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  70. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  71. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  72. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  73. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  74. sequenzo/sequence_characteristics/turbulence.py +47 -67
  75. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  76. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  77. sequenzo/visualization/plot_sequence_index.py +58 -35
  78. sequenzo/visualization/plot_state_distribution.py +57 -36
  79. sequenzo/with_event_history_analysis/__init__.py +35 -0
  80. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  81. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  82. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  83. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +86 -79
  84. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  85. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  86. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
@@ -292,6 +292,36 @@ namespace xsimd
                 return {};
             }
         }
+        template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(shift < bits, "Count must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(shift == 0)
+            {
+                return self;
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                // 8-bit left shift via 16-bit shift + mask
+                __m128i shifted = _mm_slli_epi16(self, static_cast<int>(shift));
+                __m128i mask = _mm_set1_epi8(static_cast<char>(0xFF << shift));
+                return _mm_and_si128(shifted, mask);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_slli_epi16(self, static_cast<int>(shift));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_slli_epi32(self, static_cast<int>(shift));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_slli_epi64(self, static_cast<int>(shift));
+            }
+            return bitwise_lshift<shift>(self, common {});
+        }

         // bitwise_not
         template <class A>
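The interesting branch above is sizeof(T) == 1: SSE2 has no byte-granularity shift, so the code shifts 16-bit lanes and then clears the low `shift` bits of every byte, which is exactly where bits from the neighbouring byte leak in. A minimal standalone sketch of that trick (the helper name slli_epi8 and the test data are mine, not part of the diff):

    #include <emmintrin.h> // SSE2
    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper mirroring the sizeof(T) == 1 branch: shift 16-bit
    // lanes, then mask off the low Shift bits of each byte, which received
    // bits from the byte below.
    template <int Shift>
    __m128i slli_epi8(__m128i v)
    {
        __m128i shifted = _mm_slli_epi16(v, Shift);
        __m128i mask = _mm_set1_epi8(static_cast<char>(0xFF << Shift));
        return _mm_and_si128(shifted, mask);
    }

    int main()
    {
        alignas(16) uint8_t in[16], out[16];
        for (int i = 0; i < 16; ++i)
            in[i] = static_cast<uint8_t>(i * 17); // 0x00, 0x11, ..., 0xFF
        __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
        _mm_store_si128(reinterpret_cast<__m128i*>(out), slli_epi8<3>(v));
        for (int i = 0; i < 16; ++i) // each lane matches uint8_t(in[i] << 3)
            std::printf("0x%02X << 3 = 0x%02X\n", static_cast<unsigned>(in[i]), static_cast<unsigned>(out[i]));
        return 0;
    }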
@@ -420,6 +450,63 @@ namespace xsimd
                 }
             }
         }
+        template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(shift < bits,
+                          "Shift must be less than the number of value bits in the type");
+
+            XSIMD_IF_CONSTEXPR(shift == 0)
+            {
+                return self;
+            }
+
+            XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // 8-bit arithmetic right shift via 16-bit shift + sign-extension handling.
+                    __m128i shifted = _mm_srai_epi16(self, static_cast<int>(shift));
+                    __m128i sign_mask = _mm_set1_epi16(static_cast<short>(0xFF00 >> shift));
+                    __m128i cmp_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
+                    return _mm_or_si128(_mm_and_si128(sign_mask, cmp_negative),
+                                        _mm_andnot_si128(sign_mask, shifted));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_srai_epi16(self, static_cast<int>(shift));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_srai_epi32(self, static_cast<int>(shift));
+                }
+                // No 64-bit arithmetic right shift in SSE2; fall back
+                return bitwise_rshift<shift>(self, common {});
+            }
+            else // unsigned / logical right shift
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // Emulate byte-wise logical right shift using 16-bit shifts + per-byte mask.
+                    __m128i s16 = _mm_srli_epi16(self, static_cast<int>(shift));
+                    __m128i mask = _mm_set1_epi8(static_cast<char>(0xFFu >> shift));
+                    return _mm_and_si128(s16, mask);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_srli_epi16(self, static_cast<int>(shift));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_srli_epi32(self, static_cast<int>(shift));
+                }
+                else // sizeof(T) == 8
+                {
+                    return _mm_srli_epi64(self, static_cast<int>(shift));
+                }
+            }
+        }

         // bitwise_xor
         template <class A>
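For signed bytes the 16-bit trick needs a second repair: _mm_srai_epi16 sign-extends each 16-bit lane from its high byte, so the odd (high) bytes come out right, but the top `shift` bits of each even (low) byte are filled from its neighbour instead of from its own sign. The sketch below rebuilds those bits from a per-byte sign mask obtained with _mm_cmpgt_epi8; the helper name srai_epi8 and the `(0xFF00 >> Shift) & 0xFF` correction mask are my own choices for a self-contained demo, not a copy of the hunk:

    #include <emmintrin.h> // SSE2
    #include <cstdint>
    #include <cstdio>

    template <int Shift>
    __m128i srai_epi8(__m128i v)
    {
        __m128i shifted   = _mm_srai_epi16(v, Shift);                  // high bytes already correct
        __m128i sign_mask = _mm_set1_epi16((0xFF00 >> Shift) & 0xFF);  // top Shift bits of each low byte
        __m128i negative  = _mm_cmpgt_epi8(_mm_setzero_si128(), v);    // 0xFF where the byte is negative
        return _mm_or_si128(_mm_and_si128(sign_mask, negative),
                            _mm_andnot_si128(sign_mask, shifted));
    }

    int main()
    {
        alignas(16) int8_t in[16], out[16];
        for (int i = 0; i < 16; ++i)
            in[i] = static_cast<int8_t>(i * 17 - 128); // -128 .. 127
        __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
        _mm_store_si128(reinterpret_cast<__m128i*>(out), srai_epi8<3>(v));
        for (int i = 0; i < 16; ++i) // scalar check (arithmetic shift on mainstream compilers)
            std::printf("%4d >> 3 = %4d (scalar %4d)\n", in[i], out[i], static_cast<int8_t>(in[i] >> 3));
        return 0;
    }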
@@ -673,6 +760,53 @@ namespace xsimd
             return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
         }

+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+#if defined(__x86_64__)
+                return static_cast<T>(_mm_cvtsi128_si64(self));
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, self);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
         // from_mask
         template <class A>
         XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
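The new first() overloads avoid a store/reload round trip: _mm_cvtss_f32 and _mm_cvtsd_f64 read lane 0 of a float/double register directly, and _mm_cvtsi128_si32 plus masking recovers the narrower integer lanes (with _mm_cvtsi128_si64 or a 64-bit spill on 32-bit targets). A small standalone sketch with arbitrary test values:

    #include <emmintrin.h> // SSE2
    #include <cstdio>

    int main()
    {
        __m128  f = _mm_set_ps(4.f, 3.f, 2.f, 1.f);       // lane 0 holds 1.f
        __m128d d = _mm_set_pd(2.0, 1.0);                  // lane 0 holds 1.0
        __m128i i = _mm_set_epi32(40, 30, 20, 0x1234ABCD); // lane 0 holds 0x1234ABCD

        std::printf("float  lane 0: %g\n", _mm_cvtss_f32(f));
        std::printf("double lane 0: %g\n", _mm_cvtsd_f64(d));
        std::printf("8-bit  lane 0: 0x%02X\n", _mm_cvtsi128_si32(i) & 0xFF);   // 0xCD
        std::printf("16-bit lane 0: 0x%04X\n", _mm_cvtsi128_si32(i) & 0xFFFF); // 0xABCD
        std::printf("32-bit lane 0: 0x%08X\n", _mm_cvtsi128_si32(i));          // 0x1234ABCD
        return 0;
    }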
@@ -1090,7 +1224,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
         {
-            return _mm_max_ps(self, other);
+            return _mm_max_ps(other, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
@@ -1100,14 +1234,14 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
         {
-            return _mm_max_pd(self, other);
+            return _mm_max_pd(other, self);
         }

         // min
         template <class A>
         XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
         {
-            return _mm_min_ps(self, other);
+            return _mm_min_ps(other, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
@@ -1117,7 +1251,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
         {
-            return _mm_min_pd(self, other);
+            return _mm_min_pd(other, self);
         }

         // mul
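Swapping the operand order of _mm_max_ps/_mm_min_ps (and their _pd variants) is not cosmetic: these instructions return their second operand whenever the comparison is unordered (one input is NaN) or both inputs are zeros, so passing `self` second makes the result fall back to the first argument of xsimd's max/min in those cases. Reading that as the motivation is my own inference, not something stated in the diff; the snippet below just shows the observable difference:

    #include <xmmintrin.h> // SSE
    #include <cmath>
    #include <cstdio>

    int main()
    {
        __m128 self  = _mm_set1_ps(1.0f);
        __m128 other = _mm_set1_ps(std::nanf(""));

        float a = _mm_cvtss_f32(_mm_max_ps(self, other)); // NaN: second operand wins
        float b = _mm_cvtss_f32(_mm_max_ps(other, self)); // 1.0: second operand wins
        std::printf("old order -> %g, new order -> %g\n", a, b);
        return 0;
    }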
@@ -1243,7 +1377,7 @@ namespace xsimd
             }
             else
             {
-                return hadd(self, common {});
+                return reduce_add(self, common {});
             }
         }

@@ -1269,10 +1403,10 @@ namespace xsimd
             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
             batch<T, A> acc2 = max(acc1, step2);
             if (sizeof(T) == 2)
-                return acc2.get(0);
+                return first(acc2, A {});
             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
             batch<T, A> acc3 = max(acc2, step3);
-            return acc3.get(0);
+            return first(acc3, A {});
         }

         // reduce_min
@@ -1291,10 +1425,56 @@ namespace xsimd
             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
             batch<T, A> acc2 = min(acc1, step2);
             if (sizeof(T) == 2)
-                return acc2.get(0);
+                return first(acc2, A {});
             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
             batch<T, A> acc3 = min(acc2, step3);
-            return acc3.get(0);
+            return first(acc3, A {});
+        }
+
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+            return _mm_cvtss_f32(tmp1);
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+                tmp1 = tmp1 * self;
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+                tmp2 = tmp2 * tmp1;
+                return _mm_cvtsi128_si32(tmp2);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+                auto tmp2 = tmp1 * self;
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(tmp2);
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, tmp2);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                return reduce_mul(self, common {});
+            }
         }

         // rsqrt
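The float reduce_mul folds the register onto itself: _mm_movehl_ps brings lanes 2 and 3 down so one _mm_mul_ps forms two partial products, and a final _mm_mul_ss combines the remaining pair. A standalone sketch of that path (the helper name reduce_mul_ps is mine):

    #include <xmmintrin.h> // SSE
    #include <cstdio>

    static float reduce_mul_ps(__m128 v)
    {
        __m128 hi   = _mm_movehl_ps(v, v);                              // {v2, v3, v2, v3}
        __m128 fold = _mm_mul_ps(v, hi);                                // lane 0 = v0*v2, lane 1 = v1*v3
        __m128 last = _mm_mul_ss(fold, _mm_shuffle_ps(fold, fold, 1));  // (v0*v2) * (v1*v3)
        return _mm_cvtss_f32(last);
    }

    int main()
    {
        __m128 v = _mm_set_ps(5.f, 4.f, 3.f, 2.f);       // lanes {2, 3, 4, 5}
        std::printf("product = %g\n", reduce_mul_ps(v)); // 120
        return 0;
    }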
@@ -1641,22 +1821,78 @@ namespace xsimd
         }

         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
-        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
+        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
         {
-            // permute within each lane
+            constexpr bool is_identity = detail::is_identity(mask);
+            constexpr bool is_dup_lo = detail::is_dup_lo(mask);
+            constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+
+            XSIMD_IF_CONSTEXPR(is_identity)
+            {
+                return self;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup_lo)
+            {
+                // permute the low half
+                constexpr int imm = detail::mod_shuffle(V0, V1, V2, V3);
+                const auto lo = _mm_shufflelo_epi16(self, imm);
+                // broadcast that 64-bit low half into both halves
+                const auto lo_all = _mm_unpacklo_epi64(lo, lo);
+                return lo_all;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup_hi)
+            {
+                // permute the high half
+                constexpr int imm = detail::mod_shuffle(V4, V5, V6, V7);
+                const auto hi = _mm_shufflehi_epi16(self, imm);
+                // broadcast that 64-bit high half into both halves
+                const auto hi_all = _mm_unpackhi_epi64(hi, hi);
+                return hi_all;
+            }
+            // Only pick elements from the low lane
+            XSIMD_IF_CONSTEXPR((V0 < 4) && (V1 < 4) && (V2 < 4) && (V3 < 4) && (V4 < 4) && (V5 < 4) && (V6 < 4) && (V7 < 4))
+            {
+                // permute within each sub lane
+                constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+                constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
+                __m128i lol = _mm_shufflelo_epi16(self, mask_lo);
+                __m128i loh = _mm_shufflelo_epi16(self, mask_hi);
+
+                // generate temporary lanes
+                return _mm_unpacklo_epi64(lol, loh);
+            }
+            // Only pick elements from the high lane
+            XSIMD_IF_CONSTEXPR((V0 >= 4) && (V1 >= 4) && (V2 >= 4) && (V3 >= 4) && (V4 >= 4) && (V5 >= 4) && (V6 >= 4) && (V7 >= 4))
+            {
+                // permute within each sub lane
+                constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+                constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
+                __m128i hil = _mm_shufflehi_epi16(self, mask_lo);
+                __m128i hih = _mm_shufflehi_epi16(self, mask_hi);
+
+                // generate temporary lanes
+                return _mm_unpackhi_epi64(hil, hih);
+            }
+
+            // Generic case
+
+            // permute within each sub lane
             constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
             constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
-            __m128i lo = _mm_shufflelo_epi16(self, mask_lo);
-            __m128i hi = _mm_shufflehi_epi16(self, mask_hi);
+            __m128i lol = _mm_shufflelo_epi16(self, mask_lo);
+            __m128i loh = _mm_shufflelo_epi16(self, mask_hi);
+            __m128i hil = _mm_shufflehi_epi16(self, mask_lo);
+            __m128i hih = _mm_shufflehi_epi16(self, mask_hi);

-            __m128i lo_lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(lo), _MM_SHUFFLE2(0, 0)));
-            __m128i hi_hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hi), _mm_castsi128_pd(hi), _MM_SHUFFLE2(1, 1)));
+            // generate temporary lanes
+            __m128i lo = _mm_unpacklo_epi64(lol, loh);
+            __m128i hi = _mm_unpackhi_epi64(hil, hih);

             // mask to choose the right lane
             batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;

             // blend the two permutes
-            return select(blend_mask, batch<uint16_t, A>(lo_lo), batch<uint16_t, A>(hi_hi));
+            return select(blend_mask, batch<uint16_t, A>(lo), batch<uint16_t, A>(hi));
         }

         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
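The rewritten uint16_t swizzle now short-circuits several compile-time patterns (identity, duplicated low half, duplicated high half, all indices below 4, all indices at or above 4) before falling back to the generic shufflelo/shufflehi + unpack + blend path. The sketch below works the "all indices come from the low half" case by hand for the hypothetical index set {3, 2, 1, 0, 0, 1, 2, 3}:

    #include <emmintrin.h> // SSE2
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        alignas(16) uint16_t in[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
        __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));

        // Each _mm_shufflelo_epi16 builds one 64-bit half of the result from the
        // low half of the input; _mm_unpacklo_epi64 glues the two halves together.
        __m128i half0 = _mm_shufflelo_epi16(v, _MM_SHUFFLE(0, 1, 2, 3)); // low half -> {13, 12, 11, 10}
        __m128i half1 = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 2, 1, 0)); // low half -> {10, 11, 12, 13}
        __m128i out = _mm_unpacklo_epi64(half0, half1);

        alignas(16) uint16_t res[8];
        _mm_store_si128(reinterpret_cast<__m128i*>(res), out);
        for (int i = 0; i < 8; ++i)
            std::printf("%d ", res[i]); // 13 12 11 10 10 11 12 13
        std::printf("\n");
        return 0;
    }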
@@ -51,6 +51,15 @@ namespace xsimd
             return _mm_cvtss_f32(tmp1);
         }

+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse3>) noexcept
+        {
+            __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
+            return _mm_cvtss_f32(tmp2);
+        }
+
     }

 }
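The SSE3 overload replaces the final _mm_shuffle_ps step of the SSE2 version with _mm_movehdup_ps, which duplicates the odd lanes in a single instruction. A sketch of the same reduction outside xsimd (the helper name reduce_mul_sse3 is mine; build with -msse3 or better):

    #include <pmmintrin.h> // SSE3
    #include <cstdio>

    static float reduce_mul_sse3(__m128 v)
    {
        __m128 fold = _mm_mul_ps(v, _mm_movehl_ps(v, v)); // lane 0 = v0*v2, lane 1 = v1*v3
        __m128 dup  = _mm_movehdup_ps(fold);              // lane 0 = v1*v3
        return _mm_cvtss_f32(_mm_mul_ps(fold, dup));      // (v0*v2) * (v1*v3)
    }

    int main()
    {
        __m128 v = _mm_set_ps(5.f, 4.f, 3.f, 2.f);
        std::printf("product = %g\n", reduce_mul_sse3(v)); // 120
        return 0;
    }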
@@ -107,11 +107,22 @@ namespace xsimd

         // rotate_left
         template <size_t N, class A>
-        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<ssse3>) noexcept
+        XSIMD_INLINE batch<uint8_t, A> rotate_left(batch<uint8_t, A> const& self, requires_arch<ssse3>) noexcept
         {
             return _mm_alignr_epi8(self, self, N);
         }
         template <size_t N, class A>
+        XSIMD_INLINE batch<int8_t, A> rotate_left(batch<int8_t, A> const& self, requires_arch<ssse3>) noexcept
+        {
+            return bitwise_cast<int8_t>(rotate_left<N, A>(bitwise_cast<uint8_t>(self), ssse3 {}));
+        }
+
+        template <size_t N, class A>
+        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<ssse3>) noexcept
+        {
+            return _mm_alignr_epi8(self, self, 2 * N);
+        }
+        template <size_t N, class A>
         XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<ssse3>) noexcept
         {
             return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), ssse3 {}));
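All of the new rotate_left overloads lean on the same SSSE3 idiom: _mm_alignr_epi8(v, v, offset) concatenates the register with itself and takes a 16-byte window starting `offset` bytes in, so output byte i becomes input byte (i + offset) % 16; that is why the 16-bit overload passes a byte offset of 2 * N. A standalone sketch with an arbitrary offset of 3 (build with -mssse3 or better):

    #include <tmmintrin.h> // SSSE3
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        alignas(16) uint8_t in[16], out[16];
        for (int i = 0; i < 16; ++i)
            in[i] = static_cast<uint8_t>(i);
        __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
        __m128i r = _mm_alignr_epi8(v, v, 3); // byte i <- byte (i + 3) % 16
        _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
        for (int i = 0; i < 16; ++i)
            std::printf("%d ", out[i]); // 3 4 5 ... 15 0 1 2
        std::printf("\n");
        return 0;
    }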
@@ -949,6 +949,13 @@ namespace xsimd
             return svsel(index_predicate, broadcast<A, T>(val, sve {}), arg);
         }

+        // first
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sve>) noexcept
+        {
+            return self.data[0];
+        }
+
         // all
         template <class A, class T, detail::sve_enable_all_t<T> = 0>
         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept