sequenzo-0.1.17-cp39-cp39-win_amd64.whl → sequenzo-0.1.18-cp39-cp39-win_amd64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release.
This version of sequenzo might be problematic.
- sequenzo/__init__.py +25 -1
- sequenzo/big_data/clara/clara.py +1 -1
- sequenzo/big_data/clara/utils/get_weighted_diss.c +156 -156
- sequenzo/big_data/clara/utils/get_weighted_diss.cp39-win_amd64.pyd +0 -0
- sequenzo/clustering/clustering_c_code.cp39-win_amd64.pyd +0 -0
- sequenzo/clustering/hierarchical_clustering.py +202 -8
- sequenzo/define_sequence_data.py +34 -2
- sequenzo/dissimilarity_measures/c_code.cp39-win_amd64.pyd +0 -0
- sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
- sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
- sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
- sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
- sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
- sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
- sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +54 -2
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +8 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +11 -4
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +18 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +8 -14
- sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +216 -173
- sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +6 -0
- sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +1 -1
- sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +7 -4
- sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +6 -2
- sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +32 -18
- sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +21 -24
- sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +69 -9
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +156 -156
- sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cp39-win_amd64.pyd +0 -0
- sequenzo/dissimilarity_measures/utils/seqconc.c +156 -156
- sequenzo/dissimilarity_measures/utils/seqconc.cp39-win_amd64.pyd +0 -0
- sequenzo/dissimilarity_measures/utils/seqdss.c +156 -156
- sequenzo/dissimilarity_measures/utils/seqdss.cp39-win_amd64.pyd +0 -0
- sequenzo/dissimilarity_measures/utils/seqdur.c +156 -156
- sequenzo/dissimilarity_measures/utils/seqdur.cp39-win_amd64.pyd +0 -0
- sequenzo/dissimilarity_measures/utils/seqlength.c +156 -156
- sequenzo/dissimilarity_measures/utils/seqlength.cp39-win_amd64.pyd +0 -0
- sequenzo/sequence_characteristics/__init__.py +4 -0
- sequenzo/sequence_characteristics/complexity_index.py +17 -57
- sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
- sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
- sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
- sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
- sequenzo/sequence_characteristics/turbulence.py +47 -67
- sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
- sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
- sequenzo/visualization/plot_sequence_index.py +58 -35
- sequenzo/visualization/plot_state_distribution.py +57 -36
- sequenzo/with_event_history_analysis/__init__.py +35 -0
- sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
- sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +101 -94
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
- {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -9,8 +9,8 @@
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/
 
-#ifndef
-#define
+#ifndef XSIMD_AVX512DQ_HPP
+#define XSIMD_AVX512DQ_HPP
 
 #include "../types/xsimd_avx512dq_register.hpp"
 
@@ -47,12 +47,12 @@ namespace xsimd
 
 // bitwise_not
 template <class A>
-XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<
+XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
 {
 return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
 }
 template <class A>
-XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<
+XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
 {
 return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
 }
@@ -96,7 +96,7 @@ namespace xsimd
 // tmp1 = [a0..8, b0..8]
 // tmp2 = [a8..f, b8..f]
 #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
-batch<float,
+batch<float, avx512dq> res##I; \
 { \
 auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
 auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
@@ -180,7 +180,7 @@ namespace xsimd
 
 // reduce_add
 template <class A>
-XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<
+XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
 {
 __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
 __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
@@ -188,11 +188,43 @@ namespace xsimd
 return reduce_add(batch<float, avx2>(res1), avx2 {});
 }
 
+// reduce_mul
+template <class A>
+XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
+{
+__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
+__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
+__m256 res1 = _mm256_mul_ps(tmp1, tmp2);
+return reduce_mul(batch<float, avx2>(res1), avx2 {});
+}
+
+// swizzle constant mask
+template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
+XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask,
+requires_arch<avx512dq>) noexcept
+{
+constexpr bool dup_lo = detail::is_dup_lo(mask);
+constexpr bool dup_hi = detail::is_dup_hi(mask);
+
+XSIMD_IF_CONSTEXPR(dup_lo || dup_hi)
+{
+const batch<float, avx2> half = _mm512_extractf32x8_ps(self, dup_lo ? 0 : 1);
+constexpr typename std::conditional<dup_lo, batch_constant<uint32_t, avx2, V0 % 8, V1 % 8, V2 % 8, V3 % 8, V4 % 8, V5 % 8, V6 % 8, V7 % 8>,
+batch_constant<uint32_t, avx2, V8 % 8, V9 % 8, V10 % 8, V11 % 8, V12 % 8, V13 % 8, V14 % 8, V15 % 8>>::type half_mask {};
+auto permuted = swizzle(half, half_mask, avx2 {});
+// merge the two slices into an AVX512F register:
+return _mm512_broadcast_f32x8(permuted); // duplicates the 256-bit perm into both halves
+}
+return swizzle(self, mask, avx512f {});
+}
+
 // convert
 namespace detail
 {
 template <class A>
-XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const&
+XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<avx512dq>) noexcept
 {
 return _mm512_cvtepi64_pd(self);
 }
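The reduce_mul kernel added above folds the 512-bit register in half with _mm256_mul_ps and then recurses into the AVX2 implementation, so the end result is the product of all lanes. A scalar reference of that reduction, as a sketch for clarity (not part of the package):

#include <cstddef>

// Scalar reference: product of all 16 float lanes of a 512-bit batch.
float reduce_mul_reference(const float (&lanes)[16])
{
    float product = 1.0f;
    for (std::size_t i = 0; i < 16; ++i)
        product *= lanes[i];
    return product;
}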
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -32,6 +32,8 @@ namespace xsimd
 XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, Mask const& mask, requires_arch<common>) noexcept;
 template <class A, class T, size_t I>
 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<common>) noexcept;
+template <class A, class T, class ITy, ITy... Is>
+XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...>, requires_arch<common>) noexcept;
 template <class A>
 XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<common>) noexcept;
 template <class A>
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
@@ -562,6 +564,100 @@ namespace xsimd
 }
 }
 
+// rotl
+template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+{
+XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+{
+return _mm512_rolv_epi32(self, other);
+}
+XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+{
+return _mm512_rolv_epi64(self, other);
+}
+return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+{ return rotl(batch<T, avx2>(s), batch<T, avx2>(o), avx2 {}); },
+self, other);
+}
+template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+{
+return rotl(self, batch<T, A>(other), A {});
+}
+template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+{
+constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+static_assert(count < bits, "Count must be less than the number of bits in T");
+XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+{
+return _mm512_rol_epi32(self, count);
+}
+XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+{
+return _mm512_rol_epi64(self, count);
+}
+
+return detail::fwd_to_avx([](__m256i s) noexcept
+{ return rotl<count>(batch<T, avx2>(s), avx2 {}); },
+self);
+}
+
+// rotr
+template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+{
+XSIMD_IF_CONSTEXPR(sizeof(T) < 4)
+{
+return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+{ return rotr(batch<T, avx2>(s), batch<T, avx2>(o), avx2 {}); },
+self, other);
+}
+XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+{
+XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+{
+return _mm512_rorv_epi32(self, other);
+}
+else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+{
+return _mm512_rorv_epi64(self, other);
+}
+}
+return rotr(self, other, common {});
+}
+template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+{
+return rotr(self, batch<T, A>(other), A {});
+}
+
+template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+{
+constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+static_assert(count < bits, "Count must be less than the number of bits in T");
+XSIMD_IF_CONSTEXPR(sizeof(T) < 4)
+{
+return detail::fwd_to_avx([](__m256i s) noexcept
+{ return rotr<count>(batch<T, avx2>(s), avx2 {}); },
+self);
+}
+XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+{
+XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+{
+return _mm512_ror_epi32(self, count);
+}
+else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+{
+return _mm512_ror_epi64(self, count);
+}
+}
+return rotr<count>(self, common {});
+}
+
 // bitwise_xor
 template <class A>
 XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
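The rotl/rotr kernels above cover three call forms: a per-lane count batch, a runtime scalar count, and a compile-time count. A hedged usage sketch, assuming the kernels are exposed as xsimd::rotl / xsimd::rotr through the public API (the +64 lines in xsimd_api.hpp suggest new entry points) and that the default architecture is at least AVX-512F:

#include <xsimd/xsimd.hpp>
#include <cstdint>

void rotate_demo()
{
    xsimd::batch<uint32_t> x(0x80000001u);               // every lane = 0x80000001
    auto a = xsimd::rotl(x, 1);                          // runtime count: each lane -> 0x00000003
    auto b = xsimd::rotr(x, 1);                          // each lane -> 0xC0000000
    auto c = xsimd::rotl(x, xsimd::batch<uint32_t>(4));  // per-lane counts (assumed overload)
    (void)a; (void)b; (void)c;                           // silence unused-variable warnings
}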
@@ -578,7 +674,7 @@ namespace xsimd
 XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
 {
 using register_type = typename batch_bool<T, A>::register_type;
-return register_type(self.data
+return register_type(self.data ^ other.data);
 }
 
 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
@@ -900,6 +996,18 @@ namespace xsimd
 {
 return _mm512_fmsub_pd(x, y, z);
 }
+// fmas
+template <class A>
+XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+{
+return _mm512_fmaddsub_ps(x, y, z);
+}
+
+template <class A>
+XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+{
+return _mm512_fmaddsub_pd(x, y, z);
+}
 
 // from bool
 template <class A, class T>
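The new fmas kernels lower to _mm512_fmaddsub_ps / _mm512_fmaddsub_pd, which subtract z in even lanes and add it in odd lanes. A scalar reference of that alternating semantic (a sketch, following Intel's definition of fmaddsub):

#include <cstddef>

// Scalar reference for the alternating fused multiply add/sub used above.
void fmas_reference(const float* x, const float* y, const float* z, float* r, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        r[i] = (i % 2 == 0) ? x[i] * y[i] - z[i]   // even lanes: subtract z
                            : x[i] * y[i] + z[i];  // odd lanes: add z
}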
@@ -1312,12 +1420,12 @@ namespace xsimd
 template <class A>
 XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
 {
-return _mm512_max_ps(
+return _mm512_max_ps(other, self);
 }
 template <class A>
 XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
 {
-return _mm512_max_pd(
+return _mm512_max_pd(other, self);
 }
 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
 XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
@@ -1362,12 +1470,12 @@ namespace xsimd
 template <class A>
 XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
 {
-return _mm512_min_ps(
+return _mm512_min_ps(other, self);
 }
 template <class A>
 XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
 {
-return _mm512_min_pd(
+return _mm512_min_pd(other, self);
 }
 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
 XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
@@ -1544,6 +1652,37 @@ namespace xsimd
 return reduce_min(batch<T, avx2>(low));
 }
 
+// reduce_mul
+template <class A>
+XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+{
+return _mm512_reduce_mul_ps(rhs);
+}
+template <class A>
+XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
+{
+return _mm512_reduce_mul_pd(rhs);
+}
+template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+{
+XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+{
+return _mm512_reduce_mul_epi32(self);
+}
+else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+{
+return _mm512_reduce_mul_epi64(self);
+}
+else
+{
+__m256i low, high;
+detail::split_avx512(self, low, high);
+batch<T, avx2> blow(low), bhigh(high);
+return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
+}
+}
+
 // rsqrt
 template <class A>
 XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
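reduce_mul is the multiplicative counterpart of reduce_add: it returns the product of all lanes as a scalar, using the AVX-512 reduce intrinsics for 32/64-bit elements and a split-and-recurse fallback otherwise. A hedged usage sketch, assuming the kernel is re-exported as xsimd::reduce_mul:

#include <xsimd/xsimd.hpp>
#include <iostream>

int main()
{
    xsimd::batch<float> v(2.0f);   // every lane = 2.0f
    // Product over all lanes: 2^N, where N is the lane count of the default arch.
    std::cout << xsimd::reduce_mul(v) << " over "
              << xsimd::batch<float>::size << " lanes\n";
    return 0;
}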
@@ -1726,8 +1865,8 @@ namespace xsimd
 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
 };
 #else
-return _mm512_set_epi16(
-
+return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
 #endif
 }
 
@@ -1743,8 +1882,8 @@ namespace xsimd
 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
 };
 #else
-return _mm512_set_epi16(
-
+return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
 #endif
 }
 
@@ -1767,10 +1906,10 @@ namespace xsimd
 v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
 };
 #else
-return _mm512_set_epi8(
-
-
-
+return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
 #endif
 }
 template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
@@ -1792,10 +1931,10 @@ namespace xsimd
 v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
 };
 #else
-return _mm512_set_epi8(
-
-
-
+return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
 #endif
 }
 
@@ -1845,19 +1984,110 @@ namespace xsimd
 }
 
 // slide_left
+namespace detail
+{
+template <size_t N>
+struct make_slide_left_pattern
+{
+static constexpr size_t get(size_t i, size_t)
+{
+return i >= N ? i - N : 0;
+}
+};
+
+template <size_t N, class A, class T>
+XSIMD_INLINE batch<T, A> slide_left_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+{
+static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+if (N == 0)
+{
+return x;
+}
+if (N >= 64)
+{
+return batch<T, A>(T(0));
+}
+
+__mmask16 mask = uint16_t(0xFFFFu << (N / 4));
+
+if ((N & 15) == 0)
+{
+const uint8_t imm8 = uint8_t(0xe4 << (2 * (N / 16)));
+return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+}
+
+auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_left_pattern<N / 4>, A>();
+return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+}
+}
+
 template <size_t N, class A, class T>
-XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const
+XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512f>) noexcept
 {
-
-
+constexpr size_t NN = N & ~3;
+if (N == NN || NN >= 64)
+{
+// Call fast path
+return detail::slide_left_aligned_u32<NN>(x, A {});
+}
+
+__m512i xl = detail::slide_left_aligned_u32<NN, A, T>(_mm512_slli_epi32(x, 8 * (N - NN)), A {});
+__m512i xr = detail::slide_left_aligned_u32<NN + 4, A, T>(_mm512_srli_epi32(x, 32 - 8 * (N - NN)), A {});
+return _mm512_or_epi32(xl, xr);
 }
 
 // slide_right
+namespace detail
+{
+template <size_t N>
+struct make_slide_right_pattern
+{
+static constexpr size_t get(size_t i, size_t n)
+{
+return i < (n - N) ? i + N : 0;
+}
+};
+
+template <size_t N, class A, class T>
+XSIMD_INLINE batch<T, A> slide_right_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+{
+static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+if (N == 0)
+{
+return x;
+}
+if (N >= 64)
+{
+return batch<T, A>(T(0));
+}
+
+__mmask16 mask = 0xFFFFu >> (N / 4);
+
+if ((N & 15) == 0)
+{
+const uint8_t imm8 = 0xe4 >> (2 * (N / 16));
+return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+}
+
+auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_right_pattern<N / 4>, A>();
+return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+}
+}
 template <size_t N, class A, class T>
-XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const
+XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512f>) noexcept
 {
-
-
+constexpr size_t NN = N & ~3;
+if (N == NN || NN >= 64)
+{
+// Call fast path
+return detail::slide_right_aligned_u32<NN>(x, A {});
+}
+
+__m512i xl = detail::slide_right_aligned_u32<NN + 4, A, T>(_mm512_slli_epi32(x, 32 - 8 * (N - NN)), A {});
+__m512i xr = detail::slide_right_aligned_u32<NN, A, T>(_mm512_srli_epi32(x, 8 * (N - NN)), A {});
+return _mm512_or_epi32(xl, xr);
 }
 
 // sqrt
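slide_left<N> and slide_right<N> shift the whole 64-byte register by N bytes and zero-fill the vacated bytes; the kernels above decompose an arbitrary N into a 32-bit-aligned lane permute plus a sub-word bit shift. A scalar reference of the byte-level semantics (a sketch that matches the N == 0 and N >= 64 early-outs):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar reference: slide a 64-byte register left by n bytes, zero-filling.
std::array<uint8_t, 64> slide_left_reference(const std::array<uint8_t, 64>& in, std::size_t n)
{
    std::array<uint8_t, 64> out {};        // zero-initialized
    for (std::size_t i = n; i < 64; ++i)
        out[i] = in[i - n];
    return out;
}

// slide_right is the mirror image: out[i] = in[i + n] for i < 64 - n, else 0.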
@@ -2019,16 +2249,53 @@ namespace xsimd
 return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
 }
 
-
-
-XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
+XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask,
+requires_arch<avx512f>) noexcept
 {
+XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+{
+return self;
+}
+XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+{
+constexpr int imm0 = detail::mod_shuffle(V0, V1, V2, V3);
+constexpr int imm1 = detail::mod_shuffle(V4, V5, V6, V7);
+constexpr int imm2 = detail::mod_shuffle(V8, V9, V10, V11);
+constexpr int imm3 = detail::mod_shuffle(V12, V13, V14, V15);
+XSIMD_IF_CONSTEXPR(imm0 == imm1 && imm0 == imm2 && imm0 == imm3)
+{
+return _mm512_permute_ps(self, imm0);
+}
+}
 return swizzle(self, mask.as_batch(), avx512f {});
 }
-
-
-
+template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, uint64_t V4, uint64_t V5, uint64_t V6, uint64_t V7>
+XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
+batch_constant<uint64_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask,
+requires_arch<avx512f>) noexcept
 {
+XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+{
+return self;
+}
+XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+{
+constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3) | ((V4 & 1) << 4) | ((V5 & 1) << 5) | ((V6 & 1) << 6) | ((V7 & 1) << 7);
+return _mm512_permute_pd(self, imm);
+}
+constexpr bool dup_lo = detail::is_dup_lo(mask);
+constexpr bool dup_hi = detail::is_dup_hi(mask);
+XSIMD_IF_CONSTEXPR(dup_lo || dup_hi)
+{
+const batch<double, avx2> half = _mm512_extractf64x4_pd(self, dup_lo ? 0 : 1);
+constexpr typename std::conditional<dup_lo, batch_constant<uint64_t, avx2, V0 % 4, V1 % 4, V2 % 4, V3 % 4>,
+batch_constant<uint64_t, avx2, V4 % 4, V5 % 4, V6 % 4, V7 % 4>>::type half_mask {};
+return _mm512_broadcast_f64x4(swizzle(half, half_mask, avx2 {}));
+}
+// General case
 return swizzle(self, mask.as_batch(), avx512f {});
 }
 
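The constant-mask swizzle overloads above inspect the index pattern at compile time and fall back to cheaper instructions when the pattern is the identity, stays inside a 128-bit lane, or duplicates one half. A hedged sketch of calling this path through the public API, assuming an AVX-512F build and the batch_constant<T, A, Values...> spelling used in these headers:

#include <xsimd/xsimd.hpp>
#include <cstdint>

xsimd::batch<float, xsimd::avx512f> reverse_within_lanes(xsimd::batch<float, xsimd::avx512f> v)
{
    // Each index stays inside its own 128-bit lane, so the !is_cross_lane branch
    // above can lower the call to a single _mm512_permute_ps.
    using mask_t = xsimd::batch_constant<uint32_t, xsimd::avx512f,
        3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12>;
    return xsimd::swizzle(v, mask_t {});
}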
@@ -2337,8 +2604,47 @@ namespace xsimd
 2));
 }
 
-
+// first
+template <class A>
+XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+{
+return _mm512_cvtss_f32(self);
+}
+
+template <class A>
+XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+{
+return _mm512_cvtsd_f64(self);
+}
 
+template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+{
+XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+{
+return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFF);
+}
+else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+{
+return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFFFF);
+}
+else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+{
+return static_cast<T>(_mm512_cvtsi512_si32(self));
+}
+else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+{
+batch<T, sse4_2> low = _mm512_castsi512_si128(self);
+return first(low, sse4_2 {});
+}
+else
+{
+assert(false && "unsupported arch/op combination");
+return {};
+}
+}
+
+}
 }
 
 #endif
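first() reads lane 0 as a scalar via _mm512_cvtss_f32 / _mm512_cvtsd_f64 / _mm512_cvtsi512_si32 instead of doing a full store. A hedged usage sketch, assuming the kernel is re-exported as xsimd::first; batch::get(0) is the portable way to read the same value if it is not:

#include <xsimd/xsimd.hpp>

float first_lane(const xsimd::batch<float>& v)
{
    return xsimd::first(v);   // assumed public spelling of the kernel added above
}

float first_lane_portable(const xsimd::batch<float>& v)
{
    return v.get(0);          // existing accessor, same result
}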
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp
@@ -24,54 +24,26 @@ namespace xsimd
 {
 using namespace types;
 
-namespace detail
-{
-template <size_t N, size_t... Is>
-constexpr std::array<uint8_t, sizeof...(Is)> make_slide_left_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
-{
-return { (Is >= N ? Is - N : 0)... };
-}
-
-template <size_t N, size_t... Is>
-constexpr std::array<uint8_t, sizeof...(Is)> make_slide_right_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
-{
-return { (Is < (64 - N) ? Is + N : 0)... };
-}
-}
-
 // slide_left
-template <size_t N, class A, class T>
+template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && (N < 64)>::type>
 XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
 {
-
-{
-return x;
-}
-if (N >= 64)
-{
-return batch<T, A>(T(0));
-}
+static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");
 
 __mmask64 mask = 0xFFFFFFFFFFFFFFFFull << (N & 63);
-
-return _mm512_maskz_permutexvar_epi8(mask,
+auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_left_pattern<N>, A>();
+return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
 }
 
 // slide_right
-template <size_t N, class A, class T>
+template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && (N < 64)>::type>
 XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
 {
-
-
-return x;
-}
-if (N >= 64)
-{
-return batch<T, A>(T(0));
-}
+static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");
+
 __mmask64 mask = 0xFFFFFFFFFFFFFFFFull >> (N & 63);
-
-return _mm512_maskz_permutexvar_epi8(mask,
+auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_right_pattern<N>, A>();
+return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
 }
 
 // swizzle (dynamic version)