sequenzo-0.1.17-cp39-cp39-macosx_10_9_universal2.whl → sequenzo-0.1.18-cp39-cp39-macosx_10_9_universal2.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.

Potentially problematic release: this version of sequenzo might be problematic.

Files changed (86)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +157 -157
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
  5. sequenzo/clustering/hierarchical_clustering.py +202 -8
  6. sequenzo/define_sequence_data.py +34 -2
  7. sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
  8. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  9. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  10. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  12. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  13. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  14. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  58. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +157 -157
  59. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
  60. sequenzo/dissimilarity_measures/utils/seqconc.c +157 -157
  61. sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
  62. sequenzo/dissimilarity_measures/utils/seqdss.c +157 -157
  63. sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
  64. sequenzo/dissimilarity_measures/utils/seqdur.c +157 -157
  65. sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
  66. sequenzo/dissimilarity_measures/utils/seqlength.c +157 -157
  67. sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
  68. sequenzo/sequence_characteristics/__init__.py +4 -0
  69. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  70. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  71. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  72. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  73. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  74. sequenzo/sequence_characteristics/turbulence.py +47 -67
  75. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  76. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  77. sequenzo/visualization/plot_sequence_index.py +58 -35
  78. sequenzo/visualization/plot_state_distribution.py +57 -36
  79. sequenzo/with_event_history_analysis/__init__.py +35 -0
  80. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  81. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  82. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  83. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +86 -79
  84. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  85. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  86. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -9,8 +9,8 @@
  * The full license is in the file LICENSE, distributed with this software. *
  ****************************************************************************/

- #ifndef XSIMD_AVX512_DQHPP
- #define XSIMD_AVX512_D_HPP
+ #ifndef XSIMD_AVX512DQ_HPP
+ #define XSIMD_AVX512DQ_HPP

  #include "../types/xsimd_avx512dq_register.hpp"

@@ -47,12 +47,12 @@ namespace xsimd

  // bitwise_not
  template <class A>
- XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
  {
  return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
  }
  template <class A>
- XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
  {
  return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
  }
@@ -96,7 +96,7 @@ namespace xsimd
  // tmp1 = [a0..8, b0..8]
  // tmp2 = [a8..f, b8..f]
  #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
- batch<float, avx512f> res##I; \
+ batch<float, avx512dq> res##I; \
  { \
  auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
  auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
@@ -180,7 +180,7 @@ namespace xsimd

  // reduce_add
  template <class A>
- XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
  {
  __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
  __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
@@ -188,11 +188,43 @@ namespace xsimd
  return reduce_add(batch<float, avx2>(res1), avx2 {});
  }

+ // reduce_mul
+ template <class A>
+ XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
+ {
+ __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
+ __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
+ __m256 res1 = _mm256_mul_ps(tmp1, tmp2);
+ return reduce_mul(batch<float, avx2>(res1), avx2 {});
+ }
+
+ // swizzle constant mask
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+ uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask,
+ requires_arch<avx512dq>) noexcept
+ {
+ constexpr bool dup_lo = detail::is_dup_lo(mask);
+ constexpr bool dup_hi = detail::is_dup_hi(mask);
+
+ XSIMD_IF_CONSTEXPR(dup_lo || dup_hi)
+ {
+ const batch<float, avx2> half = _mm512_extractf32x8_ps(self, dup_lo ? 0 : 1);
+ constexpr typename std::conditional<dup_lo, batch_constant<uint32_t, avx2, V0 % 8, V1 % 8, V2 % 8, V3 % 8, V4 % 8, V5 % 8, V6 % 8, V7 % 8>,
+ batch_constant<uint32_t, avx2, V8 % 8, V9 % 8, V10 % 8, V11 % 8, V12 % 8, V13 % 8, V14 % 8, V15 % 8>>::type half_mask {};
+ auto permuted = swizzle(half, half_mask, avx2 {});
+ // merge the two slices into an AVX512F register:
+ return _mm512_broadcast_f32x8(permuted); // duplicates the 256-bit perm into both halves
+ }
+ return swizzle(self, mask, avx512f {});
+ }
+
  // convert
  namespace detail
  {
  template <class A>
- XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
+ XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<avx512dq>) noexcept
  {
  return _mm512_cvtepi64_pd(self);
  }
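Note: the new avx512dq swizzle above takes a shortcut when the constant mask only draws from one 256-bit half. A minimal scalar sketch of that idea, with illustrative names (lanes, mask, low_perm) that are not part of xsimd:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of a 16-lane swizzle whose mask repeats and only references
    // lanes 0..7 (the "dup_lo" case): permute the low half, then duplicate it.
    int main()
    {
        std::array<float, 16> lanes {};
        for (int i = 0; i < 16; ++i) lanes[i] = float(i);

        const std::array<uint32_t, 16> mask { 3, 2, 1, 0, 7, 6, 5, 4,
                                              3, 2, 1, 0, 7, 6, 5, 4 }; // all < 8

        std::array<float, 8> low_perm {};
        for (int i = 0; i < 8; ++i) low_perm[i] = lanes[mask[i] % 8]; // permute the low half

        std::array<float, 16> result {};
        for (int i = 0; i < 16; ++i) result[i] = low_perm[i % 8];     // broadcast into both halves

        for (float v : result) std::printf("%g ", v);
        std::printf("\n");
    }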
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -32,6 +32,8 @@ namespace xsimd
  XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, Mask const& mask, requires_arch<common>) noexcept;
  template <class A, class T, size_t I>
  XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<common>) noexcept;
+ template <class A, class T, class ITy, ITy... Is>
+ XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...>, requires_arch<common>) noexcept;
  template <class A>
  XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<common>) noexcept;
  template <class A>
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
@@ -562,6 +564,100 @@ namespace xsimd
  }
  }

+ // rotl
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_rolv_epi32(self, other);
+ }
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_rolv_epi64(self, other);
+ }
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return rotl(batch<T, avx2>(s), batch<T, avx2>(o), avx2 {}); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+ {
+ return rotl(self, batch<T, A>(other), A {});
+ }
+ template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+ static_assert(count < bits, "Count must be less than the number of bits in T");
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_rol_epi32(self, count);
+ }
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_rol_epi64(self, count);
+ }
+
+ return detail::fwd_to_avx([](__m256i s) noexcept
+ { return rotl<count>(batch<T, avx2>(s), avx2 {}); },
+ self);
+ }
+
+ // rotr
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) < 4)
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return rotr(batch<T, avx2>(s), batch<T, avx2>(o), avx2 {}); },
+ self, other);
+ }
+ XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_rorv_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_rorv_epi64(self, other);
+ }
+ }
+ return rotr(self, other, common {});
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+ {
+ return rotr(self, batch<T, A>(other), A {});
+ }
+
+ template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+ static_assert(count < bits, "Count must be less than the number of bits in T");
+ XSIMD_IF_CONSTEXPR(sizeof(T) < 4)
+ {
+ return detail::fwd_to_avx([](__m256i s) noexcept
+ { return rotr<count>(batch<T, avx2>(s), avx2 {}); },
+ self);
+ }
+ XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_ror_epi32(self, count);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_ror_epi64(self, count);
+ }
+ }
+ return rotr<count>(self, common {});
+ }
+
  // bitwise_xor
  template <class A>
  XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
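Note: rotl and rotr are new kernels in this release. For reference, a scalar sketch of the per-lane semantics that _mm512_rolv_epi32 and friends vectorize; rotl_scalar and rotr_scalar are illustrative helpers, not xsimd API:

    #include <cstdint>
    #include <limits>

    // Rotate left / right on an unsigned scalar: bits shifted out of one end
    // re-enter at the other.
    template <class T>
    constexpr T rotl_scalar(T x, unsigned n)
    {
        constexpr unsigned bits = std::numeric_limits<T>::digits;
        n %= bits;
        return n == 0 ? x : T((x << n) | (x >> (bits - n)));
    }

    template <class T>
    constexpr T rotr_scalar(T x, unsigned n)
    {
        constexpr unsigned bits = std::numeric_limits<T>::digits;
        n %= bits;
        return n == 0 ? x : T((x >> n) | (x << (bits - n)));
    }

    static_assert(rotl_scalar<uint8_t>(0x81u, 1) == 0x03, "the high bit wraps around to bit 0");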
@@ -578,7 +674,7 @@
  XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
  {
  using register_type = typename batch_bool<T, A>::register_type;
- return register_type(self.data | other.data);
+ return register_type(self.data ^ other.data);
  }

  template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
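Note: the one-character change above is a genuine bug fix. A small illustration in plain C++ (not xsimd code) of why OR is not a substitute for XOR on mask registers:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // Two 8-lane boolean masks, one bit per lane, as in an AVX-512 __mmask8.
        const uint8_t a = 0xCA; // 1100'1010
        const uint8_t b = 0xA6; // 1010'0110

        const uint8_t with_or  = uint8_t(a | b); // what the old code computed
        const uint8_t with_xor = uint8_t(a ^ b); // what bitwise_xor must compute

        // Lanes that are set in both masks differ: OR keeps them set, XOR clears them.
        assert(with_or  == 0xEE); // 1110'1110
        assert(with_xor == 0x6C); // 0110'1100
        return 0;
    }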
@@ -900,6 +996,18 @@
  {
  return _mm512_fmsub_pd(x, y, z);
  }
+ // fmas
+ template <class A>
+ XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fmaddsub_ps(x, y, z);
+ }
+
+ template <class A>
+ XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fmaddsub_pd(x, y, z);
+ }

  // from bool
  template <class A, class T>
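Note: the new fmas kernel maps to _mm512_fmaddsub_ps/_pd, which computes x*y - z in even lanes and x*y + z in odd lanes (per the documented behaviour of the intrinsic). A scalar sketch of that contract, using an illustrative helper name:

    #include <cstddef>

    // Scalar model of fmaddsub: even lanes get x*y - z, odd lanes get x*y + z.
    void fmaddsub_scalar(const float* x, const float* y, const float* z,
                         float* out, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i)
            out[i] = (i % 2 == 0) ? x[i] * y[i] - z[i]
                                  : x[i] * y[i] + z[i];
    }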
@@ -1312,12 +1420,12 @@
  template <class A>
  XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
  {
- return _mm512_max_ps(self, other);
+ return _mm512_max_ps(other, self);
  }
  template <class A>
  XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
  {
- return _mm512_max_pd(self, other);
+ return _mm512_max_pd(other, self);
  }
  template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
  XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
@@ -1362,12 +1470,12 @@
  template <class A>
  XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
  {
- return _mm512_min_ps(self, other);
+ return _mm512_min_ps(other, self);
  }
  template <class A>
  XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
  {
- return _mm512_min_pd(self, other);
+ return _mm512_min_pd(other, self);
  }
  template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
  XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
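Note: swapping the operands of _mm512_max_ps/_mm512_min_ps only changes behaviour for special values. The hardware instruction acts like "a > b ? a : b" per lane, so whenever a NaN is involved the second operand is returned; which batch sits in that position therefore decides whether NaN propagates. A scalar sketch of that asymmetry (hw_max_like is an illustrative helper):

    #include <cmath>
    #include <cstdio>

    // Scalar model of the hardware max: "a > b ? a : b" per lane.
    // With a NaN on either side the comparison is false, so b is returned.
    float hw_max_like(float a, float b) { return a > b ? a : b; }

    int main()
    {
        const float nan = std::nanf("");
        std::printf("%f\n", hw_max_like(1.0f, nan)); // prints nan: comparison is false, b wins
        std::printf("%f\n", hw_max_like(nan, 1.0f)); // prints 1.0: swapping operands changes the result
    }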
@@ -1544,6 +1652,37 @@
  return reduce_min(batch<T, avx2>(low));
  }

+ // reduce_mul
+ template <class A>
+ XSIMD_INLINE float reduce_mul(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_reduce_mul_ps(rhs);
+ }
+ template <class A>
+ XSIMD_INLINE double reduce_mul(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_reduce_mul_pd(rhs);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_reduce_mul_epi32(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_reduce_mul_epi64(self);
+ }
+ else
+ {
+ __m256i low, high;
+ detail::split_avx512(self, low, high);
+ batch<T, avx2> blow(low), bhigh(high);
+ return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {});
+ }
+ }
+

  // rsqrt
  XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
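Note: reduce_mul is the horizontal product of all lanes. A scalar statement of the contract (illustrative helper, not xsimd API):

    #include <numeric>
    #include <vector>

    // Horizontal product over all lanes; the AVX-512 kernel computes the same
    // value with _mm512_reduce_mul_* or by splitting into two AVX2 halves.
    double reduce_mul_scalar(const std::vector<double>& lanes)
    {
        return std::accumulate(lanes.begin(), lanes.end(), 1.0,
                               [](double acc, double v) { return acc * v; });
    }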
@@ -1726,8 +1865,8 @@
  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
  };
  #else
- return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
- v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
  #endif
  }

@@ -1743,8 +1882,8 @@
  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
  };
  #else
- return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
- v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
  #endif
  }

@@ -1767,10 +1906,10 @@
  v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
  };
  #else
- return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
- v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
- v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
- v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63);
+ return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+ v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+ v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
  #endif
  }
  template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
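Note: this and the neighbouring set_* hunks reverse the argument order because the _mm*_set_epi* intrinsics take operands from the highest lane down to lane 0, while the values here are supplied in memory order. A minimal SSE2 illustration of the same convention (assumes an x86 toolchain), shown before the last of these hunks:

    #include <cassert>
    #include <emmintrin.h> // SSE2

    int main()
    {
        // Arguments are listed high lane first: e3, e2, e1, e0.
        const __m128i v = _mm_set_epi32(3, 2, 1, 0);

        // Lane 0 is the *last* argument, so passing values supplied in memory
        // order (v0, v1, ..., vN) requires reversing them, as the patch does.
        assert(_mm_cvtsi128_si32(v) == 0);
        return 0;
    }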
@@ -1792,10 +1931,10 @@
  v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
  };
  #else
- return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
- v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
- v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
- v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63);
+ return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+ v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+ v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+ v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
  #endif
  }

@@ -1845,19 +1984,110 @@
  }

  // slide_left
+ namespace detail
+ {
+ template <size_t N>
+ struct make_slide_left_pattern
+ {
+ static constexpr size_t get(size_t i, size_t)
+ {
+ return i >= N ? i - N : 0;
+ }
+ };
+
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_left_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+ {
+ static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+ if (N == 0)
+ {
+ return x;
+ }
+ if (N >= 64)
+ {
+ return batch<T, A>(T(0));
+ }
+
+ __mmask16 mask = uint16_t(0xFFFFu << (N / 4));
+
+ if ((N & 15) == 0)
+ {
+ const uint8_t imm8 = uint8_t(0xe4 << (2 * (N / 16)));
+ return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+ }
+
+ auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_left_pattern<N / 4>, A>();
+ return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+ }
+ }
+
  template <size_t N, class A, class T>
- XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512f>) noexcept
  {
- static_assert(N == 0xDEAD, "not implemented yet");
- return {};
+ constexpr size_t NN = N & ~3;
+ if (N == NN || NN >= 64)
+ {
+ // Call fast path
+ return detail::slide_left_aligned_u32<NN>(x, A {});
+ }
+
+ __m512i xl = detail::slide_left_aligned_u32<NN, A, T>(_mm512_slli_epi32(x, 8 * (N - NN)), A {});
+ __m512i xr = detail::slide_left_aligned_u32<NN + 4, A, T>(_mm512_srli_epi32(x, 32 - 8 * (N - NN)), A {});
+ return _mm512_or_epi32(xl, xr);
  }

  // slide_right
+ namespace detail
+ {
+ template <size_t N>
+ struct make_slide_right_pattern
+ {
+ static constexpr size_t get(size_t i, size_t n)
+ {
+ return i < (n - N) ? i + N : 0;
+ }
+ };
+
+ template <size_t N, class A, class T>
+ XSIMD_INLINE batch<T, A> slide_right_aligned_u32(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+ {
+ static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits");
+
+ if (N == 0)
+ {
+ return x;
+ }
+ if (N >= 64)
+ {
+ return batch<T, A>(T(0));
+ }
+
+ __mmask16 mask = 0xFFFFu >> (N / 4);
+
+ if ((N & 15) == 0)
+ {
+ const uint8_t imm8 = 0xe4 >> (2 * (N / 16));
+ return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8);
+ }
+
+ auto slide_pattern = make_batch_constant<uint32_t, detail::make_slide_right_pattern<N / 4>, A>();
+ return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x);
+ }
+ }
  template <size_t N, class A, class T>
- XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
+ XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512f>) noexcept
  {
- static_assert(N == 0xDEAD, "not implemented yet");
- return {};
+ constexpr size_t NN = N & ~3;
+ if (N == NN || NN >= 64)
+ {
+ // Call fast path
+ return detail::slide_right_aligned_u32<NN>(x, A {});
+ }
+
+ __m512i xl = detail::slide_right_aligned_u32<NN + 4, A, T>(_mm512_slli_epi32(x, 32 - 8 * (N - NN)), A {});
+ __m512i xr = detail::slide_right_aligned_u32<NN, A, T>(_mm512_srli_epi32(x, 8 * (N - NN)), A {});
+ return _mm512_or_epi32(xl, xr);
  }

  // sqrt
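Note: the slide_left/slide_right kernels above replace the old "not implemented yet" stubs. Their contract is a whole-register byte shift with zero fill, which the "i >= N ? i - N : 0" pattern plus the lane mask implements. A scalar sketch (illustrative helper, not xsimd API):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Scalar model of slide_left<N> over a 64-byte (512-bit) register:
    // byte i of the result is byte i - N of the input, or 0 if i < N.
    template <std::size_t N>
    std::array<uint8_t, 64> slide_left_scalar(const std::array<uint8_t, 64>& in)
    {
        std::array<uint8_t, 64> out {};
        for (std::size_t i = N; i < 64; ++i)
            out[i] = in[i - N];
        return out;
    }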
@@ -2019,16 +2249,53 @@
  return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
  }

- // swizzle (constant version)
- template <class A, uint32_t... Vs>
- XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+ uint32_t V8, uint32_t V9, uint32_t V10, uint32_t V11, uint32_t V12, uint32_t V13, uint32_t V14, uint32_t V15>
+ XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask,
+ requires_arch<avx512f>) noexcept
  {
+ XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+ {
+ constexpr int imm0 = detail::mod_shuffle(V0, V1, V2, V3);
+ constexpr int imm1 = detail::mod_shuffle(V4, V5, V6, V7);
+ constexpr int imm2 = detail::mod_shuffle(V8, V9, V10, V11);
+ constexpr int imm3 = detail::mod_shuffle(V12, V13, V14, V15);
+ XSIMD_IF_CONSTEXPR(imm0 == imm1 && imm0 == imm2 && imm0 == imm3)
+ {
+ return _mm512_permute_ps(self, imm0);
+ }
+ }
  return swizzle(self, mask.as_batch(), avx512f {});
  }
-
- template <class A, uint64_t... Vs>
- XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, uint64_t V4, uint64_t V5, uint64_t V6, uint64_t V7>
+ XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
+ batch_constant<uint64_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask,
+ requires_arch<avx512f>) noexcept
  {
+ XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+ {
+ return self;
+ }
+ XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+ {
+ constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3) | ((V4 & 1) << 4) | ((V5 & 1) << 5) | ((V6 & 1) << 6) | ((V7 & 1) << 7);
+ return _mm512_permute_pd(self, imm);
+ }
+ constexpr bool dup_lo = detail::is_dup_lo(mask);
+ constexpr bool dup_hi = detail::is_dup_hi(mask);
+ XSIMD_IF_CONSTEXPR(dup_lo || dup_hi)
+ {
+ const batch<double, avx2> half = _mm512_extractf64x4_pd(self, dup_lo ? 0 : 1);
+ constexpr typename std::conditional<dup_lo, batch_constant<uint64_t, avx2, V0 % 4, V1 % 4, V2 % 4, V3 % 4>,
+ batch_constant<uint64_t, avx2, V4 % 4, V5 % 4, V6 % 4, V7 % 4>>::type half_mask {};
+ return _mm512_broadcast_f64x4(swizzle(half, half_mask, avx2 {}));
+ }
+ // General case
  return swizzle(self, mask.as_batch(), avx512f {});
  }

@@ -2337,8 +2604,47 @@
  2));
  }

- }
+ // first
+ template <class A>
+ XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtss_f32(self);
+ }
+
+ template <class A>
+ XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtsd_f64(self);
+ }

+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFFFF);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return static_cast<T>(_mm512_cvtsi512_si32(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ batch<T, sse4_2> low = _mm512_castsi512_si128(self);
+ return first(low, sse4_2 {});
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ }
  }

  #endif
sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp
@@ -24,54 +24,26 @@ namespace xsimd
  {
  using namespace types;

- namespace detail
- {
- template <size_t N, size_t... Is>
- constexpr std::array<uint8_t, sizeof...(Is)> make_slide_left_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
- {
- return { (Is >= N ? Is - N : 0)... };
- }
-
- template <size_t N, size_t... Is>
- constexpr std::array<uint8_t, sizeof...(Is)> make_slide_right_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
- {
- return { (Is < (64 - N) ? Is + N : 0)... };
- }
- }
-
  // slide_left
- template <size_t N, class A, class T>
+ template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && (N < 64)>::type>
  XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
  {
- if (N == 0)
- {
- return x;
- }
- if (N >= 64)
- {
- return batch<T, A>(T(0));
- }
+ static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");

  __mmask64 mask = 0xFFFFFFFFFFFFFFFFull << (N & 63);
- alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_bytes_pattern<N>(::xsimd::detail::make_index_sequence<512 / 8>());
- return _mm512_maskz_permutexvar_epi8(mask, _mm512_load_epi32(slide_pattern.data()), x);
+ auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_left_pattern<N>, A>();
+ return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
  }

  // slide_right
- template <size_t N, class A, class T>
+ template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) != 0 && (N < 64)>::type>
  XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512vbmi>) noexcept
  {
- if (N == 0)
- {
- return x;
- }
- if (N >= 64)
- {
- return batch<T, A>(T(0));
- }
+ static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency.");
+
  __mmask64 mask = 0xFFFFFFFFFFFFFFFFull >> (N & 63);
- alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_bytes_pattern<N>(::xsimd::detail::make_index_sequence<512 / 8>());
- return _mm512_maskz_permutexvar_epi8(mask, _mm512_load_epi32(slide_pattern.data()), x);
+ auto slide_pattern = make_batch_constant<uint8_t, detail::make_slide_right_pattern<N>, A>();
+ return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
  }

  // swizzle (dynamic version)