sequenzo-0.1.17-cp39-cp39-win_amd64.whl → sequenzo-0.1.18-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sequenzo might be problematic.

Files changed (101)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +156 -156
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cp39-win_amd64.pyd +0 -0
  5. sequenzo/clustering/clustering_c_code.cp39-win_amd64.pyd +0 -0
  6. sequenzo/clustering/hierarchical_clustering.py +202 -8
  7. sequenzo/define_sequence_data.py +34 -2
  8. sequenzo/dissimilarity_measures/c_code.cp39-win_amd64.pyd +0 -0
  9. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  10. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  12. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  13. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  14. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  58. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  59. sequenzo/dissimilarity_measures/src/xsimd/test/test_basic_math.cpp +6 -0
  60. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch.cpp +54 -2
  61. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_bool.cpp +8 -0
  62. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_cast.cpp +11 -4
  63. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_complex.cpp +18 -0
  64. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_int.cpp +8 -14
  65. sequenzo/dissimilarity_measures/src/xsimd/test/test_batch_manip.cpp +216 -173
  66. sequenzo/dissimilarity_measures/src/xsimd/test/test_load_store.cpp +6 -0
  67. sequenzo/dissimilarity_measures/src/xsimd/test/test_memory.cpp +1 -1
  68. sequenzo/dissimilarity_measures/src/xsimd/test/test_power.cpp +7 -4
  69. sequenzo/dissimilarity_measures/src/xsimd/test/test_select.cpp +6 -2
  70. sequenzo/dissimilarity_measures/src/xsimd/test/test_shuffle.cpp +32 -18
  71. sequenzo/dissimilarity_measures/src/xsimd/test/test_utils.hpp +21 -24
  72. sequenzo/dissimilarity_measures/src/xsimd/test/test_xsimd_api.cpp +69 -9
  73. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +156 -156
  74. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cp39-win_amd64.pyd +0 -0
  75. sequenzo/dissimilarity_measures/utils/seqconc.c +156 -156
  76. sequenzo/dissimilarity_measures/utils/seqconc.cp39-win_amd64.pyd +0 -0
  77. sequenzo/dissimilarity_measures/utils/seqdss.c +156 -156
  78. sequenzo/dissimilarity_measures/utils/seqdss.cp39-win_amd64.pyd +0 -0
  79. sequenzo/dissimilarity_measures/utils/seqdur.c +156 -156
  80. sequenzo/dissimilarity_measures/utils/seqdur.cp39-win_amd64.pyd +0 -0
  81. sequenzo/dissimilarity_measures/utils/seqlength.c +156 -156
  82. sequenzo/dissimilarity_measures/utils/seqlength.cp39-win_amd64.pyd +0 -0
  83. sequenzo/sequence_characteristics/__init__.py +4 -0
  84. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  85. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  86. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  87. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  88. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  89. sequenzo/sequence_characteristics/turbulence.py +47 -67
  90. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  91. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  92. sequenzo/visualization/plot_sequence_index.py +58 -35
  93. sequenzo/visualization/plot_state_distribution.py +57 -36
  94. sequenzo/with_event_history_analysis/__init__.py +35 -0
  95. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  96. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  97. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  98. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +101 -94
  99. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  100. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  101. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
@@ -3,6 +3,7 @@
  * Martin Renou *
  * Copyright (c) QuantStack *
  * Copyright (c) Serge Guelton *
+ * Copyright (c) Marco Barbone *
  * *
  * Distributed under the terms of the BSD 3-Clause License. *
  * *
@@ -20,7 +21,6 @@
 
 namespace xsimd
 {
-
     namespace kernel
     {
         using namespace types;
@@ -925,12 +925,12 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
         {
-            return _mm256_max_ps(self, other);
+            return _mm256_max_ps(other, self);
         }
         template <class A>
         XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
         {
-            return _mm256_max_pd(self, other);
+            return _mm256_max_pd(other, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
@@ -942,12 +942,12 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
         {
-            return _mm256_min_ps(self, other);
+            return _mm256_min_ps(other, self);
         }
         template <class A>
         XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
         {
-            return _mm256_min_pd(self, other);
+            return _mm256_min_pd(other, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
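The operand swap in the four kernels above is not cosmetic: per the documented behaviour of MAXPS/MINPS, `_mm256_max_ps(a, b)` and friends return the second operand whenever the lanes compare unordered (NaN) or both are zero, so reversing the arguments changes which input wins in those cases. A minimal scalar sketch of one lane, assuming the documented intrinsic semantics (illustration only, not library code):

    // One lane of _mm256_max_ps(a, b): b is returned on ties, signed zeros
    // and NaNs, because the hardware effectively computes "a > b ? a : b".
    float maxps_lane(float a, float b)
    {
        return a > b ? a : b;
    }
    // Old code: max(self, other) -> maxps(self, other): a NaN input yields `other`.
    // New code: max(self, other) -> maxps(other, self): a NaN input yields `self`.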
@@ -1046,7 +1046,7 @@ namespace xsimd
         }
 
         // reduce_add
-        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
         {
             typename batch<T, sse4_2>::register_type low, high;
@@ -1077,6 +1077,16 @@ namespace xsimd
             return reduce_min(batch<T, sse4_2>(low));
         }
 
+        // reduce_mul
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            typename batch<T, sse4_2>::register_type low, high;
+            detail::split_avx(self, low, high);
+            batch<T, sse4_2> blow(low), bhigh(high);
+            return reduce_mul(blow * bhigh);
+        }
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
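The new `reduce_mul` kernel follows the same split-and-recurse pattern as the existing `reduce_add`/`reduce_min` above: the 256-bit batch is split into its two 128-bit halves, the halves are multiplied lane-wise, and the half-width product is reduced by the SSE kernel. A scalar sketch of that strategy for an 8-lane float batch (illustration only, not the library API):

    #include <array>

    // Model an 8-lane AVX float batch as a plain array and reduce it the way
    // the kernel above does: multiply the two halves, then reduce 4 lanes.
    float reduce_mul_model(const std::array<float, 8>& v)
    {
        float half[4];
        for (int i = 0; i < 4; ++i)
            half[i] = v[i] * v[i + 4];                    // blow * bhigh
        return half[0] * half[1] * half[2] * half[3];     // SSE-level reduction
    }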
@@ -1418,23 +1428,19 @@ namespace xsimd
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
-            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
-            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
-
-            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
-            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+            // Duplicate lanes separately
+            // 1) duplicate low and high lanes
+            __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+            __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
 
             // normalize mask
             batch<uint32_t, A> half_mask = mask % 4;
 
             // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask);
-            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask);
+            __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
+            __m256 r1 = _mm256_permutevar_ps(hi, half_mask);
 
-            // mask to choose the right lane
             batch_bool<uint32_t, A> blend_mask = mask >= 4;
-
-            // blend the two permutes
             return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
         }
 
@@ -1442,18 +1448,15 @@ namespace xsimd
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
-            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
-            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
-
-            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
-            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
 
             // normalize mask
             batch<uint64_t, A> half_mask = -(mask & 1);
 
             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask);
-            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask);
+            __m256d r0 = _mm256_permutevar_pd(lo, half_mask);
+            __m256d r1 = _mm256_permutevar_pd(hi, half_mask);
 
             // mask to choose the right lane
             batch_bool<uint64_t, A> blend_mask = mask >= 2;
@@ -1479,53 +1482,67 @@ namespace xsimd
 
         // swizzle (constant mask)
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
-            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
-
-            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
-            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+            constexpr bool is_identity = detail::is_identity(mask);
+            constexpr bool is_dup_low = detail::is_dup_lo(mask);
+            constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+            constexpr bool is_dup = is_dup_low || is_dup_hi;
+            XSIMD_IF_CONSTEXPR(is_identity)
+            {
+                return self;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup)
+            {
+                constexpr auto control = is_dup_low ? 0x00 : 0x11;
+                constexpr auto is_dup_identity = is_dup_low ? detail::is_identity<uint32_t, V0, V1, V2, V3>() : detail::is_identity<int64_t, V4 - 4, V5 - 4, V6 - 4, V7 - 4>();
+                auto split = _mm256_permute2f128_ps(self, self, control);
+                XSIMD_IF_CONSTEXPR(!is_dup_identity)
+                {
+                    constexpr auto shuffle_mask = is_dup_low ? detail::mod_shuffle(V0, V1, V2, V3) : detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4);
+                    split = _mm256_permute_ps(split, shuffle_mask);
+                }
+                return split;
+            }
+            // Duplicate lanes separately
+            // 1) duplicate low and high lanes
+            __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
+            __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
 
-            // normalize mask
-            batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+            // 2) build lane-local index vector (each element = source_index & 3)
+            constexpr batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
 
-            // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
-            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());
+            __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
+            __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane
 
-            // mask to choose the right lane
-            batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+            constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};
 
-            // blend the two permutes
-            constexpr auto mask = blend_mask.mask();
-            return _mm256_blend_ps(r0, r1, mask);
+            return _mm256_blend_ps(r0, r1, lane_mask.mask());
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
         {
+            // cannot use detail::mod_shuffle as the mod and shift are different in this case
+            constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                return _mm256_permute_pd(self, imm);
+            }
             // duplicate low and high part of input
-            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
-            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
-
-            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
-            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
-
-            // normalize mask
-            batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
+            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
 
             // permute within each lane
-            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
-            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());
+            __m256d r0 = _mm256_permute_pd(lo, imm);
+            __m256d r1 = _mm256_permute_pd(hi, imm);
 
             // mask to choose the right lane
-            batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
 
             // blend the two permutes
-            constexpr auto mask = blend_mask.mask();
-            return _mm256_blend_pd(r0, r1, mask);
+            return _mm256_blend_pd(r0, r1, blend_mask.mask());
         }
         template <class A,
                   typename T,
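All of the swizzle rewrites in this hunk implement the same contract: element i of the result is element mask[i] of the input, and the new constant-mask fast paths (identity, lane duplication, non-cross-lane permutes) only change how that gather is realised in registers. A scalar sketch of the contract, assuming in-range indices (illustration only):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Reference semantics of an 8-lane float swizzle: gather input elements by index.
    std::array<float, 8> swizzle_model(const std::array<float, 8>& self,
                                       const std::array<std::uint32_t, 8>& mask)
    {
        std::array<float, 8> r{};
        for (std::size_t i = 0; i < 8; ++i)
            r[i] = self[mask[i]]; // fast paths above only change *how* this is computed
        return r;
    }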
@@ -1861,6 +1878,46 @@ namespace xsimd
             auto hi = _mm256_unpackhi_pd(self, other);
             return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
         }
+
+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm256_cvtsi256_si32(self));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, sse4_2> low = _mm256_castsi256_si128(self);
+                return first(low, sse4_2 {});
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
     }
 }
 
@@ -17,6 +17,8 @@
 
 #include "../types/xsimd_avx2_register.hpp"
 
+#include <limits>
+
 namespace xsimd
 {
 
@@ -172,6 +174,29 @@ namespace xsimd
             }
         }
 
+        template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(shift < bits, "Shift must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_slli_epi16(self, shift);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_slli_epi32(self, shift);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_slli_epi64(self, shift);
+            }
+            else
+            {
+                return bitwise_lshift<shift>(self, avx {});
+            }
+        }
+
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
         {
@@ -252,6 +277,65 @@ namespace xsimd
             }
         }
 
+        template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(shift < bits, "Shift amount must be less than the number of bits in T");
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> shift) & 0x00FF);
+                    __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
+                    __m256i res = _mm256_srai_epi16(self, shift);
+                    return _mm256_or_si256(
+                        detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                           { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+                                           sign_mask, cmp_is_negative),
+                        _mm256_andnot_si256(sign_mask, res));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_srai_epi16(self, shift);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srai_epi32(self, shift);
+                }
+                else
+                {
+                    return bitwise_rshift<shift>(self, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    const __m256i byte_mask = _mm256_set1_epi16(0x00FF);
+                    __m256i u16 = _mm256_and_si256(self, byte_mask);
+                    __m256i r16 = _mm256_srli_epi16(u16, shift);
+                    return _mm256_and_si256(r16, byte_mask);
+                }
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_srli_epi16(self, shift);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srli_epi32(self, shift);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_srli_epi64(self, shift);
+                }
+                else
+                {
+                    return bitwise_rshift<shift>(self, avx {});
+                }
+            }
+        }
+
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
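The `sizeof(T) == 1` branches above exist because AVX2 has no 8-bit shift instructions: the signed path emulates a per-byte arithmetic shift with a 16-bit `srai` plus a mask that re-injects the sign bits of negative bytes, and the unsigned path emulates a logical shift with a 16-bit `srli` plus byte masking. What each byte must end up as, stated as a scalar reference and assuming the usual sign-propagating behaviour of right shift on negative values (guaranteed since C++20):

    #include <cstdint>

    // Per-byte behaviour the vector emulation has to reproduce.
    std::int8_t sra8_model(std::int8_t v, unsigned s)
    {
        return static_cast<std::int8_t>(v >> s);  // arithmetic: sign bit is replicated
    }
    std::uint8_t srl8_model(std::uint8_t v, unsigned s)
    {
        return static_cast<std::uint8_t>(v >> s); // logical: zeros shifted in
    }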
@@ -657,9 +741,35 @@ namespace xsimd
 
         // rotate_left
         template <size_t N, class A>
+        XSIMD_INLINE batch<uint8_t, A> rotate_left(batch<uint8_t, A> const& self, requires_arch<avx2>) noexcept
+        {
+            auto other = _mm256_permute2x128_si256(self, self, 0x1);
+            if (N < 16)
+            {
+                return _mm256_alignr_epi8(other, self, N);
+            }
+            else
+            {
+                return _mm256_alignr_epi8(self, other, N - 16);
+            }
+        }
+        template <size_t N, class A>
+        XSIMD_INLINE batch<int8_t, A> rotate_left(batch<int8_t, A> const& self, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int8_t>(rotate_left<N, A>(bitwise_cast<uint8_t>(self), avx2 {}));
+        }
+        template <size_t N, class A>
         XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
         {
-            return _mm256_alignr_epi8(self, self, N);
+            auto other = _mm256_permute2x128_si256(self, self, 0x1);
+            if (N < 8)
+            {
+                return _mm256_alignr_epi8(other, self, 2 * N);
+            }
+            else
+            {
+                return _mm256_alignr_epi8(self, other, 2 * (N - 8));
+            }
         }
         template <size_t N, class A>
         XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
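The old one-liner `_mm256_alignr_epi8(self, self, N)` only rotated within each 128-bit lane, because the AVX2 `vpalignr` concatenates and shifts the two lanes independently. The rewrite first builds `other` with the lanes swapped (`_mm256_permute2x128_si256(..., 0x1)`) and then picks the concatenation order from N, so the whole 256-bit register rotates. What the new byte version computes, written out as a scalar reference (illustration only):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // rotate_left<N> over a 32-byte register, as implemented above:
    // result byte i comes from input byte (i + N) % 32.
    template <std::size_t N>
    std::array<std::uint8_t, 32> rotate_left_model(const std::array<std::uint8_t, 32>& x)
    {
        std::array<std::uint8_t, 32> r{};
        for (std::size_t i = 0; i < 32; ++i)
            r[i] = x[(i + N) % 32];
        return r;
    }

The uint16 overload is the same rotation counted in 16-bit elements, hence the doubled byte offsets (2 * N and 2 * (N - 8)).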
@@ -879,9 +989,8 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_ps(self, mask);
+            return swizzle(self, mask, avx {});
         }
-
         template <class A>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
         {
@@ -903,7 +1012,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_epi32(self, mask);
+            return swizzle(self, mask, avx {});
         }
         template <class A>
         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
@@ -915,20 +1024,33 @@ namespace xsimd
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
-            return _mm256_permutevar8x32_ps(self, mask.as_batch());
+            XSIMD_IF_CONSTEXPR(detail::is_all_different(mask) && !detail::is_identity(mask))
+            {
+                // The intrinsic does NOT allow to copy the same element of the source vector to more than one element of the destination vector.
+                // one-shot 8-lane permute
+                return _mm256_permutevar8x32_ps(self, mask.as_batch());
+            }
+            return swizzle(self, mask, avx {});
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
         {
-            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
-            return _mm256_permute4x64_pd(self, mask);
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+                return _mm256_permute_pd(self, imm);
+            }
+            constexpr auto imm = detail::mod_shuffle(V0, V1, V2, V3);
+            // fallback to full 4-element permute
+            return _mm256_permute4x64_pd(self, imm);
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
         {
-            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+            constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
             return _mm256_permute4x64_epi64(self, mask);
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
@@ -429,18 +429,6 @@ namespace xsimd
             return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
         }
 
-        // rotate_left
-        template <size_t N, class A>
-        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx512bw>) noexcept
-        {
-            return _mm512_alignr_epi8(self, self, N);
-        }
-        template <size_t N, class A>
-        XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx512bw>) noexcept
-        {
-            return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), avx512bw {}));
-        }
-
         // sadd
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
@@ -496,115 +484,25 @@ namespace xsimd
         }
 
         // slide_left
-        namespace detail
-        {
-            template <size_t... Is>
-            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is == 0 ? 8 : Is - 1)... };
-            }
-
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is >= N ? Is - N : 0)... };
-            }
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is >= N ? 0xFFFF : 0x0000)... };
-            }
-        }
-
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) == 2 && (N < 64)>::type>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            constexpr unsigned BitCount = N * 8;
-            if (BitCount == 0)
-            {
-                return x;
-            }
-            if (BitCount >= 512)
-            {
-                return batch<T, A>(T(0));
-            }
-            batch<T, A> xx;
-            if (N & 1)
-            {
-                alignas(A::alignment()) uint64_t buffer[8];
-                _mm512_store_epi64(&buffer[0], x);
-                for (int i = 7; i > 0; --i)
-                    buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56);
-                buffer[0] = buffer[0] << 8;
-                xx = _mm512_load_epi64(&buffer[0]);
+            static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency.");
 
-                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
-                __m512i xl = _mm512_slli_epi64(x, 8);
-                __m512i xr = _mm512_srli_epi64(x, 56);
-                xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
-                xx = _mm512_or_si512(xr, xl);
-                if (N == 1)
-                    return xx;
-            }
-            else
-            {
-                xx = x;
-            }
-            alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+            __mmask32 mask = 0xFFFFFFFFu << ((N / 2) & 31);
+            auto slide_pattern = make_batch_constant<uint16_t, detail::make_slide_left_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x);
         }
 
         // slide_right
-        namespace detail
-        {
-            template <size_t... Is>
-            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is + 1)... };
-            }
-
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is < (32 - N) ? Is + N : 0)... };
-            }
-            template <size_t N, size_t... Is>
-            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
-            {
-                return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
-            }
-        }
-        template <size_t N, class A, class T>
+        template <size_t N, class A, class T, class = typename std::enable_if<(N & 3) == 2 && (N < 64)>::type>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
         {
-            constexpr unsigned BitCount = N * 8;
-            if (BitCount == 0)
-            {
-                return x;
-            }
-            if (BitCount >= 512)
-            {
-                return batch<T, A>(T(0));
-            }
-            batch<T, A> xx;
-            if (N & 1)
-            {
-                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
-                __m512i xr = _mm512_srli_epi64(x, 8);
-                __m512i xl = _mm512_slli_epi64(x, 56);
-                xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
-                xx = _mm512_or_si512(xr, xl);
-                if (N == 1)
-                    return xx;
-            }
-            else
-            {
-                xx = x;
-            }
-            alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
-            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+            static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency.");
+
+            __mmask32 mask = 0xFFFFFFFFu >> ((N / 2) & 31);
+            auto slide_pattern = make_batch_constant<uint16_t, detail::make_slide_right_pattern<N / 2>, A>();
+            return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x);
         }
 
         // ssub
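The rewritten `slide_left`/`slide_right` kernels drop the scalar stack round-trip of the old code: a single masked 16-bit cross-register permute (`_mm512_maskz_permutexvar_epi16`) both moves the elements and zeroes the vacated ones. The overloads are now constrained to byte counts N with N % 4 == 2 (the remaining values are presumably served by other specializations, per the static_assert message pointing at the AVX512F path). The byte-level behaviour both the old and new versions implement, as a scalar reference (illustration only):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // slide_left<N> on a 64-byte AVX-512 register: bytes move towards higher
    // indices and the vacated low N bytes become zero; slide_right mirrors it.
    template <std::size_t N>
    std::array<std::uint8_t, 64> slide_left_model(const std::array<std::uint8_t, 64>& x)
    {
        std::array<std::uint8_t, 64> r{}; // zero-filled
        for (std::size_t i = N; i < 64; ++i)
            r[i] = x[i - N];
        return r;
    }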