sequenzo-0.1.17-cp39-cp39-macosx_10_9_universal2.whl → sequenzo-0.1.18-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (86)
  1. sequenzo/__init__.py +25 -1
  2. sequenzo/big_data/clara/clara.py +1 -1
  3. sequenzo/big_data/clara/utils/get_weighted_diss.c +157 -157
  4. sequenzo/big_data/clara/utils/get_weighted_diss.cpython-39-darwin.so +0 -0
  5. sequenzo/clustering/hierarchical_clustering.py +202 -8
  6. sequenzo/define_sequence_data.py +34 -2
  7. sequenzo/dissimilarity_measures/c_code.cpython-39-darwin.so +0 -0
  8. sequenzo/dissimilarity_measures/get_substitution_cost_matrix.py +1 -1
  9. sequenzo/dissimilarity_measures/src/DHDdistance.cpp +13 -37
  10. sequenzo/dissimilarity_measures/src/LCPdistance.cpp +13 -37
  11. sequenzo/dissimilarity_measures/src/OMdistance.cpp +12 -47
  12. sequenzo/dissimilarity_measures/src/OMspellDistance.cpp +103 -67
  13. sequenzo/dissimilarity_measures/src/dp_utils.h +160 -0
  14. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +41 -16
  15. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_complex.hpp +4 -0
  16. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_details.hpp +7 -0
  17. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_logical.hpp +10 -0
  18. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_math.hpp +127 -43
  19. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_memory.hpp +30 -2
  20. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_swizzle.hpp +174 -0
  21. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/common/xsimd_common_trigo.hpp +14 -5
  22. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx.hpp +111 -54
  23. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx2.hpp +131 -9
  24. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +11 -113
  25. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp +39 -7
  26. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +336 -30
  27. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp +9 -37
  28. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_avx512vbmi2.hpp +58 -0
  29. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common.hpp +1 -0
  30. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_common_fwd.hpp +35 -2
  31. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_constants.hpp +3 -1
  32. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_emulated.hpp +17 -0
  33. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp +13 -0
  34. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp +18 -0
  35. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_fma4.hpp +13 -0
  36. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_isa.hpp +8 -0
  37. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon.hpp +363 -34
  38. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_neon64.hpp +7 -0
  39. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_rvv.hpp +13 -0
  40. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_scalar.hpp +41 -4
  41. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse2.hpp +252 -16
  42. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sse3.hpp +9 -0
  43. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +12 -1
  44. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_sve.hpp +7 -0
  45. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_vsx.hpp +892 -0
  46. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/arch/xsimd_wasm.hpp +78 -1
  47. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_arch.hpp +3 -1
  48. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_config.hpp +13 -2
  49. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_cpuid.hpp +5 -0
  50. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/config/xsimd_inline.hpp +5 -1
  51. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_all_registers.hpp +2 -0
  52. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_api.hpp +64 -1
  53. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_batch.hpp +36 -0
  54. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_rvv_register.hpp +40 -31
  55. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_traits.hpp +8 -0
  56. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/types/xsimd_vsx_register.hpp +77 -0
  57. sequenzo/dissimilarity_measures/src/xsimd/include/xsimd/xsimd.hpp +6 -0
  58. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.c +157 -157
  59. sequenzo/dissimilarity_measures/utils/get_sm_trate_substitution_cost_matrix.cpython-39-darwin.so +0 -0
  60. sequenzo/dissimilarity_measures/utils/seqconc.c +157 -157
  61. sequenzo/dissimilarity_measures/utils/seqconc.cpython-39-darwin.so +0 -0
  62. sequenzo/dissimilarity_measures/utils/seqdss.c +157 -157
  63. sequenzo/dissimilarity_measures/utils/seqdss.cpython-39-darwin.so +0 -0
  64. sequenzo/dissimilarity_measures/utils/seqdur.c +157 -157
  65. sequenzo/dissimilarity_measures/utils/seqdur.cpython-39-darwin.so +0 -0
  66. sequenzo/dissimilarity_measures/utils/seqlength.c +157 -157
  67. sequenzo/dissimilarity_measures/utils/seqlength.cpython-39-darwin.so +0 -0
  68. sequenzo/sequence_characteristics/__init__.py +4 -0
  69. sequenzo/sequence_characteristics/complexity_index.py +17 -57
  70. sequenzo/sequence_characteristics/overall_cross_sectional_entropy.py +177 -111
  71. sequenzo/sequence_characteristics/plot_characteristics.py +30 -11
  72. sequenzo/sequence_characteristics/simple_characteristics.py +1 -0
  73. sequenzo/sequence_characteristics/state_frequencies_and_entropy_per_sequence.py +9 -3
  74. sequenzo/sequence_characteristics/turbulence.py +47 -67
  75. sequenzo/sequence_characteristics/variance_of_spell_durations.py +19 -9
  76. sequenzo/sequence_characteristics/within_sequence_entropy.py +5 -58
  77. sequenzo/visualization/plot_sequence_index.py +58 -35
  78. sequenzo/visualization/plot_state_distribution.py +57 -36
  79. sequenzo/with_event_history_analysis/__init__.py +35 -0
  80. sequenzo/with_event_history_analysis/sequence_analysis_multi_state_model.py +850 -0
  81. sequenzo/with_event_history_analysis/sequence_history_analysis.py +283 -0
  82. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/METADATA +7 -6
  83. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/RECORD +86 -79
  84. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/WHEEL +0 -0
  85. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/licenses/LICENSE +0 -0
  86. {sequenzo-0.1.17.dist-info → sequenzo-0.1.18.dist-info}/top_level.txt +0 -0
@@ -292,6 +292,36 @@ namespace xsimd
                 return {};
             }
         }
+        template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(shift < bits, "Count must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(shift == 0)
+            {
+                return self;
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                // 8-bit left shift via 16-bit shift + mask
+                __m128i shifted = _mm_slli_epi16(self, static_cast<int>(shift));
+                __m128i mask = _mm_set1_epi8(static_cast<char>(0xFF << shift));
+                return _mm_and_si128(shifted, mask);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_slli_epi16(self, static_cast<int>(shift));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_slli_epi32(self, static_cast<int>(shift));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_slli_epi64(self, static_cast<int>(shift));
+            }
+            return bitwise_lshift<shift>(self, common {});
+        }

         // bitwise_not
         template <class A>
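The interesting branch above is sizeof(T) == 1: SSE2 has no byte-granularity shift, so the code shifts 16-bit lanes and then clears the low `shift` bits of every byte, which is exactly where bits from the neighbouring byte leak in. A minimal standalone sketch of that trick (the helper name slli_epi8 and the test data are mine, not part of the diff):

    #include <emmintrin.h> // SSE2
    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper mirroring the sizeof(T) == 1 branch: shift 16-bit
    // lanes, then mask off the low Shift bits of each byte, which received
    // bits from the byte below.
    template <int Shift>
    __m128i slli_epi8(__m128i v)
    {
        __m128i shifted = _mm_slli_epi16(v, Shift);
        __m128i mask = _mm_set1_epi8(static_cast<char>(0xFF << Shift));
        return _mm_and_si128(shifted, mask);
    }

    int main()
    {
        alignas(16) uint8_t in[16], out[16];
        for (int i = 0; i < 16; ++i)
            in[i] = static_cast<uint8_t>(i * 17); // 0x00, 0x11, ..., 0xFF
        __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
        _mm_store_si128(reinterpret_cast<__m128i*>(out), slli_epi8<3>(v));
        for (int i = 0; i < 16; ++i) // each lane matches uint8_t(in[i] << 3)
            std::printf("0x%02X << 3 = 0x%02X\n", static_cast<unsigned>(in[i]), static_cast<unsigned>(out[i]));
        return 0;
    }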
@@ -420,6 +450,63 @@ namespace xsimd
                 }
             }
         }
+        template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(shift < bits,
+                          "Shift must be less than the number of value bits in the type");
+
+            XSIMD_IF_CONSTEXPR(shift == 0)
+            {
+                return self;
+            }
+
+            XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // 8-bit arithmetic right shift via 16-bit shift + sign-extension handling.
+                    __m128i shifted = _mm_srai_epi16(self, static_cast<int>(shift));
+                    __m128i sign_mask = _mm_set1_epi16(static_cast<short>(0xFF00 >> shift));
+                    __m128i cmp_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
+                    return _mm_or_si128(_mm_and_si128(sign_mask, cmp_negative),
+                                        _mm_andnot_si128(sign_mask, shifted));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_srai_epi16(self, static_cast<int>(shift));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_srai_epi32(self, static_cast<int>(shift));
+                }
+                // No 64-bit arithmetic right shift in SSE2; fall back
+                return bitwise_rshift<shift>(self, common {});
+            }
+            else // unsigned / logical right shift
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // Emulate byte-wise logical right shift using 16-bit shifts + per-byte mask.
+                    __m128i s16 = _mm_srli_epi16(self, static_cast<int>(shift));
+                    __m128i mask = _mm_set1_epi8(static_cast<char>(0xFFu >> shift));
+                    return _mm_and_si128(s16, mask);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_srli_epi16(self, static_cast<int>(shift));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_srli_epi32(self, static_cast<int>(shift));
+                }
+                else // sizeof(T) == 8
+                {
+                    return _mm_srli_epi64(self, static_cast<int>(shift));
+                }
+            }
+        }

         // bitwise_xor
         template <class A>
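For signed bytes the 16-bit trick needs a second repair: _mm_srai_epi16 sign-extends each 16-bit lane from its high byte, so the odd (high) bytes come out right, but the top `shift` bits of each even (low) byte are filled from its neighbour instead of from its own sign. The sketch below rebuilds those bits from a per-byte sign mask obtained with _mm_cmpgt_epi8; the helper name srai_epi8 and the `(0xFF00 >> Shift) & 0xFF` correction mask are my own choices for a self-contained demo, not a copy of the hunk:

    #include <emmintrin.h> // SSE2
    #include <cstdint>
    #include <cstdio>

    template <int Shift>
    __m128i srai_epi8(__m128i v)
    {
        __m128i shifted   = _mm_srai_epi16(v, Shift);                  // high bytes already correct
        __m128i sign_mask = _mm_set1_epi16((0xFF00 >> Shift) & 0xFF);  // top Shift bits of each low byte
        __m128i negative  = _mm_cmpgt_epi8(_mm_setzero_si128(), v);    // 0xFF where the byte is negative
        return _mm_or_si128(_mm_and_si128(sign_mask, negative),
                            _mm_andnot_si128(sign_mask, shifted));
    }

    int main()
    {
        alignas(16) int8_t in[16], out[16];
        for (int i = 0; i < 16; ++i)
            in[i] = static_cast<int8_t>(i * 17 - 128); // -128 .. 127
        __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
        _mm_store_si128(reinterpret_cast<__m128i*>(out), srai_epi8<3>(v));
        for (int i = 0; i < 16; ++i) // scalar check (arithmetic shift on mainstream compilers)
            std::printf("%4d >> 3 = %4d (scalar %4d)\n", in[i], out[i], static_cast<int8_t>(in[i] >> 3));
        return 0;
    }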
@@ -673,6 +760,53 @@ namespace xsimd
             return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
         }

+        // first
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(self));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+#if defined(__x86_64__)
+                return static_cast<T>(_mm_cvtsi128_si64(self));
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, self);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
         // from_mask
         template <class A>
         XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
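The new first() overloads avoid a store/reload round trip: _mm_cvtss_f32 and _mm_cvtsd_f64 read lane 0 of a float/double register directly, and _mm_cvtsi128_si32 plus masking recovers the narrower integer lanes (with _mm_cvtsi128_si64 or a 64-bit spill on 32-bit targets). A small standalone sketch with arbitrary test values:

    #include <emmintrin.h> // SSE2
    #include <cstdio>

    int main()
    {
        __m128  f = _mm_set_ps(4.f, 3.f, 2.f, 1.f);       // lane 0 holds 1.f
        __m128d d = _mm_set_pd(2.0, 1.0);                  // lane 0 holds 1.0
        __m128i i = _mm_set_epi32(40, 30, 20, 0x1234ABCD); // lane 0 holds 0x1234ABCD

        std::printf("float  lane 0: %g\n", _mm_cvtss_f32(f));
        std::printf("double lane 0: %g\n", _mm_cvtsd_f64(d));
        std::printf("8-bit  lane 0: 0x%02X\n", _mm_cvtsi128_si32(i) & 0xFF);   // 0xCD
        std::printf("16-bit lane 0: 0x%04X\n", _mm_cvtsi128_si32(i) & 0xFFFF); // 0xABCD
        std::printf("32-bit lane 0: 0x%08X\n", _mm_cvtsi128_si32(i));          // 0x1234ABCD
        return 0;
    }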
@@ -1090,7 +1224,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
         {
-            return _mm_max_ps(self, other);
+            return _mm_max_ps(other, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
@@ -1100,14 +1234,14 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
         {
-            return _mm_max_pd(self, other);
+            return _mm_max_pd(other, self);
         }

         // min
         template <class A>
         XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
         {
-            return _mm_min_ps(self, other);
+            return _mm_min_ps(other, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
@@ -1117,7 +1251,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
         {
-            return _mm_min_pd(self, other);
+            return _mm_min_pd(other, self);
         }

         // mul
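Swapping the operand order of _mm_max_ps/_mm_min_ps (and their _pd variants) is not cosmetic: these instructions return their second operand whenever the comparison is unordered (one input is NaN) or both inputs are zeros, so passing `self` second makes the result fall back to the first argument of xsimd's max/min in those cases. Reading that as the motivation is my own inference, not something stated in the diff; the snippet below just shows the observable difference:

    #include <xmmintrin.h> // SSE
    #include <cmath>
    #include <cstdio>

    int main()
    {
        __m128 self  = _mm_set1_ps(1.0f);
        __m128 other = _mm_set1_ps(std::nanf(""));

        float a = _mm_cvtss_f32(_mm_max_ps(self, other)); // NaN: second operand wins
        float b = _mm_cvtss_f32(_mm_max_ps(other, self)); // 1.0: second operand wins
        std::printf("old order -> %g, new order -> %g\n", a, b);
        return 0;
    }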
@@ -1243,7 +1377,7 @@ namespace xsimd
             }
             else
             {
-                return hadd(self, common {});
+                return reduce_add(self, common {});
             }
         }

@@ -1269,10 +1403,10 @@ namespace xsimd
             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
             batch<T, A> acc2 = max(acc1, step2);
             if (sizeof(T) == 2)
-                return acc2.get(0);
+                return first(acc2, A {});
             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
             batch<T, A> acc3 = max(acc2, step3);
-            return acc3.get(0);
+            return first(acc3, A {});
         }

         // reduce_min
@@ -1291,10 +1425,56 @@ namespace xsimd
             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
             batch<T, A> acc2 = min(acc1, step2);
             if (sizeof(T) == 2)
-                return acc2.get(0);
+                return first(acc2, A {});
             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
             batch<T, A> acc3 = min(acc2, step3);
-            return acc3.get(0);
+            return first(acc3, A {});
+        }
+
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+            return _mm_cvtss_f32(tmp1);
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+                tmp1 = tmp1 * self;
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+                tmp2 = tmp2 * tmp1;
+                return _mm_cvtsi128_si32(tmp2);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+                auto tmp2 = tmp1 * self;
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(tmp2);
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, tmp2);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                return reduce_mul(self, common {});
+            }
         }

         // rsqrt
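The float reduce_mul folds the register onto itself: _mm_movehl_ps brings lanes 2 and 3 down so one _mm_mul_ps forms two partial products, and a final _mm_mul_ss combines the remaining pair. A standalone sketch of that path (the helper name reduce_mul_ps is mine):

    #include <xmmintrin.h> // SSE
    #include <cstdio>

    static float reduce_mul_ps(__m128 v)
    {
        __m128 hi   = _mm_movehl_ps(v, v);                              // {v2, v3, v2, v3}
        __m128 fold = _mm_mul_ps(v, hi);                                // lane 0 = v0*v2, lane 1 = v1*v3
        __m128 last = _mm_mul_ss(fold, _mm_shuffle_ps(fold, fold, 1));  // (v0*v2) * (v1*v3)
        return _mm_cvtss_f32(last);
    }

    int main()
    {
        __m128 v = _mm_set_ps(5.f, 4.f, 3.f, 2.f);       // lanes {2, 3, 4, 5}
        std::printf("product = %g\n", reduce_mul_ps(v)); // 120
        return 0;
    }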
@@ -1641,22 +1821,78 @@ namespace xsimd
         }

         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
-        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
+        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
         {
-            // permute within each lane
+            constexpr bool is_identity = detail::is_identity(mask);
+            constexpr bool is_dup_lo = detail::is_dup_lo(mask);
+            constexpr bool is_dup_hi = detail::is_dup_hi(mask);
+
+            XSIMD_IF_CONSTEXPR(is_identity)
+            {
+                return self;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup_lo)
+            {
+                // permute the low half
+                constexpr int imm = detail::mod_shuffle(V0, V1, V2, V3);
+                const auto lo = _mm_shufflelo_epi16(self, imm);
+                // broadcast that 64-bit low half into both halves
+                const auto lo_all = _mm_unpacklo_epi64(lo, lo);
+                return lo_all;
+            }
+            XSIMD_IF_CONSTEXPR(is_dup_hi)
+            {
+                // permute the high half
+                constexpr int imm = detail::mod_shuffle(V4, V5, V6, V7);
+                const auto hi = _mm_shufflehi_epi16(self, imm);
+                // broadcast that 64-bit high half into both halves
+                const auto hi_all = _mm_unpackhi_epi64(hi, hi);
+                return hi_all;
+            }
+            // Only pick elements from the low lane
+            XSIMD_IF_CONSTEXPR((V0 < 4) && (V1 < 4) && (V2 < 4) && (V3 < 4) && (V4 < 4) && (V5 < 4) && (V6 < 4) && (V7 < 4))
+            {
+                // permute within each sub lane
+                constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+                constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
+                __m128i lol = _mm_shufflelo_epi16(self, mask_lo);
+                __m128i loh = _mm_shufflelo_epi16(self, mask_hi);
+
+                // generate temporary lanes
+                return _mm_unpacklo_epi64(lol, loh);
+            }
+            // Only pick elements from the high lane
+            XSIMD_IF_CONSTEXPR((V0 >= 4) && (V1 >= 4) && (V2 >= 4) && (V3 >= 4) && (V4 >= 4) && (V5 >= 4) && (V6 >= 4) && (V7 >= 4))
+            {
+                // permute within each sub lane
+                constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+                constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
+                __m128i hil = _mm_shufflehi_epi16(self, mask_lo);
+                __m128i hih = _mm_shufflehi_epi16(self, mask_hi);
+
+                // generate temporary lanes
+                return _mm_unpackhi_epi64(hil, hih);
+            }
+
+            // Generic case
+
+            // permute within each sub lane
             constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
             constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
-            __m128i lo = _mm_shufflelo_epi16(self, mask_lo);
-            __m128i hi = _mm_shufflehi_epi16(self, mask_hi);
+            __m128i lol = _mm_shufflelo_epi16(self, mask_lo);
+            __m128i loh = _mm_shufflelo_epi16(self, mask_hi);
+            __m128i hil = _mm_shufflehi_epi16(self, mask_lo);
+            __m128i hih = _mm_shufflehi_epi16(self, mask_hi);

-            __m128i lo_lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(lo), _MM_SHUFFLE2(0, 0)));
-            __m128i hi_hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hi), _mm_castsi128_pd(hi), _MM_SHUFFLE2(1, 1)));
+            // generate temporary lanes
+            __m128i lo = _mm_unpacklo_epi64(lol, loh);
+            __m128i hi = _mm_unpackhi_epi64(hil, hih);

             // mask to choose the right lane
             batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;

             // blend the two permutes
-            return select(blend_mask, batch<uint16_t, A>(lo_lo), batch<uint16_t, A>(hi_hi));
+            return select(blend_mask, batch<uint16_t, A>(lo), batch<uint16_t, A>(hi));
         }

         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
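The rewritten uint16_t swizzle now short-circuits several compile-time patterns (identity, duplicated low half, duplicated high half, all indices below 4, all indices at or above 4) before falling back to the generic shufflelo/shufflehi + unpack + blend path. The sketch below works the "all indices come from the low half" case by hand for the hypothetical index set {3, 2, 1, 0, 0, 1, 2, 3}:

    #include <emmintrin.h> // SSE2
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        alignas(16) uint16_t in[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
        __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));

        // Each _mm_shufflelo_epi16 builds one 64-bit half of the result from the
        // low half of the input; _mm_unpacklo_epi64 glues the two halves together.
        __m128i half0 = _mm_shufflelo_epi16(v, _MM_SHUFFLE(0, 1, 2, 3)); // low half -> {13, 12, 11, 10}
        __m128i half1 = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 2, 1, 0)); // low half -> {10, 11, 12, 13}
        __m128i out = _mm_unpacklo_epi64(half0, half1);

        alignas(16) uint16_t res[8];
        _mm_store_si128(reinterpret_cast<__m128i*>(res), out);
        for (int i = 0; i < 8; ++i)
            std::printf("%d ", res[i]); // 13 12 11 10 10 11 12 13
        std::printf("\n");
        return 0;
    }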
@@ -51,6 +51,15 @@ namespace xsimd
             return _mm_cvtss_f32(tmp1);
         }

+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse3>) noexcept
+        {
+            __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
+            return _mm_cvtss_f32(tmp2);
+        }
+
     }

 }
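The SSE3 overload replaces the final _mm_shuffle_ps step of the SSE2 version with _mm_movehdup_ps, which duplicates the odd lanes in a single instruction. A sketch of the same reduction outside xsimd (the helper name reduce_mul_sse3 is mine; build with -msse3 or better):

    #include <pmmintrin.h> // SSE3
    #include <cstdio>

    static float reduce_mul_sse3(__m128 v)
    {
        __m128 fold = _mm_mul_ps(v, _mm_movehl_ps(v, v)); // lane 0 = v0*v2, lane 1 = v1*v3
        __m128 dup  = _mm_movehdup_ps(fold);              // lane 0 = v1*v3
        return _mm_cvtss_f32(_mm_mul_ps(fold, dup));      // (v0*v2) * (v1*v3)
    }

    int main()
    {
        __m128 v = _mm_set_ps(5.f, 4.f, 3.f, 2.f);
        std::printf("product = %g\n", reduce_mul_sse3(v)); // 120
        return 0;
    }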
@@ -107,11 +107,22 @@ namespace xsimd

         // rotate_left
         template <size_t N, class A>
-        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<ssse3>) noexcept
+        XSIMD_INLINE batch<uint8_t, A> rotate_left(batch<uint8_t, A> const& self, requires_arch<ssse3>) noexcept
         {
             return _mm_alignr_epi8(self, self, N);
         }
         template <size_t N, class A>
+        XSIMD_INLINE batch<int8_t, A> rotate_left(batch<int8_t, A> const& self, requires_arch<ssse3>) noexcept
+        {
+            return bitwise_cast<int8_t>(rotate_left<N, A>(bitwise_cast<uint8_t>(self), ssse3 {}));
+        }
+
+        template <size_t N, class A>
+        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<ssse3>) noexcept
+        {
+            return _mm_alignr_epi8(self, self, 2 * N);
+        }
+        template <size_t N, class A>
         XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<ssse3>) noexcept
         {
             return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), ssse3 {}));
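All of the new rotate_left overloads lean on the same SSSE3 idiom: _mm_alignr_epi8(v, v, offset) concatenates the register with itself and takes a 16-byte window starting `offset` bytes in, so output byte i becomes input byte (i + offset) % 16; that is why the 16-bit overload passes a byte offset of 2 * N. A standalone sketch with an arbitrary offset of 3 (build with -mssse3 or better):

    #include <tmmintrin.h> // SSSE3
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        alignas(16) uint8_t in[16], out[16];
        for (int i = 0; i < 16; ++i)
            in[i] = static_cast<uint8_t>(i);
        __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
        __m128i r = _mm_alignr_epi8(v, v, 3); // byte i <- byte (i + 3) % 16
        _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
        for (int i = 0; i < 16; ++i)
            std::printf("%d ", out[i]); // 3 4 5 ... 15 0 1 2
        std::printf("\n");
        return 0;
    }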
@@ -949,6 +949,13 @@ namespace xsimd
             return svsel(index_predicate, broadcast<A, T>(val, sve {}), arg);
         }

+        // first
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sve>) noexcept
+        {
+            return self.data[0];
+        }
+
         // all
         template <class A, class T, detail::sve_enable_all_t<T> = 0>
         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept