@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-deprecated.h +4 -4
  3. package/include/harfbuzz/hb-font.h +120 -9
  4. package/include/harfbuzz/hb-version.h +3 -3
  5. package/include/hwy/abort.h +2 -19
  6. package/include/hwy/aligned_allocator.h +11 -7
  7. package/include/hwy/auto_tune.h +504 -0
  8. package/include/hwy/base.h +425 -104
  9. package/include/hwy/cache_control.h +16 -0
  10. package/include/hwy/detect_compiler_arch.h +32 -1
  11. package/include/hwy/detect_targets.h +251 -67
  12. package/include/hwy/foreach_target.h +35 -0
  13. package/include/hwy/highway.h +185 -76
  14. package/include/hwy/nanobenchmark.h +1 -19
  15. package/include/hwy/ops/arm_neon-inl.h +969 -458
  16. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  17. package/include/hwy/ops/emu128-inl.h +97 -11
  18. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  19. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  20. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  21. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  22. package/include/hwy/ops/rvv-inl.h +546 -51
  23. package/include/hwy/ops/scalar-inl.h +77 -22
  24. package/include/hwy/ops/set_macros-inl.h +138 -17
  25. package/include/hwy/ops/shared-inl.h +50 -10
  26. package/include/hwy/ops/wasm_128-inl.h +137 -92
  27. package/include/hwy/ops/x86_128-inl.h +773 -214
  28. package/include/hwy/ops/x86_256-inl.h +712 -255
  29. package/include/hwy/ops/x86_512-inl.h +429 -753
  30. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  31. package/include/hwy/per_target.h +2 -1
  32. package/include/hwy/profiler.h +622 -486
  33. package/include/hwy/targets.h +62 -20
  34. package/include/hwy/timer-inl.h +8 -160
  35. package/include/hwy/timer.h +170 -3
  36. package/include/hwy/x86_cpuid.h +81 -0
  37. package/include/libheif/heif_cxx.h +25 -5
  38. package/include/libheif/heif_regions.h +5 -5
  39. package/include/libheif/heif_version.h +2 -2
  40. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  41. package/include/libxml2/libxml/xmlversion.h +4 -4
  42. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  43. package/include/pango-1.0/pango/pango-features.h +3 -3
  44. package/include/pango-1.0/pango/pango-font.h +30 -0
  45. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  46. package/include/pixman-1/pixman-version.h +2 -2
  47. package/include/webp/decode.h +11 -2
  48. package/include/webp/demux.h +2 -0
  49. package/include/webp/encode.h +2 -0
  50. package/include/webp/mux_types.h +1 -0
  51. package/include/webp/sharpyuv/sharpyuv.h +1 -1
  52. package/include/webp/types.h +2 -2
  53. package/include/zlib.h +3 -3
  54. package/package.json +1 -1
  55. package/versions.json +11 -11
@@ -538,6 +538,17 @@ HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
538
538
  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
539
539
  }
540
540
 
541
+ template <class V, HWY_IF_SIGNED_V(V),
542
+ HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
543
+ HWY_API V AverageRound(V a, V b) {
544
+ const DFromV<decltype(a)> d;
545
+ const RebindToUnsigned<decltype(d)> du;
546
+ const V sign_bit = SignBit(d);
547
+ return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)),
548
+ BitCast(du, Xor(b, sign_bit)))),
549
+ sign_bit);
550
+ }
551
+
541
552
  // ------------------------------ Absolute value
542
553
 
543
554
  // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
@@ -892,6 +903,24 @@ HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
892
903
  return Vec128<double, N>{wasm_f64x2_pmax(b.raw, a.raw)};
893
904
  }
894
905
 
906
+ // ------------------------------ MinNumber and MaxNumber
907
+
908
+ #ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
909
+ #undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
910
+ #else
911
+ #define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
912
+ #endif
913
+
914
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
915
+ HWY_API V MinNumber(V a, V b) {
916
+ return Min(a, IfThenElse(IsNaN(b), a, b));
917
+ }
918
+
919
+ template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
920
+ HWY_API V MaxNumber(V a, V b) {
921
+ return Max(a, IfThenElse(IsNaN(b), a, b));
922
+ }
923
+
895
924
  // ------------------------------ Integer multiplication
896
925
 
897
926
  // Unsigned
@@ -1093,9 +1122,9 @@ HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
1093
1122
  return Vec128<double, N>{wasm_f64x2_div(a.raw, b.raw)};
1094
1123
  }
1095
1124
 
1096
- template <typename T, size_t N>
1097
- HWY_API Vec128<T, N> ApproximateReciprocal(const Vec128<T, N> v) {
1098
- return Set(DFromV<decltype(v)>(), T{1.0}) / v;
1125
+ template <class V, HWY_IF_F32(TFromV<V>)>
1126
+ HWY_API V ApproximateReciprocal(const V v) {
1127
+ return Set(DFromV<decltype(v)>(), 1.0f) / v;
1099
1128
  }
1100
1129
 
1101
1130
  // Integer overload defined in generic_ops-inl.h.
@@ -1143,10 +1172,10 @@ HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
1143
1172
  }
1144
1173
 
1145
1174
  // Approximate reciprocal square root
1146
- template <typename T, size_t N>
1147
- HWY_API Vec128<T, N> ApproximateReciprocalSqrt(const Vec128<T, N> v) {
1175
+ template <class V, HWY_IF_F32(TFromV<V>)>
1176
+ HWY_API V ApproximateReciprocalSqrt(V v) {
1148
1177
  // TODO(eustas): find a cheaper way to calculate this.
1149
- return Set(DFromV<decltype(v)>(), T{1.0}) / Sqrt(v);
1178
+ return Set(DFromV<decltype(v)>(), static_cast<TFromV<V>>(1.0)) / Sqrt(v);
1150
1179
  }
1151
1180
 
1152
1181
  // ------------------------------ Floating-point rounding
@@ -3185,6 +3214,19 @@ HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
3185
3214
  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
3186
3215
  }
3187
3216
 
3217
+ template <size_t N>
3218
+ HWY_API Vec128<float16_t, N> InterleaveUpper(Vec128<float16_t, N> a,
3219
+ Vec128<float16_t, N> b) {
3220
+ return Vec128<float16_t, N>{
3221
+ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
3222
+ }
3223
+ template <size_t N>
3224
+ HWY_API Vec128<bfloat16_t, N> InterleaveUpper(Vec128<bfloat16_t, N> a,
3225
+ Vec128<bfloat16_t, N> b) {
3226
+ return Vec128<bfloat16_t, N>{
3227
+ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
3228
+ }
3229
+
3188
3230
  template <size_t N>
3189
3231
  HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
3190
3232
  Vec128<float, N> b) {
@@ -3885,18 +3927,26 @@ HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
3885
3927
  }
3886
3928
 
3887
3929
  // ------------------------------ SwapAdjacentBlocks
3888
-
3889
3930
  template <typename T, size_t N>
3890
3931
  HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
3891
3932
  return v;
3892
3933
  }
3893
3934
 
3894
- // ------------------------------ ReverseBlocks
3935
+ // ------------------------------ InterleaveEvenBlocks
3936
+ template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
3937
+ HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
3938
+ return a;
3939
+ }
3940
+ // ------------------------------ InterleaveOddBlocks
3941
+ template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)>
3942
+ HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
3943
+ return a;
3944
+ }
3895
3945
 
3896
- // Single block: no change
3946
+ // ------------------------------ ReverseBlocks
3897
3947
  template <class D>
3898
3948
  HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
3899
- return v;
3949
+ return v; // Single block: no change
3900
3950
  }
3901
3951
 
3902
3952
  // ================================================== CONVERT
@@ -4791,11 +4841,19 @@ HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
4791
4841
  }
4792
4842
 
4793
4843
  // ------------------------------ NearestInt (Round)
4794
- template <size_t N>
4795
- HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
4844
+ template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
4845
+ HWY_API Vec128<MakeSigned<T>, N> NearestInt(const Vec128<T, N> v) {
4796
4846
  return ConvertTo(RebindToSigned<DFromV<decltype(v)>>(), Round(v));
4797
4847
  }
4798
4848
 
4849
+ // ------------------------------ DemoteToNearestInt (Round)
4850
+ template <class DI32, HWY_IF_I32_D(DI32)>
4851
+ HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
4852
+ VFromD<Rebind<double, DI32>> v) {
4853
+ // No single instruction, round then demote.
4854
+ return DemoteTo(di32, Round(v));
4855
+ }
4856
+
4799
4857
  // ================================================== MISC
4800
4858
 
4801
4859
  // ------------------------------ SumsOf8 (ShiftRight, Add)
@@ -4914,76 +4972,74 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
4914
4972
 
4915
4973
  namespace detail {
4916
4974
 
4917
- // Full
4918
- template <typename T>
4919
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4920
- const Mask128<T> mask) {
4975
+ // Returns the lowest N bits for the BitsFromMask result.
4976
+ template <class D>
4977
+ constexpr uint64_t OnlyActive(D d, uint64_t bits) {
4978
+ return (d.MaxBytes() == 16) ? bits : bits & ((1ull << d.MaxLanes()) - 1);
4979
+ }
4980
+
4981
+ } // namespace detail
4982
+
4983
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 16)>
4984
+ HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
4921
4985
  alignas(16) uint64_t lanes[2];
4922
4986
  wasm_v128_store(lanes, mask.raw);
4923
4987
 
4924
4988
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
4925
4989
  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
4926
4990
  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
4927
- return (hi + lo);
4991
+ return hi + lo; // exactly 16 bits, no OnlyActive required
4928
4992
  }
4929
4993
 
4930
- // 64-bit
4931
- template <typename T>
4932
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4933
- const Mask128<T, 8> mask) {
4994
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
4995
+ HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
4934
4996
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
4935
- return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
4936
- kMagic) >>
4937
- 56;
4997
+ const uint64_t bytes =
4998
+ static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
4999
+ return (bytes * kMagic) >> 56; // exactly 8 bits, no OnlyActive required
4938
5000
  }
4939
5001
 
4940
5002
  // 32-bit or less: need masking
4941
- template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
4942
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
4943
- const Mask128<T, N> mask) {
5003
+ template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4)>
5004
+ HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
4944
5005
  uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
4945
5006
  // Clear potentially undefined bytes.
4946
- bytes &= (1ULL << (N * 8)) - 1;
5007
+ bytes &= (1ULL << (Lanes(d) * 8)) - 1;
4947
5008
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
4948
- return (bytes * kMagic) >> 56;
5009
+ return detail::OnlyActive(d, (bytes * kMagic) >> 56);
4949
5010
  }
4950
5011
 
4951
- template <typename T, size_t N>
4952
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
4953
- const Mask128<T, N> mask) {
5012
+ template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)>
5013
+ HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
4954
5014
  // Remove useless lower half of each u16 while preserving the sign bit.
5015
+ const Rebind<uint8_t, D> d8;
5016
+ using M8 = MFromD<decltype(d8)>;
4955
5017
  const __i16x8 zero = wasm_i16x8_splat(0);
4956
- const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
4957
- return BitsFromMask(hwy::SizeTag<1>(), mask8);
5018
+ const M8 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
5019
+ return detail::OnlyActive(d8, BitsFromMask(d8, mask8));
4958
5020
  }
4959
5021
 
4960
- template <typename T, size_t N>
4961
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
4962
- const Mask128<T, N> mask) {
5022
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)>
5023
+ HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
4963
5024
  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
4964
5025
  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
4965
5026
  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
4966
5027
  alignas(16) uint32_t lanes[4];
4967
5028
  wasm_v128_store(lanes, sliced_mask);
4968
- return lanes[0] | lanes[1] | lanes[2] | lanes[3];
5029
+ return detail::OnlyActive(d, lanes[0] | lanes[1] | lanes[2] | lanes[3]);
4969
5030
  }
4970
5031
 
4971
- template <typename T, size_t N>
4972
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
4973
- const Mask128<T, N> mask) {
5032
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)>
5033
+ HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
4974
5034
  const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
4975
5035
  const __i64x2 slice = wasm_i64x2_make(1, 2);
4976
5036
  const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
4977
5037
  alignas(16) uint64_t lanes[2];
4978
5038
  wasm_v128_store(lanes, sliced_mask);
4979
- return lanes[0] | lanes[1];
5039
+ return detail::OnlyActive(d, lanes[0] | lanes[1]);
4980
5040
  }
4981
5041
 
4982
- // Returns the lowest N bits for the BitsFromMask result.
4983
- template <typename T, size_t N>
4984
- constexpr uint64_t OnlyActive(uint64_t bits) {
4985
- return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
4986
- }
5042
+ namespace detail {
4987
5043
 
4988
5044
  // Returns 0xFF for bytes with index >= N, otherwise 0.
4989
5045
  template <size_t N>
@@ -5015,53 +5071,40 @@ constexpr __i8x16 BytesAbove() {
5015
5071
  : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
5016
5072
  }
5017
5073
 
5018
- template <typename T, size_t N>
5019
- HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
5020
- return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5021
- }
5074
+ } // namespace detail
5022
5075
 
5023
- template <typename T>
5024
- HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
5025
- return PopCount(BitsFromMask(tag, m));
5076
+ // `bits` points to at least 8 writable bytes.
5077
+ template <class D>
5078
+ HWY_API size_t StoreMaskBits(D d, const MFromD<D> mask, uint8_t* bits) {
5079
+ const uint64_t mask_bits = BitsFromMask(d, mask);
5080
+ const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
5081
+ CopyBytes<kNumBytes>(&mask_bits, bits);
5082
+ return kNumBytes;
5026
5083
  }
5027
5084
 
5028
- template <typename T>
5029
- HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
5030
- return PopCount(BitsFromMask(tag, m));
5085
+ template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
5086
+ HWY_API size_t CountTrue(D d, const MFromD<D> m) {
5087
+ return PopCount(BitsFromMask(d, m));
5031
5088
  }
5032
-
5033
- template <typename T>
5034
- HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
5089
+ template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
5090
+ HWY_API size_t CountTrue(D d, const MFromD<D> m) {
5091
+ return PopCount(BitsFromMask(d, m));
5092
+ }
5093
+ template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 16)>
5094
+ HWY_API size_t CountTrue(D /*d*/, const MFromD<D> m) {
5035
5095
  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
5036
5096
  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
5037
5097
  alignas(16) uint64_t lanes[2];
5038
5098
  wasm_v128_store(lanes, shifted_bits);
5039
5099
  return PopCount(lanes[0] | lanes[1]);
5040
5100
  }
5041
-
5042
- template <typename T>
5043
- HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
5101
+ template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 16)>
5102
+ HWY_API size_t CountTrue(D /*d*/, const MFromD<D> m) {
5044
5103
  alignas(16) int64_t lanes[2];
5045
5104
  wasm_v128_store(lanes, m.raw);
5046
5105
  return static_cast<size_t>(-(lanes[0] + lanes[1]));
5047
5106
  }
5048
5107
 
5049
- } // namespace detail
5050
-
5051
- // `bits` points to at least 8 writable bytes.
5052
- template <class D>
5053
- HWY_API size_t StoreMaskBits(D d, const MFromD<D> mask, uint8_t* bits) {
5054
- const uint64_t mask_bits = detail::BitsFromMask(mask);
5055
- const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
5056
- CopyBytes<kNumBytes>(&mask_bits, bits);
5057
- return kNumBytes;
5058
- }
5059
-
5060
- template <class D, HWY_IF_V_SIZE_D(D, 16)>
5061
- HWY_API size_t CountTrue(D /* tag */, const MFromD<D> m) {
5062
- return detail::CountTrue(hwy::SizeTag<sizeof(TFromD<D>)>(), m);
5063
- }
5064
-
5065
5108
  // Partial
5066
5109
  template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
5067
5110
  HWY_API size_t CountTrue(D d, MFromD<D> m) {
@@ -5121,26 +5164,26 @@ HWY_API bool AllTrue(D d, const MFromD<D> m) {
5121
5164
  }
5122
5165
 
5123
5166
  template <class D>
5124
- HWY_API size_t FindKnownFirstTrue(D /* tag */, const MFromD<D> mask) {
5125
- const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
5167
+ HWY_API size_t FindKnownFirstTrue(D d, const MFromD<D> mask) {
5168
+ const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
5126
5169
  return Num0BitsBelowLS1Bit_Nonzero32(bits);
5127
5170
  }
5128
5171
 
5129
5172
  template <class D>
5130
- HWY_API intptr_t FindFirstTrue(D /* tag */, const MFromD<D> mask) {
5131
- const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
5173
+ HWY_API intptr_t FindFirstTrue(D d, const MFromD<D> mask) {
5174
+ const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
5132
5175
  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1;
5133
5176
  }
5134
5177
 
5135
5178
  template <class D>
5136
- HWY_API size_t FindKnownLastTrue(D /* tag */, const MFromD<D> mask) {
5137
- const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
5179
+ HWY_API size_t FindKnownLastTrue(D d, const MFromD<D> mask) {
5180
+ const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
5138
5181
  return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits);
5139
5182
  }
5140
5183
 
5141
5184
  template <class D>
5142
- HWY_API intptr_t FindLastTrue(D /* tag */, const MFromD<D> mask) {
5143
- const uint32_t bits = static_cast<uint32_t>(detail::BitsFromMask(mask));
5185
+ HWY_API intptr_t FindLastTrue(D d, const MFromD<D> mask) {
5186
+ const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
5144
5187
  return bits
5145
5188
  ? (31 - static_cast<intptr_t>(Num0BitsAboveMS1Bit_Nonzero32(bits)))
5146
5189
  : -1;
@@ -5586,7 +5629,8 @@ HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
5586
5629
  // General case, 2 or 4 byte lanes
5587
5630
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 2))>
5588
5631
  HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5589
- return detail::Compress(v, detail::BitsFromMask(mask));
5632
+ const DFromV<decltype(v)> d;
5633
+ return detail::Compress(v, BitsFromMask(d, mask));
5590
5634
  }
5591
5635
 
5592
5636
  // Single lane: no-op
@@ -5610,12 +5654,13 @@ HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
5610
5654
  // General case, 2 or 4 byte lanes
5611
5655
  template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
5612
5656
  HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
5657
+ const DFromV<decltype(v)> d;
5613
5658
  // For partial vectors, we cannot pull the Not() into the table because
5614
5659
  // BitsFromMask clears the upper bits.
5615
5660
  if (N < 16 / sizeof(T)) {
5616
- return detail::Compress(v, detail::BitsFromMask(Not(mask)));
5661
+ return detail::Compress(v, BitsFromMask(d, Not(mask)));
5617
5662
  }
5618
- return detail::CompressNot(v, detail::BitsFromMask(mask));
5663
+ return detail::CompressNot(v, BitsFromMask(d, mask));
5619
5664
  }
5620
5665
 
5621
5666
  // ------------------------------ CompressBlocksNot
@@ -5642,7 +5687,7 @@ HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
5642
5687
  template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
5643
5688
  HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
5644
5689
  TFromD<D>* HWY_RESTRICT unaligned) {
5645
- const uint64_t mask_bits = detail::BitsFromMask(mask);
5690
+ const uint64_t mask_bits = BitsFromMask(d, mask);
5646
5691
  const auto c = detail::Compress(v, mask_bits);
5647
5692
  StoreU(c, d, unaligned);
5648
5693
  return PopCount(mask_bits);
@@ -5653,7 +5698,7 @@ template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
5653
5698
  HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
5654
5699
  TFromD<D>* HWY_RESTRICT unaligned) {
5655
5700
  const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
5656
- const uint64_t mask_bits = detail::BitsFromMask(m);
5701
+ const uint64_t mask_bits = BitsFromMask(d, m);
5657
5702
  const size_t count = PopCount(mask_bits);
5658
5703
  const VFromD<decltype(du)> compressed =
5659
5704
  detail::Compress(BitCast(du, v), mask_bits);