@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/include/ffi.h +3 -3
  2. package/include/harfbuzz/hb-version.h +3 -3
  3. package/include/hwy/abort.h +2 -19
  4. package/include/hwy/aligned_allocator.h +11 -7
  5. package/include/hwy/auto_tune.h +504 -0
  6. package/include/hwy/base.h +425 -104
  7. package/include/hwy/cache_control.h +16 -0
  8. package/include/hwy/detect_compiler_arch.h +32 -1
  9. package/include/hwy/detect_targets.h +251 -67
  10. package/include/hwy/foreach_target.h +35 -0
  11. package/include/hwy/highway.h +185 -76
  12. package/include/hwy/nanobenchmark.h +1 -19
  13. package/include/hwy/ops/arm_neon-inl.h +969 -458
  14. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  15. package/include/hwy/ops/emu128-inl.h +97 -11
  16. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  17. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  18. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  19. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  20. package/include/hwy/ops/rvv-inl.h +546 -51
  21. package/include/hwy/ops/scalar-inl.h +77 -22
  22. package/include/hwy/ops/set_macros-inl.h +138 -17
  23. package/include/hwy/ops/shared-inl.h +50 -10
  24. package/include/hwy/ops/wasm_128-inl.h +137 -92
  25. package/include/hwy/ops/x86_128-inl.h +773 -214
  26. package/include/hwy/ops/x86_256-inl.h +712 -255
  27. package/include/hwy/ops/x86_512-inl.h +429 -753
  28. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  29. package/include/hwy/per_target.h +2 -1
  30. package/include/hwy/profiler.h +622 -486
  31. package/include/hwy/targets.h +62 -20
  32. package/include/hwy/timer-inl.h +8 -160
  33. package/include/hwy/timer.h +170 -3
  34. package/include/hwy/x86_cpuid.h +81 -0
  35. package/include/libheif/heif_cxx.h +25 -5
  36. package/include/libheif/heif_regions.h +5 -5
  37. package/include/libheif/heif_version.h +2 -2
  38. package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
  39. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  40. package/include/pango-1.0/pango/pango-features.h +3 -3
  41. package/include/pango-1.0/pango/pango-font.h +30 -0
  42. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  43. package/include/zlib.h +3 -3
  44. package/package.json +1 -1
  45. package/versions.json +8 -8
package/include/hwy/ops/x86_avx3-inl.h (new file)
@@ -0,0 +1,501 @@
+ // Copyright 2019 Google LLC
+ // SPDX-License-Identifier: Apache-2.0
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+
+ // External include guard in highway.h - see comment there.
+
+ // For AVX3/AVX10 targets that support 512-bit vectors. Already includes base.h
+ // and shared-inl.h.
+ #include "hwy/ops/x86_512-inl.h"
+
+ // AVX3/AVX10 ops that depend on ops defined in x86_512-inl.h when
+ // HWY_MAX_BYTES >= 64 are defined below.
+
+ // Avoid uninitialized warnings in GCC's avx512fintrin.h - see
+ // https://github.com/google/highway/issues/710
+ HWY_DIAGNOSTICS(push)
+ #if HWY_COMPILER_GCC_ACTUAL
+ HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
+ HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
+                     ignored "-Wmaybe-uninitialized")
+ #endif
+
+ HWY_BEFORE_NAMESPACE();
+ namespace hwy {
+ namespace HWY_NAMESPACE {
+
+ #if HWY_TARGET <= HWY_AVX3_DL
+
+ // ------------------------------ ShiftLeft
+
+ // Generic for all vector lengths. Must be defined after all GaloisAffine.
+ template <int kBits, class V, HWY_IF_T_SIZE_V(V, 1)>
+ HWY_API V ShiftLeft(const V v) {
+   const Repartition<uint64_t, DFromV<V>> du64;
+   if (kBits == 0) return v;
+   if (kBits == 1) return v + v;
+   constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) &
+                                (0x0101010101010101ULL * (0xFF >> kBits));
+   return detail::GaloisAffine(v, Set(du64, kMatrix));
+ }
+
+ // ------------------------------ ShiftRight
+
+ // Generic for all vector lengths. Must be defined after all GaloisAffine.
+ template <int kBits, class V, HWY_IF_U8_D(DFromV<V>)>
+ HWY_API V ShiftRight(const V v) {
+   const Repartition<uint64_t, DFromV<V>> du64;
+   if (kBits == 0) return v;
+   constexpr uint64_t kMatrix =
+       (0x0102040810204080ULL << kBits) &
+       (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
+   return detail::GaloisAffine(v, Set(du64, kMatrix));
+ }
+
+ // Generic for all vector lengths. Must be defined after all GaloisAffine.
+ template <int kBits, class V, HWY_IF_I8_D(DFromV<V>)>
+ HWY_API V ShiftRight(const V v) {
+   const Repartition<uint64_t, DFromV<V>> du64;
+   if (kBits == 0) return v;
+   constexpr uint64_t kShift =
+       (0x0102040810204080ULL << kBits) &
+       (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
+   constexpr uint64_t kSign =
+       kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits)));
+   return detail::GaloisAffine(v, Set(du64, kShift | kSign));
+ }
+
+ // ------------------------------ RotateRight
+
+ // U8 RotateRight is generic for all vector lengths on AVX3_DL
+ template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
+ HWY_API V RotateRight(V v) {
+   static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+
+   const Repartition<uint64_t, DFromV<V>> du64;
+   if (kBits == 0) return v;
+
+   constexpr uint64_t kShrMatrix =
+       (0x0102040810204080ULL << kBits) &
+       (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
+   constexpr int kShlBits = (-kBits) & 7;
+   constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
+                                   (0x0101010101010101ULL * (0xFF >> kShlBits));
+   constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
+
+   return detail::GaloisAffine(v, Set(du64, kMatrix));
+ }
+
+ #endif  // HWY_TARGET <= HWY_AVX3_DL
+
+ // ------------------------------ Compress
+
+ #pragma push_macro("HWY_X86_SLOW_COMPRESS_STORE")
+
+ #ifndef HWY_X86_SLOW_COMPRESS_STORE  // allow override
+ // Slow on Zen4 and SPR, faster if we emulate via Compress().
+ #if HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
+ #define HWY_X86_SLOW_COMPRESS_STORE 1
+ #else
+ #define HWY_X86_SLOW_COMPRESS_STORE 0
+ #endif
+ #endif  // HWY_X86_SLOW_COMPRESS_STORE
+
+ // Always implement 8-bit here even if we lack VBMI2 because we can do better
+ // than generic_ops (8 at a time) via the native 32-bit compress (16 at a time).
+ #ifdef HWY_NATIVE_COMPRESS8
+ #undef HWY_NATIVE_COMPRESS8
+ #else
+ #define HWY_NATIVE_COMPRESS8
+ #endif
+
+ namespace detail {
+
+ #if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
+ template <size_t N>
+ HWY_INLINE Vec128<uint8_t, N> NativeCompress(const Vec128<uint8_t, N> v,
+                                              const Mask128<uint8_t, N> mask) {
+   return Vec128<uint8_t, N>{_mm_maskz_compress_epi8(mask.raw, v.raw)};
+ }
+ HWY_INLINE Vec256<uint8_t> NativeCompress(const Vec256<uint8_t> v,
+                                           const Mask256<uint8_t> mask) {
+   return Vec256<uint8_t>{_mm256_maskz_compress_epi8(mask.raw, v.raw)};
+ }
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE Vec512<uint8_t> NativeCompress(const Vec512<uint8_t> v,
+                                           const Mask512<uint8_t> mask) {
+   return Vec512<uint8_t>{_mm512_maskz_compress_epi8(mask.raw, v.raw)};
+ }
+ #endif
+
+ template <size_t N>
+ HWY_INLINE Vec128<uint16_t, N> NativeCompress(const Vec128<uint16_t, N> v,
+                                               const Mask128<uint16_t, N> mask) {
+   return Vec128<uint16_t, N>{_mm_maskz_compress_epi16(mask.raw, v.raw)};
+ }
+ HWY_INLINE Vec256<uint16_t> NativeCompress(const Vec256<uint16_t> v,
+                                            const Mask256<uint16_t> mask) {
+   return Vec256<uint16_t>{_mm256_maskz_compress_epi16(mask.raw, v.raw)};
+ }
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE Vec512<uint16_t> NativeCompress(const Vec512<uint16_t> v,
+                                            const Mask512<uint16_t> mask) {
+   return Vec512<uint16_t>{_mm512_maskz_compress_epi16(mask.raw, v.raw)};
+ }
+ #endif
+
+ // Do not even define these to prevent accidental usage.
+ #if !HWY_X86_SLOW_COMPRESS_STORE
+
+ template <size_t N>
+ HWY_INLINE void NativeCompressStore(Vec128<uint8_t, N> v,
+                                     Mask128<uint8_t, N> mask,
+                                     uint8_t* HWY_RESTRICT unaligned) {
+   _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
+ }
+ HWY_INLINE void NativeCompressStore(Vec256<uint8_t> v, Mask256<uint8_t> mask,
+                                     uint8_t* HWY_RESTRICT unaligned) {
+   _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
+ }
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE void NativeCompressStore(Vec512<uint8_t> v, Mask512<uint8_t> mask,
+                                     uint8_t* HWY_RESTRICT unaligned) {
+   _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw);
+ }
+ #endif
+
+ template <size_t N>
+ HWY_INLINE void NativeCompressStore(Vec128<uint16_t, N> v,
+                                     Mask128<uint16_t, N> mask,
+                                     uint16_t* HWY_RESTRICT unaligned) {
+   _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
+ }
+ HWY_INLINE void NativeCompressStore(Vec256<uint16_t> v, Mask256<uint16_t> mask,
+                                     uint16_t* HWY_RESTRICT unaligned) {
+   _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
+ }
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE void NativeCompressStore(Vec512<uint16_t> v, Mask512<uint16_t> mask,
+                                     uint16_t* HWY_RESTRICT unaligned) {
+   _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw);
+ }
+ #endif  // HWY_MAX_BYTES >= 64
+
+ #endif  // HWY_X86_SLOW_COMPRESS_STORE
+
+ #endif  // HWY_TARGET <= HWY_AVX3_DL
+
+ template <size_t N>
+ HWY_INLINE Vec128<uint32_t, N> NativeCompress(Vec128<uint32_t, N> v,
+                                               Mask128<uint32_t, N> mask) {
+   return Vec128<uint32_t, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
+ }
+ HWY_INLINE Vec256<uint32_t> NativeCompress(Vec256<uint32_t> v,
+                                            Mask256<uint32_t> mask) {
+   return Vec256<uint32_t>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
+ }
+
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE Vec512<uint32_t> NativeCompress(Vec512<uint32_t> v,
+                                            Mask512<uint32_t> mask) {
+   return Vec512<uint32_t>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
+ }
+ #endif
+ // We use table-based compress for 64-bit lanes, see CompressIsPartition.
+
+ // Do not even define these to prevent accidental usage.
+ #if !HWY_X86_SLOW_COMPRESS_STORE
+
+ template <size_t N>
+ HWY_INLINE void NativeCompressStore(Vec128<uint32_t, N> v,
+                                     Mask128<uint32_t, N> mask,
+                                     uint32_t* HWY_RESTRICT unaligned) {
+   _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
+ }
+ HWY_INLINE void NativeCompressStore(Vec256<uint32_t> v, Mask256<uint32_t> mask,
+                                     uint32_t* HWY_RESTRICT unaligned) {
+   _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
+ }
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE void NativeCompressStore(Vec512<uint32_t> v, Mask512<uint32_t> mask,
+                                     uint32_t* HWY_RESTRICT unaligned) {
+   _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
+ }
+ #endif
+
+ template <size_t N>
+ HWY_INLINE void NativeCompressStore(Vec128<uint64_t, N> v,
+                                     Mask128<uint64_t, N> mask,
+                                     uint64_t* HWY_RESTRICT unaligned) {
+   _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
+ }
+ HWY_INLINE void NativeCompressStore(Vec256<uint64_t> v, Mask256<uint64_t> mask,
+                                     uint64_t* HWY_RESTRICT unaligned) {
+   _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
+ }
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE void NativeCompressStore(Vec512<uint64_t> v, Mask512<uint64_t> mask,
+                                     uint64_t* HWY_RESTRICT unaligned) {
+   _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
+ }
+ #endif
+
+ template <size_t N>
+ HWY_INLINE void NativeCompressStore(Vec128<float, N> v, Mask128<float, N> mask,
+                                     float* HWY_RESTRICT unaligned) {
+   _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
+ }
+ HWY_INLINE void NativeCompressStore(Vec256<float> v, Mask256<float> mask,
+                                     float* HWY_RESTRICT unaligned) {
+   _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
+ }
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE void NativeCompressStore(Vec512<float> v, Mask512<float> mask,
+                                     float* HWY_RESTRICT unaligned) {
+   _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
+ }
+ #endif
+
+ template <size_t N>
+ HWY_INLINE void NativeCompressStore(Vec128<double, N> v,
+                                     Mask128<double, N> mask,
+                                     double* HWY_RESTRICT unaligned) {
+   _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
+ }
+ HWY_INLINE void NativeCompressStore(Vec256<double> v, Mask256<double> mask,
+                                     double* HWY_RESTRICT unaligned) {
+   _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
+ }
+ #if HWY_MAX_BYTES >= 64
+ HWY_INLINE void NativeCompressStore(Vec512<double> v, Mask512<double> mask,
+                                     double* HWY_RESTRICT unaligned) {
+   _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
+ }
+ #endif
+
+ #endif  // HWY_X86_SLOW_COMPRESS_STORE
+
+ // For u8x16 and <= u16x16 we can avoid store+load for Compress because there is
+ // only a single compressed vector (u32x16). Other EmuCompress are implemented
+ // after the EmuCompressStore they build upon.
+ template <class V, HWY_IF_U8(TFromV<V>),
+           HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
+ static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
+   const DFromV<decltype(v)> d;
+   const Rebind<uint32_t, decltype(d)> d32;
+   const VFromD<decltype(d32)> v0 = PromoteTo(d32, v);
+
+   using M32 = MFromD<decltype(d32)>;
+   const M32 m0 = PromoteMaskTo(d32, d, mask);
+   return TruncateTo(d, Compress(v0, m0));
+ }
+
+ template <class V, HWY_IF_U16(TFromV<V>),
+           HWY_IF_LANES_LE_D(DFromV<V>, HWY_MAX_BYTES / 4)>
+ static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
+   const DFromV<decltype(v)> d;
+   const Rebind<int32_t, decltype(d)> di32;
+   const RebindToUnsigned<decltype(di32)> du32;
+
+   const MFromD<decltype(du32)> mask32 = PromoteMaskTo(du32, d, mask);
+   // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX.
+   // Only i32 -> u16 is supported, whereas NativeCompress expects u32.
+   const VFromD<decltype(du32)> v32 = PromoteTo(du32, v);
+   return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32)));
+ }
+
+ // See above - small-vector EmuCompressStore are implemented via EmuCompress.
+ template <class D, HWY_IF_UNSIGNED_D(D),
+           HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
+           HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
+ static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore(
+     VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) {
+   StoreU(EmuCompress(v, mask), d, unaligned);
+ }
+
+ // Main emulation logic for wider vector, starting with EmuCompressStore because
+ // it is most convenient to merge pieces using memory (concatenating vectors at
+ // byte offsets is difficult).
+ template <class D, HWY_IF_UNSIGNED_D(D),
+           HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
+           HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
+ static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore(
+     VFromD<D> v, MFromD<D> mask, D d, TFromD<D>* HWY_RESTRICT unaligned) {
+   const Half<decltype(d)> dh;
+
+   const MFromD<decltype(dh)> m0 = LowerHalfOfMask(dh, mask);
+   const MFromD<decltype(dh)> m1 = UpperHalfOfMask(dh, mask);
+
+   const VFromD<decltype(dh)> v0 = LowerHalf(dh, v);
+   const VFromD<decltype(dh)> v1 = UpperHalf(dh, v);
+
+   EmuCompressStore(v0, m0, dh, unaligned);
+   EmuCompressStore(v1, m1, dh, unaligned + CountTrue(dh, m0));
+ }
+
+ // Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore.
+ template <class V, HWY_IF_UNSIGNED_V(V),
+           HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
+           HWY_IF_LANES_GT_D(DFromV<V>, HWY_MAX_BYTES / 4)>
+ static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD<DFromV<V>> mask) {
+   using D = DFromV<decltype(v)>;
+   using T = TFromD<D>;
+   const D d;
+
+   alignas(HWY_MAX_LANES_D(D) * sizeof(T)) T buf[2 * HWY_MAX_LANES_D(D)];
+   EmuCompressStore(v, mask, d, buf);
+   return Load(d, buf);
+ }
+
+ }  // namespace detail
+
+ template <class V, class M, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
+ HWY_API V Compress(V v, const M mask) {
+   const DFromV<decltype(v)> d;
+   const RebindToUnsigned<decltype(d)> du;
+   const auto mu = RebindMask(du, mask);
+ #if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
+   return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
+ #else
+   return BitCast(d, detail::EmuCompress(BitCast(du, v), mu));
+ #endif
+ }
+
+ template <class V, class M, HWY_IF_T_SIZE_V(V, 4)>
+ HWY_API V Compress(V v, const M mask) {
+   const DFromV<decltype(v)> d;
+   const RebindToUnsigned<decltype(d)> du;
+   const auto mu = RebindMask(du, mask);
+   return BitCast(d, detail::NativeCompress(BitCast(du, v), mu));
+ }
+
+ // ------------------------------ CompressNot
+
+ template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 8)>
+ HWY_API V CompressNot(V v, const M mask) {
+   return Compress(v, Not(mask));
+ }
+
+ // uint64_t lanes. Only implement for 256 and 512-bit vectors because this is a
+ // no-op for 128-bit.
+ template <class V, class M, HWY_IF_V_SIZE_GT_D(DFromV<V>, 16)>
+ HWY_API V CompressBlocksNot(V v, M mask) {
+   return CompressNot(v, mask);
+ }
+
+ // ------------------------------ CompressBits
+ template <class V>
+ HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
+   return Compress(v, LoadMaskBits(DFromV<V>(), bits));
+ }
+
+ // ------------------------------ CompressStore
+
+ // Generic for all vector lengths.
+
+ template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
+ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
+                              TFromD<D>* HWY_RESTRICT unaligned) {
+ #if HWY_X86_SLOW_COMPRESS_STORE
+   StoreU(Compress(v, mask), d, unaligned);
+ #else
+   const RebindToUnsigned<decltype(d)> du;
+   const auto mu = RebindMask(du, mask);
+   auto pu = reinterpret_cast<TFromD<decltype(du)> * HWY_RESTRICT>(unaligned);
+
+ #if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
+   detail::NativeCompressStore(BitCast(du, v), mu, pu);
+ #else
+   detail::EmuCompressStore(BitCast(du, v), mu, du, pu);
+ #endif
+ #endif  // HWY_X86_SLOW_COMPRESS_STORE
+   const size_t count = CountTrue(d, mask);
+   detail::MaybeUnpoison(unaligned, count);
+   return count;
+ }
+
+ template <class D, HWY_IF_NOT_FLOAT_D(D),
+           HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
+ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
+                              TFromD<D>* HWY_RESTRICT unaligned) {
+ #if HWY_X86_SLOW_COMPRESS_STORE
+   StoreU(Compress(v, mask), d, unaligned);
+ #else
+   const RebindToUnsigned<decltype(d)> du;
+   const auto mu = RebindMask(du, mask);
+   using TU = TFromD<decltype(du)>;
+   TU* HWY_RESTRICT pu = reinterpret_cast<TU*>(unaligned);
+   detail::NativeCompressStore(BitCast(du, v), mu, pu);
+ #endif  // HWY_X86_SLOW_COMPRESS_STORE
+   const size_t count = CountTrue(d, mask);
+   detail::MaybeUnpoison(unaligned, count);
+   return count;
+ }
+
+ // Additional overloads to avoid casting to uint32_t (delay?).
+ template <class D, HWY_IF_FLOAT3264_D(D)>
+ HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
+                              TFromD<D>* HWY_RESTRICT unaligned) {
+ #if HWY_X86_SLOW_COMPRESS_STORE
+   StoreU(Compress(v, mask), d, unaligned);
+ #else
+   (void)d;
+   detail::NativeCompressStore(v, mask, unaligned);
+ #endif  // HWY_X86_SLOW_COMPRESS_STORE
+   const size_t count = PopCount(uint64_t{mask.raw});
+   detail::MaybeUnpoison(unaligned, count);
+   return count;
+ }
+
+ // ------------------------------ CompressBlendedStore
+ template <class D>
+ HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
+                                     TFromD<D>* HWY_RESTRICT unaligned) {
+   // Native CompressStore already does the blending at no extra cost (latency
+   // 11, rthroughput 2 - same as compress plus store).
+
+   HWY_IF_CONSTEXPR(HWY_MAX_LANES_D(D) < (16 / sizeof(TFromD<D>))) {
+     m = And(m, FirstN(d, HWY_MAX_LANES_D(D)));
+   }
+
+   HWY_IF_CONSTEXPR(!HWY_X86_SLOW_COMPRESS_STORE &&
+                    (HWY_TARGET <= HWY_AVX3_DL || sizeof(TFromD<D>) > 2)) {
+     return CompressStore(v, m, d, unaligned);
+   }
+   else {
+     const size_t count = CountTrue(d, m);
+     StoreN(Compress(v, m), d, unaligned, count);
+     detail::MaybeUnpoison(unaligned, count);
+     return count;
+   }
+ }
+
+ // ------------------------------ CompressBitsStore
+ // Generic for all vector lengths.
+ template <class D>
+ HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
+                                  D d, TFromD<D>* HWY_RESTRICT unaligned) {
+   return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
+ }
+
+ #pragma pop_macro("HWY_X86_SLOW_COMPRESS_STORE")
+
+ // NOLINTNEXTLINE(google-readability-namespace-comments)
+ }  // namespace HWY_NAMESPACE
+ }  // namespace hwy
+ HWY_AFTER_NAMESPACE();
+
+ // Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
+ // the warning seems to be issued at the call site of intrinsics, i.e. our code.
+ HWY_DIAGNOSTICS(pop)
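
The ops added in this new header are not called directly; they are reached through Highway's public static-dispatch API. The sketch below is illustrative only and not part of the package: it shows how CompressStore is typically invoked via upstream google/highway conventions. The function name KeepAtLeast and the threshold parameter are made up for the example, and remainder lanes (when the element count is not a multiple of the vector length) are deliberately left unhandled to keep it short.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Copies the lanes of `in` that are >= `threshold` contiguously into `out`
// and returns the number of lanes written. Remainder lanes are ignored.
size_t KeepAtLeast(const float* HWY_RESTRICT in, size_t count, float threshold,
                   float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  size_t written = 0;
  for (size_t i = 0; i + N <= count; i += N) {
    const auto v = hn::LoadU(d, in + i);
    const auto keep = hn::Ge(v, hn::Set(d, threshold));
    // When the static target is an AVX3 target, this resolves to the
    // CompressStore overloads above (native compress-store, or the
    // Compress-based fallback where HWY_X86_SLOW_COMPRESS_STORE is set).
    written += hn::CompressStore(v, keep, d, out + written);
  }
  return written;
}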
package/include/hwy/per_target.h
@@ -39,7 +39,8 @@ HWY_DLLEXPORT int64_t DispatchedTarget();
  // unnecessarily.
  HWY_DLLEXPORT size_t VectorBytes();

- // Returns whether 16/64-bit floats are a supported lane type.
+ // Returns whether 64-bit integers, 16/64-bit floats are a supported lane type.
+ HWY_DLLEXPORT bool HaveInteger64();
  HWY_DLLEXPORT bool HaveFloat16();
  HWY_DLLEXPORT bool HaveFloat64();
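
For the per_target.h change above, the following is an illustrative sketch only (not part of the package) of how the newly exported HaveInteger64() query can be used alongside the existing runtime queries from this header; PrintTargetCapabilities is a made-up name for the example.

#include <cstdio>

#include "hwy/per_target.h"

// Reports what the dispatched target supports at runtime.
void PrintTargetCapabilities() {
  std::printf("vector bytes: %zu, i64: %d, f16: %d, f64: %d\n",
              hwy::VectorBytes(), static_cast<int>(hwy::HaveInteger64()),
              static_cast<int>(hwy::HaveFloat16()),
              static_cast<int>(hwy::HaveFloat64()));
}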