@img/sharp-libvips-dev 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/expat.h +21 -10
- package/include/expat_config.h +11 -5
- package/include/ffi.h +12 -25
- package/include/freetype2/freetype/config/ftoption.h +1 -1
- package/include/gio-unix-2.0/gio/gfiledescriptorbased.h +3 -2
- package/include/glib-2.0/gio/gapplication.h +6 -0
- package/include/glib-2.0/gio/giotypes.h +0 -1
- package/include/glib-2.0/girepository/giarginfo.h +23 -6
- package/include/glib-2.0/girepository/gibaseinfo.h +44 -18
- package/include/glib-2.0/girepository/gicallableinfo.h +26 -16
- package/include/glib-2.0/girepository/gicallbackinfo.h +17 -2
- package/include/glib-2.0/girepository/giconstantinfo.h +19 -4
- package/include/glib-2.0/girepository/gienuminfo.h +20 -21
- package/include/glib-2.0/girepository/gifieldinfo.h +22 -7
- package/include/glib-2.0/girepository/giflagsinfo.h +60 -0
- package/include/glib-2.0/girepository/gifunctioninfo.h +22 -7
- package/include/glib-2.0/girepository/giinterfaceinfo.h +33 -18
- package/include/glib-2.0/girepository/giobjectinfo.h +41 -26
- package/include/glib-2.0/girepository/gipropertyinfo.h +18 -3
- package/include/glib-2.0/girepository/giregisteredtypeinfo.h +22 -11
- package/include/glib-2.0/girepository/girepository-autocleanups.h +56 -0
- package/include/glib-2.0/girepository/girepository.h +53 -62
- package/include/glib-2.0/girepository/girffi.h +8 -7
- package/include/glib-2.0/girepository/gisignalinfo.h +18 -3
- package/include/glib-2.0/girepository/gistructinfo.h +26 -11
- package/include/glib-2.0/girepository/gitypeinfo.h +29 -16
- package/include/glib-2.0/girepository/gitypelib.h +9 -13
- package/include/glib-2.0/girepository/gitypes.h +52 -104
- package/include/glib-2.0/girepository/giunioninfo.h +28 -12
- package/include/glib-2.0/girepository/giunresolvedinfo.h +17 -2
- package/include/glib-2.0/girepository/givalueinfo.h +65 -0
- package/include/glib-2.0/girepository/givfuncinfo.h +23 -8
- package/include/glib-2.0/glib/deprecated/gthread.h +9 -5
- package/include/glib-2.0/glib/gbitlock.h +31 -0
- package/include/glib-2.0/glib/gmessages.h +8 -0
- package/include/glib-2.0/glib/gslice.h +2 -0
- package/include/glib-2.0/glib/gstrfuncs.h +24 -18
- package/include/glib-2.0/glib/gthread.h +191 -3
- package/include/glib-2.0/glib-unix.h +7 -1
- package/include/glib-2.0/gobject/genums.h +6 -6
- package/include/glib-2.0/gobject/glib-types.h +11 -0
- package/include/glib-2.0/gobject/gsignal.h +16 -6
- package/include/hwy/aligned_allocator.h +171 -6
- package/include/hwy/base.h +1765 -543
- package/include/hwy/cache_control.h +24 -6
- package/include/hwy/detect_compiler_arch.h +23 -2
- package/include/hwy/detect_targets.h +56 -13
- package/include/hwy/foreach_target.h +24 -0
- package/include/hwy/highway.h +20 -3
- package/include/hwy/ops/arm_neon-inl.h +1086 -667
- package/include/hwy/ops/arm_sve-inl.h +1091 -235
- package/include/hwy/ops/emu128-inl.h +271 -196
- package/include/hwy/ops/generic_ops-inl.h +2270 -399
- package/include/hwy/ops/ppc_vsx-inl.h +1786 -563
- package/include/hwy/ops/rvv-inl.h +1043 -311
- package/include/hwy/ops/scalar-inl.h +189 -159
- package/include/hwy/ops/set_macros-inl.h +66 -6
- package/include/hwy/ops/shared-inl.h +175 -56
- package/include/hwy/ops/wasm_128-inl.h +153 -136
- package/include/hwy/ops/x86_128-inl.h +1647 -646
- package/include/hwy/ops/x86_256-inl.h +1003 -370
- package/include/hwy/ops/x86_512-inl.h +948 -353
- package/include/hwy/per_target.h +4 -0
- package/include/hwy/profiler.h +648 -0
- package/include/hwy/robust_statistics.h +2 -2
- package/include/hwy/targets.h +18 -11
- package/include/hwy/timer.h +11 -0
- package/include/libpng16/png.h +32 -29
- package/include/libpng16/pngconf.h +2 -2
- package/include/libpng16/pnglibconf.h +7 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/parser.h +16 -7
- package/include/libxml2/libxml/xmlIO.h +0 -1
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-fontmap.h +7 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/png.h +32 -29
- package/include/pngconf.h +2 -2
- package/include/pnglibconf.h +7 -2
- package/include/vips/connection.h +9 -3
- package/include/vips/util.h +0 -9
- package/include/vips/version.h +4 -4
- package/package.json +1 -1
- package/versions.json +11 -11
package/include/hwy/ops/x86_128-inl.h
@@ -47,6 +47,13 @@ namespace hwy {
 namespace HWY_NAMESPACE {
 namespace detail {
 
+// Enable generic functions for whichever of (f16, bf16) are not supported.
+#if !HWY_HAVE_FLOAT16
+#define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+#else
+#define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#endif
+
 template <typename T>
 struct Raw128 {
   using type = __m128i;
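For context on the macro introduced in this hunk: HWY_X86_IF_EMULATED_D(D) is used throughout the rest of this diff to gate overloads to lane types without a native x86 register format (bf16 always, and f16 only when AVX-512 FP16 is unavailable). A minimal, hypothetical sketch of an overload gated this way; the NegZeroEmulated name and constant are illustrative and not part of the package:

```cpp
// Sketch only (assumes placement inside hwy::HWY_NAMESPACE, like this header).
// The constraint removes this overload from overload resolution for types with
// a native register format, so only emulated [b]f16 tags select it.
template <class D, HWY_X86_IF_EMULATED_D(D)>
HWY_API VFromD<D> NegZeroEmulated(D d) {
  const RebindToUnsigned<decltype(d)> du;        // operate on the u16 bit pattern
  return BitCast(d, Set(du, uint16_t{0x8000}));  // sign bit of f16/bf16
}
```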
@@ -90,6 +97,9 @@ class Vec128 {
   HWY_INLINE Vec128& operator-=(const Vec128 other) {
     return *this = (*this - other);
   }
+  HWY_INLINE Vec128& operator%=(const Vec128 other) {
+    return *this = (*this % other);
+  }
   HWY_INLINE Vec128& operator&=(const Vec128 other) {
     return *this = (*this & other);
   }
@@ -194,18 +204,12 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
   return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
 }
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D)>
-HWY_API Vec128<bfloat16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
-  return Vec128<bfloat16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
-}
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
 HWY_API Vec128<float16_t, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
-#if HWY_HAVE_FLOAT16
   return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_ph()};
-#else
-  return Vec128<float16_t, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
-#endif
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API Vec128<float, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
   return Vec128<float, HWY_MAX_LANES_D(D)>{_mm_setzero_ps()};
@@ -214,6 +218,10 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
 HWY_API Vec128<double, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
   return Vec128<double, HWY_MAX_LANES_D(D)>{_mm_setzero_pd()};
 }
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
+HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
+  return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()};
+}
 
 // Using the existing Zero function instead of a dedicated function for
 // deduction avoids having to forward-declare Vec256 here.
@@ -307,7 +315,7 @@ HWY_API VFromD<D> Set(D /* tag */, double t) {
 }
 
 // Generic for all vector lengths.
-template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
+template <class D, HWY_X86_IF_EMULATED_D(D)>
 HWY_API VFromD<D> Set(D df, TFromD<D> t) {
   const RebindToUnsigned<decltype(df)> du;
   static_assert(sizeof(TFromD<D>) == 2, "Expecting [b]f16");
@@ -328,18 +336,12 @@ HWY_API VFromD<D> Undefined(D /* tag */) {
   // generate an XOR instruction.
   return VFromD<D>{_mm_undefined_si128()};
 }
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_BF16_D(D)>
-HWY_API VFromD<D> Undefined(D /* tag */) {
-  return VFromD<D>{_mm_undefined_si128()};
-}
+#if HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
 HWY_API VFromD<D> Undefined(D /* tag */) {
-#if HWY_HAVE_FLOAT16
   return VFromD<D>{_mm_undefined_ph()};
-#else
-  return VFromD<D>{_mm_undefined_si128()};
-#endif
 }
+#endif  // HWY_HAVE_FLOAT16
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> Undefined(D /* tag */) {
   return VFromD<D>{_mm_undefined_ps()};
@@ -348,6 +350,10 @@ template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
 HWY_API VFromD<D> Undefined(D /* tag */) {
   return VFromD<D>{_mm_undefined_pd()};
 }
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_X86_IF_EMULATED_D(D)>
+HWY_API VFromD<D> Undefined(D /* tag */) {
+  return VFromD<D>{_mm_undefined_si128()};
+}
 
 HWY_DIAGNOSTICS(pop)
 
@@ -359,7 +365,11 @@ HWY_API T GetLane(const Vec128<T, N> v) {
 }
 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
 HWY_API T GetLane(const Vec128<T, N> v) {
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const uint16_t bits =
+      static_cast<uint16_t>(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF);
+  return BitCastScalar<T>(bits);
 }
 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
 HWY_API T GetLane(const Vec128<T, N> v) {
@@ -394,6 +404,104 @@ HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
   return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToInteger(v.raw)});
 }
 
+// ------------------------------ Dup128VecFromValues
+
+template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+    TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
+    TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
+    TFromD<D> t11, TFromD<D> t12,
+    TFromD<D> t13, TFromD<D> t14,
+    TFromD<D> t15) {
+  return VFromD<D>{_mm_setr_epi8(
+      static_cast<char>(t0), static_cast<char>(t1), static_cast<char>(t2),
+      static_cast<char>(t3), static_cast<char>(t4), static_cast<char>(t5),
+      static_cast<char>(t6), static_cast<char>(t7), static_cast<char>(t8),
+      static_cast<char>(t9), static_cast<char>(t10), static_cast<char>(t11),
+      static_cast<char>(t12), static_cast<char>(t13), static_cast<char>(t14),
+      static_cast<char>(t15))};
+}
+
+template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+    TFromD<D> t5, TFromD<D> t6,
+    TFromD<D> t7) {
+  return VFromD<D>{
+      _mm_setr_epi16(static_cast<int16_t>(t0), static_cast<int16_t>(t1),
+      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
+      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
+      static_cast<int16_t>(t6), static_cast<int16_t>(t7))};
+}
+
+// Generic for all vector lengths
+template <class D, HWY_IF_BF16_D(D)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+    TFromD<D> t5, TFromD<D> t6,
+    TFromD<D> t7) {
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d,
+      Dup128VecFromValues(
+      di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
+      BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
+      BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
+      BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
+}
+
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+    TFromD<D> t5, TFromD<D> t6,
+    TFromD<D> t7) {
+  return VFromD<D>{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)};
+}
+#else
+// Generic for all vector lengths if HWY_HAVE_FLOAT16 is not true
+template <class D, HWY_IF_F16_D(D)>
+HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
+    TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
+    TFromD<D> t5, TFromD<D> t6,
+    TFromD<D> t7) {
+  const RebindToSigned<decltype(d)> di;
+  return BitCast(d,
+      Dup128VecFromValues(
+      di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
+      BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
+      BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
+      BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
+}
+#endif  // HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+    TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{
+      _mm_setr_epi32(static_cast<int32_t>(t0), static_cast<int32_t>(t1),
+      static_cast<int32_t>(t2), static_cast<int32_t>(t3))};
+}
+
+template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
+    TFromD<D> t2, TFromD<D> t3) {
+  return VFromD<D>{_mm_setr_ps(t0, t1, t2, t3)};
+}
+
+template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  // Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic
+  // available
+  return VFromD<D>{
+      _mm_set_epi64x(static_cast<int64_t>(t1), static_cast<int64_t>(t0))};
+}
+
+template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
+  return VFromD<D>{_mm_setr_pd(t0, t1)};
+}
+
 // ================================================== LOGICAL
 
 // ------------------------------ And
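As a usage note (not part of the diff): Dup128VecFromValues, added above, builds a vector from explicit per-lane constants, repeating them per 128-bit block on wider targets; the ShuffleTwo* changes later in this diff use it to construct byte-shuffle indices. A small hedged sketch, with the tag and values chosen purely for illustration:

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns the 128-bit vector {0, 1, 2, 3} of int32_t lanes.
hn::Vec128<int32_t> MakeIota4() {
  const hn::Full128<int32_t> d;
  return hn::Dup128VecFromValues(d, 0, 1, 2, 3);
}
```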
@@ -402,7 +510,8 @@ template <typename T, size_t N>
 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{
+  return BitCast(d, VFromD<decltype(du)>{
+      _mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
 }
 template <size_t N>
 HWY_API Vec128<float, N> And(Vec128<float, N> a, Vec128<float, N> b) {
@@ -420,8 +529,8 @@ template <typename T, size_t N>
 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
   const DFromV<decltype(mask)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(
-
+  return BitCast(d, VFromD<decltype(du)>{_mm_andnot_si128(
+      BitCast(du, not_mask).raw, BitCast(du, mask).raw)});
 }
 template <size_t N>
 HWY_API Vec128<float, N> AndNot(Vec128<float, N> not_mask,
@@ -440,7 +549,8 @@ template <typename T, size_t N>
 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{
+  return BitCast(d, VFromD<decltype(du)>{
+      _mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
 }
 
 template <size_t N>
@@ -458,7 +568,8 @@ template <typename T, size_t N>
 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
   const DFromV<decltype(a)> d;  // for float16_t
   const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, VFromD<decltype(du)>{
+  return BitCast(d, VFromD<decltype(du)>{
+      _mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)});
 }
 
 template <size_t N>
@@ -651,8 +762,9 @@ HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) {
 }
 
 // ------------------------------ Floating-point Abs
-
-
+// Generic for all vector lengths
+template <class V, HWY_IF_FLOAT(TFromV<V>)>
+HWY_API V Abs(V v) {
   const DFromV<decltype(v)> d;
   const RebindToSigned<decltype(d)> di;
   using TI = TFromD<decltype(di)>;
@@ -691,14 +803,332 @@ HWY_API V CopySignToAbs(const V abs, const V sign) {
 // ================================================== MASK
 
 #if HWY_TARGET <= HWY_AVX3
+// ------------------------------ MaskFromVec
 
-
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
+    const Vec128<T, N> v) {
+  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
+}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
+    const Vec128<T, N> v) {
+  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
+}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
+    const Vec128<T, N> v) {
+  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
+}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
+    const Vec128<T, N> v) {
+  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
+}
+
+}  // namespace detail
+
+template <typename T, size_t N>
+HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
+  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
+}
+// There do not seem to be native floating-point versions of these instructions.
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Mask128<float16_t, N> MaskFromVec(const Vec128<float16_t, N> v) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  return Mask128<float16_t, N>{MaskFromVec(BitCast(di, v)).raw};
+}
+#endif
+template <size_t N>
+HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
+}
+template <size_t N>
+HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
+  const RebindToSigned<DFromV<decltype(v)>> di;
+  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
+}
+
+template <class D>
+using MFromD = decltype(MaskFromVec(VFromD<D>()));
+
+// ------------------------------ MaskFalse (MFromD)
+
+#ifdef HWY_NATIVE_MASK_FALSE
+#undef HWY_NATIVE_MASK_FALSE
+#else
+#define HWY_NATIVE_MASK_FALSE
+#endif
+
+// Generic for all vector lengths
+template <class D>
+HWY_API MFromD<D> MaskFalse(D /*d*/) {
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(0)};
+}
+
+// ------------------------------ PromoteMaskTo (MFromD)
+
+#ifdef HWY_NATIVE_PROMOTE_MASK_TO
+#undef HWY_NATIVE_PROMOTE_MASK_TO
+#else
+#define HWY_NATIVE_PROMOTE_MASK_TO
+#endif
+
+// AVX3 PromoteMaskTo is generic for all vector lengths
+template <class DTo, class DFrom,
+    HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
+    class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
+    hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
+HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+    MFromD<DFrom> m) {
+  return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
+}
+
+// ------------------------------ DemoteMaskTo (MFromD)
+
+#ifdef HWY_NATIVE_DEMOTE_MASK_TO
+#undef HWY_NATIVE_DEMOTE_MASK_TO
+#else
+#define HWY_NATIVE_DEMOTE_MASK_TO
+#endif
+
+// AVX3 DemoteMaskTo is generic for all vector lengths
+template <class DTo, class DFrom,
+    HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
+    class DFrom_2 = Rebind<TFromD<DFrom>, DTo>,
+    hwy::EnableIf<IsSame<MFromD<DFrom>, MFromD<DFrom_2>>()>* = nullptr>
+HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
+    MFromD<DFrom> m) {
+  return MFromD<DTo>{static_cast<decltype(MFromD<DTo>().raw)>(m.raw)};
+}
+
+// ------------------------------ CombineMasks (MFromD)
+
+#ifdef HWY_NATIVE_COMBINE_MASKS
+#undef HWY_NATIVE_COMBINE_MASKS
+#else
+#define HWY_NATIVE_COMBINE_MASKS
+#endif
+
+template <class D, HWY_IF_LANES_D(D, 2)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+    MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask8 combined_mask = _kor_mask8(
+      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1),
+      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1)));
+#else
+  const auto combined_mask =
+      (static_cast<unsigned>(hi.raw) << 1) | (lo.raw & 1);
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 4)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+    MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask8 combined_mask = _kor_mask8(
+      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2),
+      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3)));
+#else
+  const auto combined_mask =
+      (static_cast<unsigned>(hi.raw) << 2) | (lo.raw & 3);
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 8)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+    MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask8 combined_mask = _kor_mask8(
+      _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4),
+      _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15)));
+#else
+  const auto combined_mask =
+      (static_cast<unsigned>(hi.raw) << 4) | (lo.raw & 15u);
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> CombineMasks(D /*d*/, MFromD<Half<D>> hi,
+    MFromD<Half<D>> lo) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const __mmask16 combined_mask = _mm512_kunpackb(
+      static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw));
+#else
+  const auto combined_mask =
+      ((static_cast<unsigned>(hi.raw) << 8) | (lo.raw & 0xFFu));
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(combined_mask)};
+}
+
+// ------------------------------ LowerHalfOfMask (MFromD)
+
+#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
+#undef HWY_NATIVE_LOWER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_LOWER_HALF_OF_MASK
+#endif
+
+// Generic for all vector lengths
+template <class D>
+HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8;
+
+  MFromD<D> result_mask{static_cast<RawM>(m.raw)};
+
+  if (kN < kNumOfBitsInRawMask) {
+    result_mask =
+        And(result_mask, MFromD<D>{static_cast<RawM>((1ULL << kN) - 1)});
+  }
+
+  return result_mask;
+}
+
+// ------------------------------ UpperHalfOfMask (MFromD)
+
+#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
+#undef HWY_NATIVE_UPPER_HALF_OF_MASK
+#else
+#define HWY_NATIVE_UPPER_HALF_OF_MASK
+#endif
+
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1);
+#else
+  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 1;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 2)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2);
+#else
+  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 2;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 4)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4);
+#else
+  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 4;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
+template <class D, HWY_IF_LANES_D(D, 8)>
+HWY_API MFromD<D> UpperHalfOfMask(D /*d*/, MFromD<Twice<D>> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8);
+#else
+  const auto shifted_mask = static_cast<unsigned>(m.raw) >> 8;
+#endif
+
+  return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
+}
+
+// ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks)
+
+#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#else
+#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
+#endif
+
+// Generic for all vector lengths
+template <class DTo, class DFrom,
+    HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
+    class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
+    hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
+HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
+    MFromD<DFrom> a, MFromD<DFrom> b) {
+  using MH = MFromD<Half<DTo>>;
+  using RawMH = decltype(MH().raw);
+
+  return CombineMasks(d_to, MH{static_cast<RawMH>(b.raw)},
+      MH{static_cast<RawMH>(a.raw)});
+}
+
+// ------------------------------ VecFromMask
+
+template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
+HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
+  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_UI16(T)>
+HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
+  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_UI32(T)>
+HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
+  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
+}
 
-
+template <typename T, size_t N, HWY_IF_UI64(T)>
+HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
+  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) {
+  return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+template <size_t N>
+HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
+  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
+}
+
+template <size_t N>
+HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
+  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
+}
+
+// Generic for all vector lengths.
+template <class D>
+HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
+  return VecFromMask(v);
+}
+
+// ------------------------------ RebindMask (MaskFromVec)
+
+template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
+HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
+  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
+  return MFromD<DTo>{m.raw};
+}
+
+// ------------------------------ IfThenElse
 
 namespace detail {
 
-// Templates for signed/unsigned integer of a particular size.
 template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
     Mask128<T, N> mask, Vec128<T, N> yes,
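A brief aside on the CombineMasks overloads added in the hunk above (illustrative, not from the package): each AVX-512 mask register holds one bit per lane, so combining the masks of two half-vectors is plain integer bit manipulation, with the upper half shifted left by the lower half's lane count. The same arithmetic in standalone form:

```cpp
#include <cstdio>

int main() {
  // Mirrors the HWY_IF_LANES_D(D, 8) CombineMasks fallback path:
  // 4 lanes in each half, lower half masked to its 4 valid bits.
  const unsigned lo = 0b0101u;  // mask bits for lanes 0..3
  const unsigned hi = 0b0011u;  // mask bits for lanes 4..7
  const unsigned combined = (hi << 4) | (lo & 15u);
  std::printf("0x%02X\n", combined);  // prints 0x35
  return 0;
}
```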
@@ -726,7 +1156,7 @@ HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
     Vec128<T, N> no) {
   return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
@@ -741,6 +1171,14 @@ HWY_API Vec128<float16_t, N> IfThenElse(Mask128<float16_t, N> mask,
 }
 #endif  // HWY_HAVE_FLOAT16
 
+// Generic for all vector lengths.
+template <class V, class D = DFromV<V>, HWY_X86_IF_EMULATED_D(D)>
+HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
+  const RebindToUnsigned<D> du;
+  return BitCast(
+      D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
+}
+
 template <size_t N>
 HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
     Vec128<float, N> yes, Vec128<float, N> no) {
@@ -779,7 +1217,7 @@ HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
   return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
 }
@@ -796,6 +1234,13 @@ HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
   return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
 }
 
+// Generic for all vector lengths.
+template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
+HWY_API V IfThenElseZero(MFromD<D> mask, V yes) {
+  const RebindToUnsigned<D> du;
+  return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
+}
+
 namespace detail {
 
 template <typename T, size_t N>
@@ -822,7 +1267,7 @@ HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
 
 }  // namespace detail
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
   return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
 }
@@ -839,6 +1284,13 @@ HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
   return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
 }
 
+// Generic for all vector lengths.
+template <class V, class D = DFromV<V>, HWY_IF_SPECIAL_FLOAT_D(D)>
+HWY_API V IfThenZeroElse(MFromD<D> mask, V no) {
+  const RebindToUnsigned<D> du;
+  return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
+}
+
 // ------------------------------ Mask logical
 
 // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
@@ -1042,6 +1494,68 @@ HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
 #endif
 }
 
+// UnmaskedNot returns ~m.raw without zeroing out any invalid bits
+template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
+HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask128<T, N>{static_cast<__mmask16>(_knot_mask16(m.raw))};
+#else
+  return Mask128<T, N>{static_cast<__mmask16>(~m.raw)};
+#endif
+}
+
+template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
+HWY_INLINE Mask128<T, N> UnmaskedNot(const Mask128<T, N> m) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return Mask128<T, N>{static_cast<__mmask8>(_knot_mask8(m.raw))};
+#else
+  return Mask128<T, N>{static_cast<__mmask8>(~m.raw)};
+#endif
+}
+
+template <typename T>
+HWY_INLINE Mask128<T> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
+  // sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T, size_t N, HWY_IF_LANES_LE(N, 8)>
+HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> m) {
+  // sizeof(T) == 1 and N <= 8: need to zero out the upper bits of ~m as there
+  // are fewer than 16 valid bits in m
+
+  // Return (~m) & ((1ull << N) - 1)
+  return AndNot(hwy::SizeTag<1>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
+}
+template <typename T>
+HWY_INLINE Mask128<T> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
+  // sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid
+  return UnmaskedNot(m);
+}
+template <typename T, size_t N, HWY_IF_LANES_LE(N, 4)>
+HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> m) {
+  // sizeof(T) == 2 and N <= 4: need to zero out the upper bits of ~m as there
+  // are fewer than 8 valid bits in m
+
+  // Return (~m) & ((1ull << N) - 1)
+  return AndNot(hwy::SizeTag<2>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
+}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> m) {
+  // sizeof(T) == 4: need to zero out the upper bits of ~m as there are at most
+  // 4 valid bits in m
+
+  // Return (~m) & ((1ull << N) - 1)
+  return AndNot(hwy::SizeTag<4>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
+}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> Not(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> m) {
+  // sizeof(T) == 8: need to zero out the upper bits of ~m as there are at most
+  // 2 valid bits in m
+
+  // Return (~m) & ((1ull << N) - 1)
+  return AndNot(hwy::SizeTag<8>(), m, Mask128<T, N>::FromBits((1ull << N) - 1));
+}
+
 }  // namespace detail
 
 template <typename T, size_t N>
@@ -1066,9 +1580,8 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
 
 template <typename T, size_t N>
 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
-  // Flip only the valid bits
-
-  return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
+  // Flip only the valid bits
+  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
 }
 
 template <typename T, size_t N>
@@ -1309,20 +1822,17 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API VFromD<D> Load(D /* tag */, const TFromD<D>* HWY_RESTRICT aligned) {
   return VFromD<D>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
 }
-// Generic for all vector lengths greater than or equal to 16 bytes.
-template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_BF16_D(D)>
-HWY_API VFromD<D> Load(D d, const bfloat16_t* HWY_RESTRICT aligned) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
-HWY_API Vec128<float16_t> Load(D d, const float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API Vec128<float16_t> Load(D, const float16_t* HWY_RESTRICT aligned) {
   return Vec128<float16_t>{_mm_load_ph(aligned)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Load(du, reinterpret_cast<const uint16_t*>(aligned)));
+}
 #endif  // HWY_HAVE_FLOAT16
+// Generic for all vector lengths greater than or equal to 16 bytes.
+template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
+HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Load(du, detail::U16LanePointer(aligned)));
 }
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API Vec128<float> Load(D /* tag */, const float* HWY_RESTRICT aligned) {
@@ -1337,21 +1847,17 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API VFromD<D> LoadU(D /* tag */, const TFromD<D>* HWY_RESTRICT p) {
   return VFromD<D>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
 }
-// Generic for all vector lengths greater than or equal to 16 bytes.
-template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_BF16_D(D)>
-HWY_API VFromD<D> LoadU(D d, const bfloat16_t* HWY_RESTRICT p) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
-HWY_API Vec128<float16_t> LoadU(D d, const float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API Vec128<float16_t> LoadU(D, const float16_t* HWY_RESTRICT p) {
   return Vec128<float16_t>{_mm_loadu_ph(p)};
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, LoadU(du, reinterpret_cast<const uint16_t*>(p)));
+}
 #endif  // HWY_HAVE_FLOAT16
+// Generic for all vector lengths greater than or equal to 16 bytes.
+template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
+HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
 }
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
@@ -1445,21 +1951,17 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
   _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
 }
-// Generic for all vector lengths greater than or equal to 16 bytes.
-template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_BF16_D(D)>
-HWY_API void Store(VFromD<D> v, D d, bfloat16_t* HWY_RESTRICT aligned) {
-  const RebindToUnsigned<decltype(d)> du;
-  Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
-HWY_API void Store(Vec128<float16_t> v, D d, float16_t* HWY_RESTRICT aligned) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API void Store(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT aligned) {
   _mm_store_ph(aligned, v.raw);
-#else
+}
+#endif  // HWY_HAVE_FLOAT16
+// Generic for all vector lengths greater than or equal to 16 bytes.
+template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
+HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
   const RebindToUnsigned<decltype(d)> du;
   Store(BitCast(du, v), du, reinterpret_cast<uint16_t*>(aligned));
-#endif  // HWY_HAVE_FLOAT16
 }
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API void Store(Vec128<float> v, D /* tag */, float* HWY_RESTRICT aligned) {
@@ -1475,21 +1977,17 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
 HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
   _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
 }
-// Generic for all vector lengths greater than or equal to 16 bytes.
-template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_BF16_D(D)>
-HWY_API void StoreU(VFromD<D> v, D d, bfloat16_t* HWY_RESTRICT p) {
-  const RebindToUnsigned<decltype(d)> du;
-  StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
-}
-template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
-HWY_API void StoreU(Vec128<float16_t> v, D d, float16_t* HWY_RESTRICT p) {
 #if HWY_HAVE_FLOAT16
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API void StoreU(Vec128<float16_t> v, D, float16_t* HWY_RESTRICT p) {
   _mm_storeu_ph(p, v.raw);
-#else
+}
+#endif  // HWY_HAVE_FLOAT16
+// Generic for all vector lengths greater than or equal to 16 bytes.
+template <class D, HWY_IF_V_SIZE_GT_D(D, 8), HWY_X86_IF_EMULATED_D(D)>
+HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   const RebindToUnsigned<decltype(d)> du;
   StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t*>(p));
-#endif  // HWY_HAVE_FLOAT16
 }
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API void StoreU(Vec128<float> v, D /* tag */, float* HWY_RESTRICT p) {
@@ -1553,20 +2051,24 @@ HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
|
|
|
1553
2051
|
template <typename T, size_t N, typename TI, size_t NI>
|
|
1554
2052
|
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
|
|
1555
2053
|
const Vec128<TI, NI> from) {
|
|
2054
|
+
const DFromV<decltype(from)> d;
|
|
2055
|
+
const Repartition<uint8_t, decltype(d)> du8;
|
|
2056
|
+
|
|
2057
|
+
const DFromV<decltype(bytes)> d_bytes;
|
|
2058
|
+
const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
|
|
1556
2059
|
#if HWY_TARGET == HWY_SSE2
|
|
1557
2060
|
#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle)
|
|
1558
2061
|
typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16)));
|
|
2062
|
+
(void)d;
|
|
2063
|
+
(void)du8;
|
|
2064
|
+
(void)d_bytes;
|
|
2065
|
+
(void)du8_bytes;
|
|
1559
2066
|
return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
|
|
1560
2067
|
__builtin_shuffle(reinterpret_cast<GccU8RawVectType>(bytes.raw),
|
|
1561
2068
|
reinterpret_cast<GccU8RawVectType>(from.raw)))};
|
|
1562
2069
|
#else
|
|
1563
|
-
const DFromV<decltype(from)> d;
|
|
1564
|
-
const Repartition<uint8_t, decltype(d)> du8;
|
|
1565
2070
|
const Full128<uint8_t> du8_full;
|
|
1566
2071
|
|
|
1567
|
-
const DFromV<decltype(bytes)> d_bytes;
|
|
1568
|
-
const Repartition<uint8_t, decltype(d_bytes)> du8_bytes;
|
|
1569
|
-
|
|
1570
2072
|
alignas(16) uint8_t result_bytes[16];
|
|
1571
2073
|
alignas(16) uint8_t u8_bytes[16];
|
|
1572
2074
|
alignas(16) uint8_t from_bytes[16];
|
|
@@ -1581,7 +2083,9 @@ HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
|
|
|
1581
2083
|
return BitCast(d, VFromD<decltype(du8)>{Load(du8_full, result_bytes).raw});
|
|
1582
2084
|
#endif
|
|
1583
2085
|
#else // SSSE3 or newer
|
|
1584
|
-
return
|
|
2086
|
+
return BitCast(
|
|
2087
|
+
d, VFromD<decltype(du8)>{_mm_shuffle_epi8(BitCast(du8_bytes, bytes).raw,
|
|
2088
|
+
BitCast(du8, from).raw)});
|
|
1585
2089
|
#endif
|
|
1586
2090
|
}
|
|
1587
2091
|
|
|
@@ -1636,8 +2140,11 @@ HWY_API Vec32<T> ShuffleTwo2301(const Vec32<T> a, const Vec32<T> b) {
|
|
|
1636
2140
|
_mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))};
|
|
1637
2141
|
return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled)));
|
|
1638
2142
|
#else
|
|
1639
|
-
|
|
1640
|
-
|
|
2143
|
+
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
2144
|
+
const auto shuffle_idx =
|
|
2145
|
+
BitCast(d2, Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
2146
|
+
0, 0, 0, 0));
|
|
2147
|
+
return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
1641
2148
|
#endif
|
|
1642
2149
|
}
|
|
1643
2150
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
@@ -1651,8 +2158,11 @@ HWY_API Vec64<T> ShuffleTwo2301(const Vec64<T> a, const Vec64<T> b) {
|
|
|
1651
2158
|
return Vec64<T>{
|
|
1652
2159
|
_mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))};
|
|
1653
2160
|
#else
|
|
1654
|
-
|
|
1655
|
-
|
|
2161
|
+
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
2162
|
+
const auto shuffle_idx = BitCast(
|
|
2163
|
+
d2,
|
|
2164
|
+
Dup128VecFromValues(d2_u, 0x0302, 0x0100, 0x0f0e, 0x0d0c, 0, 0, 0, 0));
|
|
2165
|
+
return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
1656
2166
|
#endif
|
|
1657
2167
|
}
|
|
1658
2168
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
@@ -1679,8 +2189,11 @@ HWY_API Vec32<T> ShuffleTwo1230(const Vec32<T> a, const Vec32<T> b) {
|
|
|
1679
2189
|
#else
|
|
1680
2190
|
const Twice<decltype(d)> d2;
|
|
1681
2191
|
const auto ba = Combine(d2, b, a);
|
|
1682
|
-
|
|
1683
|
-
|
|
2192
|
+
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
2193
|
+
const auto shuffle_idx =
|
|
2194
|
+
BitCast(d2, Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
2195
|
+
0, 0, 0, 0));
|
|
2196
|
+
return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
1684
2197
|
#endif
|
|
1685
2198
|
}
|
|
1686
2199
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
@@ -1695,8 +2208,11 @@ HWY_API Vec64<T> ShuffleTwo1230(const Vec64<T> a, const Vec64<T> b) {
|
|
|
1695
2208
|
#else
|
|
1696
2209
|
const Twice<decltype(d)> d2;
|
|
1697
2210
|
const auto ba = Combine(d2, b, a);
|
|
1698
|
-
|
|
1699
|
-
|
|
2211
|
+
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
2212
|
+
const auto shuffle_idx = BitCast(
|
|
2213
|
+
d2,
|
|
2214
|
+
Dup128VecFromValues(d2_u, 0x0100, 0x0706, 0x0d0c, 0x0b0a, 0, 0, 0, 0));
|
|
2215
|
+
return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
1700
2216
|
#endif
|
|
1701
2217
|
}
|
|
1702
2218
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
@@ -1723,8 +2239,11 @@ HWY_API Vec32<T> ShuffleTwo3012(const Vec32<T> a, const Vec32<T> b) {
|
|
|
1723
2239
|
#else
|
|
1724
2240
|
const Twice<decltype(d)> d2;
|
|
1725
2241
|
const auto ba = Combine(d2, b, a);
|
|
1726
|
-
|
|
1727
|
-
|
|
2242
|
+
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
2243
|
+
const auto shuffle_idx =
|
|
2244
|
+
BitCast(d2, Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
2245
|
+
0, 0, 0, 0));
|
|
2246
|
+
return Vec32<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
1728
2247
|
#endif
|
|
1729
2248
|
}
|
|
1730
2249
|
template <typename T, HWY_IF_T_SIZE(T, 2)>
|
|
@@ -1739,8 +2258,11 @@ HWY_API Vec64<T> ShuffleTwo3012(const Vec64<T> a, const Vec64<T> b) {
|
|
|
1739
2258
|
#else
|
|
1740
2259
|
const Twice<decltype(d)> d2;
|
|
1741
2260
|
const auto ba = Combine(d2, b, a);
|
|
1742
|
-
|
|
1743
|
-
|
|
2261
|
+
const RebindToUnsigned<decltype(d2)> d2_u;
|
|
2262
|
+
const auto shuffle_idx = BitCast(
|
|
2263
|
+
d2,
|
|
2264
|
+
Dup128VecFromValues(d2_u, 0x0504, 0x0302, 0x0908, 0x0f0e, 0, 0, 0, 0));
|
|
2265
|
+
return Vec64<T>{TableLookupBytes(ba, shuffle_idx).raw};
|
|
1744
2266
|
#endif
|
|
1745
2267
|
}
|
|
1746
2268
|
template <typename T, HWY_IF_T_SIZE(T, 4)>
|
|
@@ -1812,105 +2334,6 @@ HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
|
|
|
1812
2334
|
|
|
1813
2335
|
// Comparisons set a mask bit to 1 if the condition is true, else 0.
|
|
1814
2336
|
|
|
1815
|
-
// ------------------------------ MaskFromVec
|
|
1816
|
-
|
|
1817
|
-
namespace detail {
|
|
1818
|
-
|
|
1819
|
-
template <typename T, size_t N>
|
|
1820
|
-
-HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
-                                     const Vec128<T, N> v) {
-  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
-                                     const Vec128<T, N> v) {
-  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
-                                     const Vec128<T, N> v) {
-  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
-                                     const Vec128<T, N> v) {
-  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
-  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
-}
-// There do not seem to be native floating-point versions of these instructions.
-template <size_t N>
-HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
-  const RebindToSigned<DFromV<decltype(v)>> di;
-  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
-}
-template <size_t N>
-HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
-  const RebindToSigned<DFromV<decltype(v)>> di;
-  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
-}
-
-template <class D>
-using MFromD = decltype(MaskFromVec(VFromD<D>()));
-
-// ------------------------------ VecFromMask
-
-template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_UI16(T)>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_UI32(T)>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_UI64(T)>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
-}
-
-#if HWY_HAVE_FLOAT16
-template <size_t N>
-HWY_API Vec128<float16_t, N> VecFromMask(const Mask128<float16_t, N> v) {
-  return Vec128<float16_t, N>{_mm_castsi128_ph(_mm_movm_epi16(v.raw))};
-}
-#endif  // HWY_HAVE_FLOAT16
-
-template <size_t N>
-HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
-  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
-  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
-}
-
-// Generic for all vector lengths.
-template <class D>
-HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
-  return VecFromMask(v);
-}
-
-// ------------------------------ RebindMask (MaskFromVec)
-
-template <typename TFrom, size_t NFrom, class DTo, HWY_IF_V_SIZE_LE_D(DTo, 16)>
-HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
-  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
-  return MFromD<DTo>{m.raw};
-}
-
 // ------------------------------ TestBit

 namespace detail {
@@ -1970,7 +2393,11 @@ HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
 template <size_t N>
 HWY_API Mask128<float16_t, N> operator==(Vec128<float16_t, N> a,
                                          Vec128<float16_t, N> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 template <size_t N>
@@ -2010,7 +2437,11 @@ HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
 template <size_t N>
 HWY_API Mask128<float16_t, N> operator!=(Vec128<float16_t, N> a,
                                          Vec128<float16_t, N> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 template <size_t N>
@@ -2072,7 +2503,11 @@ HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
 template <size_t N>
 HWY_API Mask128<float16_t, N> operator>(Vec128<float16_t, N> a,
                                         Vec128<float16_t, N> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 template <size_t N>
@@ -2090,7 +2525,11 @@ HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
 template <size_t N>
 HWY_API Mask128<float16_t, N> operator>=(Vec128<float16_t, N> a,
                                          Vec128<float16_t, N> b) {
+  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
   return Mask128<float16_t, N>{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
+  HWY_DIAGNOSTICS(pop)
 }
 #endif  // HWY_HAVE_FLOAT16
 template <size_t N>
@@ -2494,7 +2933,7 @@ static HWY_INLINE V MaskOutVec128Iota(V v) {
 template <class D, typename T2, HWY_IF_V_SIZE_LE_D(D, 16)>
 HWY_API VFromD<D> Iota(D d, const T2 first) {
   const auto result_iota =
-      detail::Iota0(d) + Set(d,
+      detail::Iota0(d) + Set(d, ConvertScalarTo<TFromD<D>>(first));
 #if HWY_COMPILER_MSVC
   return detail::MaskOutVec128Iota(result_iota);
 #else
@@ -2619,9 +3058,11 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D /* tag */,
 }

 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
-HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D
+HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                                const TFromD<D>* HWY_RESTRICT p) {
-
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{
+                        _mm_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
 }

 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
@@ -3216,23 +3657,182 @@ HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
   return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
 }

-// ------------------------------
-
-
-
+// ------------------------------ AddSub
+
+#if HWY_TARGET <= HWY_SSSE3
+template <size_t N, HWY_IF_LANES_GT(N, 1)>
+HWY_API Vec128<float, N> AddSub(Vec128<float, N> a, Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_addsub_ps(a.raw, b.raw)};
+}
+HWY_API Vec128<double> AddSub(Vec128<double> a, Vec128<double> b) {
+  return Vec128<double>{_mm_addsub_pd(a.raw, b.raw)};
+}
+#endif  // HWY_TARGET <= HWY_SSSE3
+
+// ------------------------------ SumsOf8
+template <size_t N>
+HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
+  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
+}
+
+// Generic for all vector lengths
+template <class V, HWY_IF_I8_D(DFromV<V>)>
+HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const Repartition<int64_t, decltype(d)> di64;
+
+  // Adjust the values of v to be in the 0..255 range by adding 128 to each lane
+  // of v (which is the same as an bitwise XOR of each i8 lane by 128) and then
+  // bitcasting the Xor result to an u8 vector.
+  const auto v_adj = BitCast(du, Xor(v, SignBit(d)));
+
+  // Need to add -1024 to each i64 lane of the result of the SumsOf8(v_adj)
+  // operation to account for the adjustment made above.
+  return BitCast(di64, SumsOf8(v_adj)) + Set(di64, int64_t{-1024});
+}
+
+#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
+#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
+#else
+#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
+#endif
+
+template <size_t N>
+HWY_API Vec128<uint64_t, N / 8> SumsOf8AbsDiff(const Vec128<uint8_t, N> a,
+                                               const Vec128<uint8_t, N> b) {
+  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(a.raw, b.raw)};
+}
+
+// Generic for all vector lengths
+template <class V, HWY_IF_I8_D(DFromV<V>)>
+HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const RepartitionToWideX3<decltype(d)> di64;
+
+  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
+  // each lane of a and b (which is the same as an bitwise XOR of each i8 lane
+  // by 128) and then bitcasting the results of the Xor operations to u8
+  // vectors.
+  const auto i8_msb = SignBit(d);
+  const auto a_adj = BitCast(du, Xor(a, i8_msb));
+  const auto b_adj = BitCast(du, Xor(b, i8_msb));
+
+  // The result of SumsOf8AbsDiff(a_adj, b_adj) can simply be bitcasted to an
+  // i64 vector as |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true
+  return BitCast(di64, SumsOf8AbsDiff(a_adj, b_adj));
+}
+
+// ------------------------------ SumsOf4
+#if HWY_TARGET <= HWY_AVX3
+namespace detail {
+
+template <size_t N>
+HWY_INLINE Vec128<uint32_t, (N + 3) / 4> SumsOf4(
+    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/,
+    Vec128<uint8_t, N> v) {
+  const DFromV<decltype(v)> d;
+
+  // _mm_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
+  // zeroed out and the sums of the 4 consecutive lanes are already in the
+  // even uint16_t lanes of the _mm_maskz_dbsad_epu8 result.
+  return Vec128<uint32_t, (N + 3) / 4>{
+      _mm_maskz_dbsad_epu8(static_cast<__mmask8>(0x55), v.raw, Zero(d).raw, 0)};
+}
+
+// detail::SumsOf4 for Vec128<int8_t, N> on AVX3 is implemented in x86_512-inl.h
+
+}  // namespace detail
+#endif  // HWY_TARGET <= HWY_AVX3
+
+// ------------------------------ SumsOfAdjQuadAbsDiff
+
+#if HWY_TARGET <= HWY_SSE4
+#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
+#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
+#else
+#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
+#endif
+
+template <int kAOffset, int kBOffset, size_t N>
+HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfAdjQuadAbsDiff(
+    Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
+  static_assert(0 <= kAOffset && kAOffset <= 1,
+                "kAOffset must be between 0 and 1");
+  static_assert(0 <= kBOffset && kBOffset <= 3,
+                "kBOffset must be between 0 and 3");
+  return Vec128<uint16_t, (N + 1) / 2>{
+      _mm_mpsadbw_epu8(a.raw, b.raw, (kAOffset << 2) | kBOffset)};
 }

-
-
+// Generic for all vector lengths
+template <int kAOffset, int kBOffset, class V, HWY_IF_I8_D(DFromV<V>)>
+HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfAdjQuadAbsDiff(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const RepartitionToWide<decltype(d)> dw;
+
+  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
+  // each lane of a and b (which is the same as an bitwise XOR of each i8 lane
+  // by 128) and then bitcasting the results of the Xor operations to u8
+  // vectors.
+  const auto i8_msb = SignBit(d);
+  const auto a_adj = BitCast(du, Xor(a, i8_msb));
+  const auto b_adj = BitCast(du, Xor(b, i8_msb));
+
+  // The result of SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj) can
+  // simply be bitcasted to an i16 vector as
+  // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true.
+  return BitCast(dw, SumsOfAdjQuadAbsDiff<kAOffset, kBOffset>(a_adj, b_adj));
+}
+#endif
+
+// ------------------------------ SumsOfShuffledQuadAbsDiff
+
+#if HWY_TARGET <= HWY_AVX3
+#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
+#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
 #else
-#define
+#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
 #endif

-template <size_t N>
-HWY_API Vec128<
-
-
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, size_t N>
+HWY_API Vec128<uint16_t, (N + 1) / 2> SumsOfShuffledQuadAbsDiff(
+    Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
+  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
+  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
+  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
+  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
+  return Vec128<uint16_t, (N + 1) / 2>{
+      _mm_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
+}
+
+// Generic for all vector lengths
+template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V,
+          HWY_IF_I8_D(DFromV<V>)>
+HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOfShuffledQuadAbsDiff(V a,
+                                                                       V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  const RepartitionToWide<decltype(d)> dw;
+
+  // Adjust the values of a and b to be in the 0..255 range by adding 128 to
+  // each lane of a and b (which is the same as an bitwise XOR of each i8 lane
+  // by 128) and then bitcasting the results of the Xor operations to u8
+  // vectors.
+  const auto i8_msb = SignBit(d);
+  const auto a_adj = BitCast(du, Xor(a, i8_msb));
+  const auto b_adj = BitCast(du, Xor(b, i8_msb));
+
+  // The result of
+  // SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj) can
+  // simply be bitcasted to an i16 vector as
+  // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true.
+  return BitCast(
+      dw, SumsOfShuffledQuadAbsDiff<kIdx3, kIdx2, kIdx1, kIdx0>(a_adj, b_adj));
 }
+#endif

 // ------------------------------ SaturatedAdd

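Editor's note (not part of the upstream diff): the SumsOf8 op added in the hunk above wraps the x86 psadbw instruction. A minimal usage sketch, assuming the usual Highway per-target boilerplate and the illustrative function name SumOfFirst16Bytes:

// Sketch only: sums each group of 8 consecutive bytes into a u64 lane.
namespace hn = hwy::HWY_NAMESPACE;
uint64_t SumOfFirst16Bytes(const uint8_t* HWY_RESTRICT bytes) {
  const hn::Full128<uint8_t> d8;
  const auto v = hn::LoadU(d8, bytes);   // 16 x u8
  const auto sums = hn::SumsOf8(v);      // 2 x u64 partial sums
  return hn::GetLane(sums) + hn::ExtractLane(sums, 1);
}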
@@ -3631,16 +4231,62 @@ HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
 #endif
 }

+#if HWY_TARGET <= HWY_AVX3
 template <size_t N>
 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
-#if HWY_TARGET <= HWY_AVX3
   return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
+}
 #else
+// I64 Abs is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2
+template <class V, HWY_IF_I64(TFromV<V>)>
+HWY_API V Abs(V v) {
   const auto zero = Zero(DFromV<decltype(v)>());
-  return
+  return IfNegativeThenElse(v, zero - v, v);
+}
+#endif
+
+#ifdef HWY_NATIVE_SATURATED_ABS
+#undef HWY_NATIVE_SATURATED_ABS
+#else
+#define HWY_NATIVE_SATURATED_ABS
+#endif
+
+// Generic for all vector lengths
+template <class V, HWY_IF_I8(TFromV<V>)>
+HWY_API V SaturatedAbs(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v))));
+}
+
+// Generic for all vector lengths
+template <class V, HWY_IF_I16(TFromV<V>)>
+HWY_API V SaturatedAbs(V v) {
+  return Max(v, SaturatedSub(Zero(DFromV<V>()), v));
+}
+
+// Generic for all vector lengths
+template <class V, HWY_IF_I32(TFromV<V>)>
+HWY_API V SaturatedAbs(V v) {
+  const auto abs_v = Abs(v);
+
+#if HWY_TARGET <= HWY_SSE4
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, Min(BitCast(du, abs_v),
+                        Set(du, static_cast<uint32_t>(LimitsMax<int32_t>()))));
+#else
+  return Add(abs_v, BroadcastSignBit(abs_v));
 #endif
 }

+// Generic for all vector lengths
+template <class V, HWY_IF_I64(TFromV<V>)>
+HWY_API V SaturatedAbs(V v) {
+  const auto abs_v = Abs(v);
+  return Add(abs_v, BroadcastSignBit(abs_v));
+}
+
 // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL
 // srli_epi64: the count should be unsigned int. Note that this is not the same
 // as the Shift3264Count in x86_512-inl.h (GCC also requires int).
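Editor's note (not part of the upstream diff): SaturatedAbs, added above, differs from Abs only at the most negative value, which saturates to the type's maximum instead of wrapping. A hedged sketch with an illustrative function name:

// Sketch only: SaturatedAbs(-128) yields 127 for int8_t lanes, not -128.
namespace hn = hwy::HWY_NAMESPACE;
void SaturatedAbsExample(int8_t* HWY_RESTRICT out) {
  const hn::Full128<int8_t> d;
  const auto v = hn::Set(d, int8_t{-128});
  hn::StoreU(hn::SaturatedAbs(v), d, out);  // every lane is 127
}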
@@ -3743,6 +4389,49 @@ HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
 #endif
 }

+// ------------------------------ IfNegativeThenNegOrUndefIfZero
+
+#if HWY_TARGET <= HWY_SSSE3
+
+#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#else
+#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
+#endif
+
+template <size_t N>
+HWY_API Vec128<int8_t, N> IfNegativeThenNegOrUndefIfZero(Vec128<int8_t, N> mask,
+                                                         Vec128<int8_t, N> v) {
+  return Vec128<int8_t, N>{_mm_sign_epi8(v.raw, mask.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<int16_t, N> IfNegativeThenNegOrUndefIfZero(
+    Vec128<int16_t, N> mask, Vec128<int16_t, N> v) {
+  return Vec128<int16_t, N>{_mm_sign_epi16(v.raw, mask.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<int32_t, N> IfNegativeThenNegOrUndefIfZero(
+    Vec128<int32_t, N> mask, Vec128<int32_t, N> v) {
+  return Vec128<int32_t, N>{_mm_sign_epi32(v.raw, mask.raw)};
+}
+
+// Generic for all vector lengths
+template <class V, HWY_IF_I64_D(DFromV<V>)>
+HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
+#if HWY_TARGET <= HWY_AVX3
+  // MaskedSubOr is more efficient than IfNegativeThenElse on AVX3
+  const DFromV<decltype(v)> d;
+  return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v);
+#else
+  // IfNegativeThenElse is more efficient than MaskedSubOr on SSE4/AVX2
+  return IfNegativeThenElse(mask, Neg(v), v);
+#endif
+}
+
+#endif  // HWY_TARGET <= HWY_SSSE3
+
 // ------------------------------ ShiftLeftSame

 template <size_t N>
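Editor's note (not part of the upstream diff): the new IfNegativeThenNegOrUndefIfZero op maps to the psign family of instructions, negating lanes of v where the corresponding mask lane is negative. A minimal sketch with an illustrative function name:

// Sketch only: lanes with mask < 0 are negated; lanes with mask == 0 are
// unspecified, so callers must ensure mask lanes are nonzero.
namespace hn = hwy::HWY_NAMESPACE;
void ApplySign(const int16_t* HWY_RESTRICT m, const int16_t* HWY_RESTRICT x,
               int16_t* HWY_RESTRICT out) {
  const hn::Full128<int16_t> d;
  const auto mask = hn::LoadU(d, m);
  const auto v = hn::LoadU(d, x);
  hn::StoreU(hn::IfNegativeThenNegOrUndefIfZero(mask, v), d, out);
}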
@@ -4000,6 +4689,361 @@ HWY_API V AbsDiff(V a, V b) {
   return Abs(a - b);
 }

+// ------------------------------ MaskedMinOr
+
+#if HWY_TARGET <= HWY_AVX3
+
+#ifdef HWY_NATIVE_MASKED_ARITH
+#undef HWY_NATIVE_MASKED_ARITH
+#else
+#define HWY_NATIVE_MASKED_ARITH
+#endif
+
+template <typename T, size_t N, HWY_IF_U8(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, size_t N, HWY_IF_I8(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U16(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, size_t N, HWY_IF_I16(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U32(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, size_t N, HWY_IF_I32(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U64(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, size_t N, HWY_IF_I64(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_F32(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_F64(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, size_t N, HWY_IF_F16(T)>
+HWY_API Vec128<T, N> MaskedMinOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_min_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMaxOr
+
+template <typename T, size_t N, HWY_IF_U8(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, size_t N, HWY_IF_I8(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U16(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, size_t N, HWY_IF_I16(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U32(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, size_t N, HWY_IF_I32(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U64(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)};
+}
+template <typename T, size_t N, HWY_IF_I64(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_F32(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_F64(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, size_t N, HWY_IF_F16(T)>
+HWY_API Vec128<T, N> MaskedMaxOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_max_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedAddOr
+
+template <typename T, size_t N, HWY_IF_UI8(T)>
+HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_UI16(T)>
+HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_UI32(T)>
+HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_UI64(T)>
+HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_F32(T)>
+HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_add_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_F64(T)>
+HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_add_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, size_t N, HWY_IF_F16(T)>
+HWY_API Vec128<T, N> MaskedAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_add_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedSubOr
+
+template <typename T, size_t N, HWY_IF_UI8(T)>
+HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_UI16(T)>
+HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_UI32(T)>
+HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_UI64(T)>
+HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_F32(T)>
+HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_F64(T)>
+HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <typename T, size_t N, HWY_IF_F16(T)>
+HWY_API Vec128<T, N> MaskedSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                 Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedMulOr
+
+// There are no elementwise integer mask_mul. Generic for all vector lengths.
+template <class V, class M>
+HWY_API V MaskedMulOr(V no, M m, V a, V b) {
+  return IfThenElse(m, a * b, no);
+}
+
+template <size_t N>
+HWY_API Vec128<float, N> MaskedMulOr(Vec128<float, N> no, Mask128<float, N> m,
+                                     Vec128<float, N> a, Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<double, N> MaskedMulOr(Vec128<double, N> no,
+                                      Mask128<double, N> m, Vec128<double, N> a,
+                                      Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Vec128<float16_t, N> MaskedMulOr(Vec128<float16_t, N> no,
+                                         Mask128<float16_t, N> m,
+                                         Vec128<float16_t, N> a,
+                                         Vec128<float16_t, N> b) {
+  return Vec128<float16_t, N>{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// ------------------------------ MaskedDivOr
+
+template <size_t N>
+HWY_API Vec128<float, N> MaskedDivOr(Vec128<float, N> no, Mask128<float, N> m,
+                                     Vec128<float, N> a, Vec128<float, N> b) {
+  return Vec128<float, N>{_mm_mask_div_ps(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <size_t N>
+HWY_API Vec128<double, N> MaskedDivOr(Vec128<double, N> no,
+                                      Mask128<double, N> m, Vec128<double, N> a,
+                                      Vec128<double, N> b) {
+  return Vec128<double, N>{_mm_mask_div_pd(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#if HWY_HAVE_FLOAT16
+template <size_t N>
+HWY_API Vec128<float16_t, N> MaskedDivOr(Vec128<float16_t, N> no,
+                                         Mask128<float16_t, N> m,
+                                         Vec128<float16_t, N> a,
+                                         Vec128<float16_t, N> b) {
+  return Vec128<float16_t, N>{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+// Generic for all vector lengths
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V MaskedDivOr(V no, MFromD<DFromV<V>> m, V a, V b) {
+  return IfThenElse(m, Div(a, b), no);
+}
+
+// ------------------------------ MaskedModOr
+// Generic for all vector lengths
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V MaskedModOr(V no, MFromD<DFromV<V>> m, V a, V b) {
+  return IfThenElse(m, Mod(a, b), no);
+}
+
+// ------------------------------ MaskedSatAddOr
+
+template <typename T, size_t N, HWY_IF_I8(T)>
+HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                    Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U8(T)>
+HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                    Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_I16(T)>
+HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                    Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U16(T)>
+HWY_API Vec128<T, N> MaskedSatAddOr(Vec128<T, N> no, Mask128<T, N> m,
+                                    Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+// ------------------------------ MaskedSatSubOr
+
+template <typename T, size_t N, HWY_IF_I8(T)>
+HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                    Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U8(T)>
+HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                    Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_I16(T)>
+HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                    Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+template <typename T, size_t N, HWY_IF_U16(T)>
+HWY_API Vec128<T, N> MaskedSatSubOr(Vec128<T, N> no, Mask128<T, N> m,
+                                    Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)};
+}
+
+#endif  // HWY_TARGET <= HWY_AVX3
+
 // ------------------------------ Floating-point multiply-add variants

 #if HWY_HAVE_FLOAT16
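Editor's note (not part of the upstream diff): the Masked*Or ops added above all follow the same pattern: lanes selected by the mask take the arithmetic result, the rest take the fallback vector `no`. A minimal sketch, with MaskedMinExample as an illustrative name:

// Sketch only: where a[i] < 0, write Min(a[i], b[i]); elsewhere keep b[i].
namespace hn = hwy::HWY_NAMESPACE;
void MaskedMinExample(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                      float* HWY_RESTRICT out) {
  const hn::Full128<float> d;
  const auto va = hn::LoadU(d, a);
  const auto vb = hn::LoadU(d, b);
  const auto m = hn::Lt(va, hn::Zero(d));
  hn::StoreU(hn::MaskedMinOr(vb, m, va, vb), d, out);
}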
@@ -4035,7 +5079,7 @@ HWY_API Vec128<float16_t, N> NegMulSub(Vec128<float16_t, N> mul,
 template <size_t N>
 HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
                                 Vec128<float, N> add) {
-#if HWY_TARGET >= HWY_SSE4
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   return mul * x + add;
 #else
   return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
@@ -4044,7 +5088,7 @@ HWY_API Vec128<float, N> MulAdd(Vec128<float, N> mul, Vec128<float, N> x,
 template <size_t N>
 HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
                                  Vec128<double, N> add) {
-#if HWY_TARGET >= HWY_SSE4
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   return mul * x + add;
 #else
   return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
@@ -4055,7 +5099,7 @@ HWY_API Vec128<double, N> MulAdd(Vec128<double, N> mul, Vec128<double, N> x,
 template <size_t N>
 HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
                                    Vec128<float, N> add) {
-#if HWY_TARGET >= HWY_SSE4
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   return add - mul * x;
 #else
   return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
@@ -4064,7 +5108,7 @@ HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> mul, Vec128<float, N> x,
 template <size_t N>
 HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x,
                                     Vec128<double, N> add) {
-#if HWY_TARGET >= HWY_SSE4
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   return add - mul * x;
 #else
   return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
@@ -4075,7 +5119,7 @@ HWY_API Vec128<double, N> NegMulAdd(Vec128<double, N> mul, Vec128<double, N> x,
 template <size_t N>
 HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
                                 Vec128<float, N> sub) {
-#if HWY_TARGET >= HWY_SSE4
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   return mul * x - sub;
 #else
   return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
@@ -4084,33 +5128,65 @@ HWY_API Vec128<float, N> MulSub(Vec128<float, N> mul, Vec128<float, N> x,
 template <size_t N>
 HWY_API Vec128<double, N> MulSub(Vec128<double, N> mul, Vec128<double, N> x,
                                  Vec128<double, N> sub) {
-#if HWY_TARGET >= HWY_SSE4
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
   return mul * x - sub;
 #else
   return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
 #endif
 }

-// Returns -mul * x - sub
-template <size_t N>
-HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x,
-                                   Vec128<float, N> sub) {
-#if HWY_TARGET >= HWY_SSE4
-  return Neg(mul) * x - sub;
+// Returns -mul * x - sub
+template <size_t N>
+HWY_API Vec128<float, N> NegMulSub(Vec128<float, N> mul, Vec128<float, N> x,
+                                   Vec128<float, N> sub) {
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
+  return Neg(mul) * x - sub;
+#else
+  return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
+#endif
+}
+template <size_t N>
+HWY_API Vec128<double, N> NegMulSub(Vec128<double, N> mul, Vec128<double, N> x,
+                                    Vec128<double, N> sub) {
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
+  return Neg(mul) * x - sub;
+#else
+  return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
+#endif
+}
+
+#if HWY_TARGET <= HWY_SSSE3
+
+#if HWY_HAVE_FLOAT16
+template <size_t N, HWY_IF_LANES_GT(N, 1)>
+HWY_API Vec128<float16_t, N> MulAddSub(Vec128<float16_t, N> mul,
+                                       Vec128<float16_t, N> x,
+                                       Vec128<float16_t, N> sub_or_add) {
+  return Vec128<float16_t, N>{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
+template <size_t N, HWY_IF_LANES_GT(N, 1)>
+HWY_API Vec128<float, N> MulAddSub(Vec128<float, N> mul, Vec128<float, N> x,
+                                   Vec128<float, N> sub_or_add) {
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
+  return AddSub(mul * x, sub_or_add);
 #else
-  return Vec128<float, N>{
+  return Vec128<float, N>{_mm_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)};
 #endif
 }
-
-HWY_API Vec128<double
-
-#if HWY_TARGET >= HWY_SSE4
-  return
+
+HWY_API Vec128<double> MulAddSub(Vec128<double> mul, Vec128<double> x,
+                                 Vec128<double> sub_or_add) {
+#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA)
+  return AddSub(mul * x, sub_or_add);
 #else
-  return Vec128<double
+  return Vec128<double>{_mm_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)};
 #endif
 }
+#endif  // HWY_TARGET <= HWY_SSSE3
+
 // ------------------------------ Floating-point square root

 // Full precision square root
@@ -4508,116 +5584,129 @@ HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,

 namespace detail {

-template <int kScale,
-HWY_INLINE
-
-
-
-      index.raw, kScale)};
+template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
+HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
+                                        Vec128<int32_t, N> indices) {
+  return Vec128<T, N>{_mm_i32gather_epi32(
+      reinterpret_cast<const int32_t*>(base), indices.raw, kScale)};
 }

-template <int kScale,
-HWY_INLINE
-
-
-      reinterpret_cast<const GatherIndex64*>(base), index.raw, kScale)};
+template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
+HWY_INLINE Vec128<T, N> NativeGather128(const T* HWY_RESTRICT base,
+                                        Vec128<int64_t, N> indices) {
+  return Vec128<T, N>{_mm_i64gather_epi64(
+      reinterpret_cast<const GatherIndex64*>(base), indices.raw, kScale)};
 }

-template <int kScale,
-HWY_INLINE
-
-  return
+template <int kScale, size_t N>
+HWY_INLINE Vec128<float, N> NativeGather128(const float* HWY_RESTRICT base,
+                                            Vec128<int32_t, N> indices) {
+  return Vec128<float, N>{_mm_i32gather_ps(base, indices.raw, kScale)};
 }

-template <int kScale,
-HWY_INLINE
-
-
-  return VFromD<D>{_mm_i64gather_pd(base, index.raw, kScale)};
+template <int kScale, size_t N>
+HWY_INLINE Vec128<double, N> NativeGather128(const double* HWY_RESTRICT base,
+                                             Vec128<int64_t, N> indices) {
+  return Vec128<double, N>{_mm_i64gather_pd(base, indices.raw, kScale)};
 }

-template <int kScale,
-HWY_INLINE
-
-
-
-  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
+template <int kScale, typename T, size_t N, HWY_IF_UI32(T)>
+HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
+                                                Mask128<T, N> m,
+                                                const T* HWY_RESTRICT base,
+                                                Vec128<int32_t, N> indices) {
 #if HWY_TARGET <= HWY_AVX3
-  return
-
+  return Vec128<T, N>{_mm_mmask_i32gather_epi32(
+      no.raw, m.raw, indices.raw, reinterpret_cast<const int32_t*>(base),
       kScale)};
 #else
-  return
-
-
+  return Vec128<T, N>{
+      _mm_mask_i32gather_epi32(no.raw, reinterpret_cast<const int32_t*>(base),
+                               indices.raw, m.raw, kScale)};
 #endif
 }

-template <int kScale,
-HWY_INLINE
-
-
-
-  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
+template <int kScale, typename T, size_t N, HWY_IF_UI64(T)>
+HWY_INLINE Vec128<T, N> NativeMaskedGatherOr128(Vec128<T, N> no,
+                                                Mask128<T, N> m,
+                                                const T* HWY_RESTRICT base,
+                                                Vec128<int64_t, N> indices) {
 #if HWY_TARGET <= HWY_AVX3
-  return
-
-
+  return Vec128<T, N>{_mm_mmask_i64gather_epi64(
+      no.raw, m.raw, indices.raw, reinterpret_cast<const GatherIndex64*>(base),
+      kScale)};
 #else
-  return
-
-
+  return Vec128<T, N>{_mm_mask_i64gather_epi64(
+      no.raw, reinterpret_cast<const GatherIndex64*>(base), indices.raw, m.raw,
+      kScale)};
 #endif
 }

-template <int kScale,
-HWY_INLINE
-
-
-  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
-  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
+template <int kScale, size_t N>
+HWY_INLINE Vec128<float, N> NativeMaskedGatherOr128(
+    Vec128<float, N> no, Mask128<float, N> m, const float* HWY_RESTRICT base,
+    Vec128<int32_t, N> indices) {
 #if HWY_TARGET <= HWY_AVX3
-  return
-      _mm_mmask_i32gather_ps(
+  return Vec128<float, N>{
+      _mm_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)};
 #else
-  return
-      _mm_mask_i32gather_ps(
+  return Vec128<float, N>{
+      _mm_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)};
 #endif
 }

-template <int kScale,
-HWY_INLINE
-
-  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
-  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
+template <int kScale, size_t N>
+HWY_INLINE Vec128<double, N> NativeMaskedGatherOr128(
+    Vec128<double, N> no, Mask128<double, N> m, const double* HWY_RESTRICT base,
+    Vec128<int64_t, N> indices) {
 #if HWY_TARGET <= HWY_AVX3
-  return
-      _mm_mmask_i64gather_pd(
+  return Vec128<double, N>{
+      _mm_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)};
 #else
-  return
-      _mm_mask_i64gather_pd(
+  return Vec128<double, N>{
+      _mm_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)};
 #endif
 }

 }  // namespace detail

-template <class D, HWY_IF_V_SIZE_LE_D(D, 16)
-HWY_API VFromD<D> GatherOffset(D d, const
-
-
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> offsets) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
+  return detail::NativeGather128<1>(base, offsets);
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
+HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
+                              VFromD<RebindToSigned<D>> indices) {
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+  return detail::NativeGather128<sizeof(T)>(base, indices);
 }
-
-
-
-
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+                                      const T* HWY_RESTRICT base,
+                                      VFromD<RebindToSigned<D>> indices) {
+  // For partial vectors, ensure upper mask lanes are zero to prevent faults.
+  if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
+
+  const RebindToSigned<decltype(d)> di;
+  (void)di;  // for HWY_DASSERT
+  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
+  return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
 }
-
+
+// Generic for all vector lengths.
+template <class D>
 HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
-                                    const
-
-  return
+                                    const TFromD<D>* HWY_RESTRICT base,
+                                    VFromD<RebindToSigned<D>> indices) {
+  return MaskedGatherIndexOr(Zero(d), m, d, base, indices);
 }

 #endif  // HWY_TARGET <= HWY_AVX2
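Editor's note (not part of the upstream diff): GatherIndex, reworked above, loads base[indices[i]] into each lane and now asserts that indices are non-negative. A minimal sketch with an illustrative function name:

// Sketch only: per-lane indexed load; indices must be >= 0.
namespace hn = hwy::HWY_NAMESPACE;
void GatherExample(const float* HWY_RESTRICT base,
                   const int32_t* HWY_RESTRICT idx, float* HWY_RESTRICT out) {
  const hn::Full128<float> d;
  const hn::RebindToSigned<decltype(d)> di;
  const auto indices = hn::LoadU(di, idx);
  hn::StoreU(hn::GatherIndex(d, base, indices), d, out);
}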
@@ -4740,9 +5829,7 @@ HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
|
|
|
4740
5829
|
const RebindToUnsigned<decltype(d)> du;
|
|
4741
5830
|
const uint16_t lane = static_cast<uint16_t>(
|
|
4742
5831
|
_mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF);
|
|
4743
|
-
T
|
|
4744
|
-
CopySameSize(&lane, &ret); // for float16_t
|
|
4745
|
-
return ret;
|
|
5832
|
+
return BitCastScalar<T>(lane);
|
|
4746
5833
|
}
|
|
4747
5834
|
|
|
4748
5835
|
template <size_t kLane, typename T, size_t N, HWY_IF_UI32(T)>
|
|
@@ -4780,9 +5867,7 @@ HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
|
|
|
4780
5867
|
#else
|
|
4781
5868
|
// Bug in the intrinsic, returns int but should be float.
|
|
4782
5869
|
const int32_t bits = _mm_extract_ps(v.raw, kLane);
|
|
4783
|
-
float
|
|
4784
|
-
CopySameSize(&bits, &ret);
|
|
4785
|
-
return ret;
|
|
5870
|
+
return BitCastScalar<float>(bits);
|
|
4786
5871
|
#endif
|
|
4787
5872
|
}
|
|
4788
5873
|
|
|
@@ -4958,8 +6043,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
|
|
|
4958
6043
|
static_assert(kLane < N, "Lane index out of bounds");
|
|
4959
6044
|
const DFromV<decltype(v)> d;
|
|
4960
6045
|
const RebindToUnsigned<decltype(d)> du;
|
|
4961
|
-
uint16_t bits;
|
|
4962
|
-
CopySameSize(&t, &bits); // for float16_t
|
|
6046
|
+
const uint16_t bits = BitCastScalar<uint16_t>(t);
|
|
4963
6047
|
return BitCast(d, VFromD<decltype(du)>{
|
|
4964
6048
|
_mm_insert_epi16(BitCast(du, v).raw, bits, kLane)});
|
|
4965
6049
|
}
|
|
@@ -4970,8 +6054,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
|
|
|
4970
6054
|
#if HWY_TARGET >= HWY_SSSE3
|
|
4971
6055
|
return InsertLaneUsingBroadcastAndBlend(v, kLane, t);
|
|
4972
6056
|
#else
|
|
4973
|
-
MakeSigned<T> ti;
|
|
4974
|
-
CopySameSize(&t, &ti); // don't just cast because T might be float.
|
|
6057
|
+
const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
|
|
4975
6058
|
return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
|
|
4976
6059
|
#endif
|
|
4977
6060
|
}
|
|
@@ -4990,8 +6073,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
|
|
|
4990
6073
|
return BitCast(
|
|
4991
6074
|
d, Vec128<double, N>{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)});
|
|
4992
6075
|
#else
|
|
4993
|
-
MakeSigned<T> ti;
|
|
4994
|
-
CopySameSize(&t, &ti); // don't just cast because T might be float.
|
|
6076
|
+
const MakeSigned<T> ti = BitCastScalar<MakeSigned<T>>(t);
|
|
4995
6077
|
return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
|
|
4996
6078
|
#endif
|
|
4997
6079
|
}
|
|
@@ -5527,9 +6609,9 @@ HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
|
|
|
5527
6609
|
return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))});
|
|
5528
6610
|
#else
|
|
5529
6611
|
const RebindToSigned<decltype(d)> di;
|
|
5530
|
-
|
|
5531
|
-
0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100
|
|
5532
|
-
return BitCast(d, TableLookupBytes(v,
|
|
6612
|
+
const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
|
|
6613
|
+
di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
|
|
6614
|
+
return BitCast(d, TableLookupBytes(v, shuffle));
|
|
5533
6615
|
#endif
|
|
5534
6616
|
}
|
|
5535
6617
|
|
|
@@ -5578,9 +6660,9 @@ HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
|
|
|
5578
6660
|
return BitCast(d, VU{shuf_result});
|
|
5579
6661
|
#else
|
|
5580
6662
|
const RebindToSigned<decltype(d)> di;
|
|
5581
|
-
|
|
5582
|
-
0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C
|
|
5583
|
-
return BitCast(d, TableLookupBytes(v,
|
|
6663
|
+
const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
|
|
6664
|
+
di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C);
|
|
6665
|
+
return BitCast(d, TableLookupBytes(v, shuffle));
|
|
5584
6666
|
#endif
|
|
5585
6667
|
}
|
|
5586
6668
|
|
|
@@ -5615,9 +6697,9 @@ HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
|
|
|
5615
6697
|
_MM_SHUFFLE(0, 1, 2, 3))});
|
|
5616
6698
|
#else
|
|
5617
6699
|
const RebindToSigned<decltype(d)> di;
|
|
5618
|
-
|
|
5619
|
-
0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908
|
|
5620
|
-
return BitCast(d, TableLookupBytes(v,
|
|
6700
|
+
const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
|
|
6701
|
+
di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908);
|
|
6702
|
+
return BitCast(d, TableLookupBytes(v, shuffle));
|
|
5621
6703
|
#endif
|
|
5622
6704
|
}
|
|
5623
6705
|
|
|
@@ -5641,9 +6723,9 @@ HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
|
|
|
5641
6723
|
return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
|
|
5642
6724
|
#else
|
|
5643
6725
|
const RebindToSigned<decltype(d)> di;
|
|
5644
|
-
|
|
5645
|
-
0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100
|
|
5646
|
-
return BitCast(d, TableLookupBytes(v,
|
|
6726
|
+
const VFromD<decltype(di)> shuffle = Dup128VecFromValues(
|
|
6727
|
+
di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
|
|
6728
|
+
return BitCast(d, TableLookupBytes(v, shuffle));
|
|
5647
6729
|
#endif
|
|
5648
6730
|
}
|
|
5649
6731
|
|
|
@@ -5758,7 +6840,11 @@ template <size_t kIdx3210, class V>
 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                   hwy::SizeTag<2> /*lane_size_tag*/,
                                   hwy::SizeTag<8> /*vect_size_tag*/, V v) {
-
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d,
+                 VFromD<decltype(du)>{_mm_shufflelo_epi16(
+                     BitCast(du, v).raw, static_cast<int>(kIdx3210 & 0xFF))});
 }

 #if HWY_TARGET == HWY_SSE2
@@ -5766,8 +6852,12 @@ template <size_t kIdx3210, class V>
 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                   hwy::SizeTag<2> /*lane_size_tag*/,
                                   hwy::SizeTag<16> /*vect_size_tag*/, V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   constexpr int kShuffle = static_cast<int>(kIdx3210 & 0xFF);
-  return
+  return BitCast(
+      d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
+             _mm_shufflelo_epi16(BitCast(du, v).raw, kShuffle), kShuffle)});
 }

 template <size_t kIdx3210, size_t kVectSize, class V,
@@ -6173,7 +7263,7 @@ template <class D, HWY_IF_T_SIZE_ONE_OF_D(
                       (1 << 4) | (1 << 8))>
 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
                     size_t max_lanes_to_store) {
-  const size_t
+  const size_t num_lanes_to_store =
       HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));

 #if HWY_COMPILER_MSVC
@@ -6181,12 +7271,14 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
   HWY_FENCE;
 #endif

-  BlendedStore(v, FirstN(d,
+  BlendedStore(v, FirstN(d, num_lanes_to_store), d, p);

 #if HWY_COMPILER_MSVC
   // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
   HWY_FENCE;
 #endif
+
+  detail::MaybeUnpoison(p, num_lanes_to_store);
 }

 #if HWY_TARGET > HWY_AVX3
@@ -6214,36 +7306,35 @@ namespace detail {
 template <class D, HWY_IF_T_SIZE_D(D, 1)>
 HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
                                         TFromD<D>* HWY_RESTRICT p,
-                                        size_t
+                                        size_t num_lanes_to_store) {
   // AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector if
-  // (
+  // (num_lanes_to_store & 3) != 0 is true
   const auto v_full128 = ResizeBitCast(Full128<TFromD<D>>(), v_trailing);
-  if ((
+  if ((num_lanes_to_store & 2) != 0) {
     const uint16_t u16_bits = GetLane(BitCast(Full128<uint16_t>(), v_full128));
-    p[
+    p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128);
     CopyBytes<sizeof(uint16_t)>(&u16_bits,
-                                p + (
+                                p + (num_lanes_to_store & ~size_t{3}));
   } else {
-    p[
+    p[num_lanes_to_store - 1] = GetLane(v_full128);
   }
 }

 template <class D, HWY_IF_T_SIZE_D(D, 2)>
 HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD<D> v_trailing, D /*d*/,
-                                        TFromD<D>*
-                                        size_t
+                                        TFromD<D>* p,
+                                        size_t num_lanes_to_store) {
   // AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16
-  // vector if (
-  p[
+  // vector if (num_lanes_to_store & 1) == 1 is true
+  p[num_lanes_to_store - 1] = GetLane(v_trailing);
 }

 }  // namespace detail

 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
           HWY_IF_LANES_GT_D(D, 2)>
-HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>*
-
-  const size_t num_of_lanes_to_store =
+HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* p, size_t max_lanes_to_store) {
+  const size_t num_lanes_to_store =
       HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D));

   const FixedTag<TFromD<D>, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>))>
@@ -6252,7 +7343,7 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
   const Repartition<int32_t, decltype(d_full)> di32_full;

   const auto i32_store_mask = BitCast(
-      di32_full, VecFromMask(du_full, FirstN(du_full,
+      di32_full, VecFromMask(du_full, FirstN(du_full, num_lanes_to_store)));
   const auto vi32 = ResizeBitCast(di32_full, v);

 #if HWY_COMPILER_MSVC
@@ -6265,19 +7356,21 @@ HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,

   constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD<D>);
   constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1;
-  const size_t trailing_n = (
+  const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask);

   if (trailing_n != 0) {
-    const
+    const VFromD<D> v_trailing = ResizeBitCast(
         d, SlideDownLanes(di32_full, vi32,
-
-    detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p,
+                          num_lanes_to_store / kNumOfLanesPerI32));
+    detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store);
   }

 #if HWY_COMPILER_MSVC
   // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore
   HWY_FENCE;
 #endif
+
+  detail::MaybeUnpoison(p, num_lanes_to_store);
 }
 #endif  // HWY_TARGET > HWY_AVX3
 #endif  // HWY_TARGET <= HWY_AVX2
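The StoreN hunks above mainly rename the clamped count to num_lanes_to_store and add a detail::MaybeUnpoison call after the masked stores, apparently so sanitizer builds treat the written bytes as initialized. For orientation, a minimal caller sketch of the public StoreN op; it assumes static dispatch via namespace hn = hwy::HWY_NAMESPACE, and StoreFirstLanes is an illustrative name, not part of the package:

#include <stddef.h>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Stores only the first `count` lanes of a full vector to `to`; elements at
// and beyond `count` are left untouched. `from` must be readable for a full
// vector's worth of floats (e.g. a padded buffer).
void StoreFirstLanes(const float* from, float* to, size_t count) {
  const hn::ScalableTag<float> d;
  const auto v = hn::LoadU(d, from);
  hn::StoreN(v, d, to, count);  // clamps count to Lanes(d) internally
}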
@@ -6300,19 +7393,36 @@ HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {

 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)

-template <class D, HWY_IF_V_SIZE_D(D, 16)>
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
   const RebindToUnsigned<decltype(d)> du;
   const Half<decltype(du)> duh;
   return BitCast(d, VFromD<decltype(du)>{_mm_move_epi64(BitCast(duh, lo).raw)});
 }

-template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
   const Half<D> dh;
   return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
 }

+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  const Half<decltype(du)> duh;
+  return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
+}
+#endif
+
+// Generic for all vector lengths.
+template <class D, HWY_X86_IF_EMULATED_D(D)>
+HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
+  const RebindToUnsigned<decltype(d)> du;
+  const Half<decltype(du)> duh;
+  return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo)));
+}
+
 // ------------------------------ Concat full (InterleaveLower)

 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
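The new overloads above route float16_t and other emulated special-float types through the unsigned implementation via BitCast, so ZeroExtendVector now also accepts them. A minimal usage sketch of the public op on a 128-bit target; WidenWithZeroUpper is an illustrative name and hn is the usual static-dispatch alias:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Widens a half-width vector into a full one whose upper lanes are zero.
hn::Vec128<float> WidenWithZeroUpper(hn::Vec64<float> lo) {
  const hn::Full128<float> d;
  return hn::ZeroExtendVector(d, lo);
}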
@@ -6459,10 +7569,11 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
   // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
   // 0xFFFF8000, which correctly saturates to 0x8000.
+  const RebindToUnsigned<decltype(d)> du;
   const Repartition<int32_t, decltype(d)> dw;
   const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
   const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
-  return VFromD<
+  return BitCast(d, VFromD<decltype(du)>{_mm_packs_epi32(uL.raw, uH.raw)});
 }

 // 16-bit x4
@@ -6565,11 +7676,12 @@ template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
 HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
 #if HWY_TARGET <= HWY_SSE4
   // Isolate lower 16 bits per u32 so we can pack.
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
   const Repartition<uint32_t, decltype(d)> dw;
   const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
   const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
   const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
-  return VFromD<
+  return BitCast(d, VFromD<decltype(du)>{_mm_packus_epi32(uL.raw, uH.raw)});
 #elif HWY_TARGET == HWY_SSE2
   const Repartition<uint32_t, decltype(d)> dw;
   return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))),
@@ -6642,9 +7754,9 @@ HWY_API V DupEven(V v) {

 #if HWY_TARGET <= HWY_SSSE3
   const RebindToUnsigned<decltype(d)> du;
-
-      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-  return TableLookupBytes(v, BitCast(d,
+  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
+      du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+  return TableLookupBytes(v, BitCast(d, shuffle));
 #else
   const Repartition<uint16_t, decltype(d)> du16;
   return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})),
@@ -6656,8 +7768,8 @@ template <typename T, HWY_IF_T_SIZE(T, 2)>
 HWY_API Vec64<T> DupEven(const Vec64<T> v) {
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
-  return BitCast(d, VFromD<decltype(du)>{
-
+  return BitCast(d, VFromD<decltype(du)>{_mm_shufflelo_epi16(
+                        BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))});
 }

 // Generic for all vector lengths.
@@ -6666,9 +7778,9 @@ HWY_API V DupEven(const V v) {
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
 #if HWY_TARGET <= HWY_SSSE3
-
-      0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c
-  return TableLookupBytes(v, BitCast(d,
+  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
+      du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c);
+  return TableLookupBytes(v, BitCast(d, shuffle));
 #else
   return BitCast(
       d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
@@ -6699,9 +7811,9 @@ HWY_API V DupOdd(V v) {

 #if HWY_TARGET <= HWY_SSSE3
   const RebindToUnsigned<decltype(d)> du;
-
-      1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-  return TableLookupBytes(v, BitCast(d,
+  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
+      du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
+  return TableLookupBytes(v, BitCast(d, shuffle));
 #else
   const Repartition<uint16_t, decltype(d)> du16;
   return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})),
@@ -6723,9 +7835,9 @@ HWY_API V DupOdd(V v) {
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du;  // for float16_t
 #if HWY_TARGET <= HWY_SSSE3
-
-      0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e
-  return TableLookupBytes(v, BitCast(d,
+  const VFromD<decltype(du)> shuffle = Dup128VecFromValues(
+      du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e);
+  return TableLookupBytes(v, BitCast(d, shuffle));
 #else
   return BitCast(
       d, VFromD<decltype(du)>{_mm_shufflehi_epi16(
@@ -6952,14 +8064,16 @@ HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {

 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
 HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
-#if HWY_TARGET >= HWY_SSSE3
   const DFromV<decltype(a)> d;
+#if HWY_TARGET >= HWY_SSSE3
   const Repartition<uint8_t, decltype(d)> d8;
   alignas(16) static constexpr uint8_t mask[16] = {
       0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
   return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
 #else
-
+  const RebindToUnsigned<decltype(d)> du;  // for float16_t
+  return BitCast(d, VFromD<decltype(du)>{_mm_blend_epi16(
+                        BitCast(du, a).raw, BitCast(du, b).raw, 0x55)});
 #endif
 }

@@ -7941,11 +9055,31 @@ HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<int16_t, D>> v) {
 #endif
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_INLINE_F16 VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
+#if HWY_HAVE_FLOAT16
+  const RebindToUnsigned<DFromV<decltype(v)>> du16;
+  return VFromD<D>{_mm_cvtph_ps(BitCast(du16, v).raw)};
+#else
   return VFromD<D>{_mm_cvtph_ps(v.raw)};
+#endif
 }

 #endif  // HWY_NATIVE_F16C

+#if HWY_HAVE_FLOAT16
+
+#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+#undef HWY_NATIVE_PROMOTE_F16_TO_F64
+#else
+#define HWY_NATIVE_PROMOTE_F16_TO_F64
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
+  return VFromD<D>{_mm_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
   const Rebind<uint16_t, decltype(df32)> du16;
@@ -7980,6 +9114,42 @@ HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
 }
 #endif

+// ------------------------------ PromoteEvenTo/PromoteOddTo
+
+#if HWY_TARGET > HWY_AVX3
+namespace detail {
+
+// I32->I64 PromoteEvenTo/PromoteOddTo
+
+template <class D, HWY_IF_LANES_D(D, 1)>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D d_to,
+                                   Vec64<int32_t> v) {
+  return PromoteLowerTo(d_to, v);
+}
+
+template <class D, HWY_IF_LANES_D(D, 2)>
+HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
+                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                   hwy::SignedTag /*from_type_tag*/, D d_to,
+                                   Vec128<int32_t> v) {
+  const Repartition<int32_t, D> d_from;
+  return PromoteLowerTo(d_to, ConcatEven(d_from, v, v));
+}
+
+template <class D, class V, HWY_IF_LANES_LE_D(D, 2)>
+HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
+                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
+                                  hwy::SignedTag /*from_type_tag*/, D d_to,
+                                  V v) {
+  const Repartition<int32_t, D> d_from;
+  return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v));
+}
+
+}  // namespace detail
+#endif
+
 // ------------------------------ Demotions (full -> part w/ narrow lanes)

 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
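These detail overloads back the public PromoteEvenTo/PromoteOddTo ops, which Highway's generic_ops-inl.h layer dispatches to via the type and size tags seen above; that wrapper layer is an assumption here, not shown in this diff. A minimal sketch of the public API on a 128-bit target, with SplitEvenOdd as an illustrative name:

#include <stdint.h>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Widens the even-indexed (0, 2) and odd-indexed (1, 3) int32_t lanes of a
// 128-bit vector to int64_t.
void SplitEvenOdd(hn::Vec128<int32_t> v, hn::Vec128<int64_t>* even,
                  hn::Vec128<int64_t>* odd) {
  const hn::Full128<int64_t> d64;
  *even = hn::PromoteEvenTo(d64, v);
  *odd = hn::PromoteOddTo(d64, v);
}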
@@ -8143,14 +9313,31 @@ HWY_DIAGNOSTICS(push)
|
|
|
8143
9313
|
HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
|
|
8144
9314
|
|
|
8145
9315
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
|
|
8146
|
-
HWY_API VFromD<D> DemoteTo(D
|
|
8147
|
-
|
|
9316
|
+
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
|
|
9317
|
+
const RebindToUnsigned<decltype(df16)> du16;
|
|
9318
|
+
return BitCast(
|
|
9319
|
+
df16, VFromD<decltype(du16)>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)});
|
|
8148
9320
|
}
|
|
8149
9321
|
|
|
8150
9322
|
HWY_DIAGNOSTICS(pop)
|
|
8151
9323
|
|
|
8152
9324
|
#endif // F16C
|
|
8153
9325
|
|
|
9326
|
+
#if HWY_HAVE_FLOAT16
|
|
9327
|
+
|
|
9328
|
+
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
9329
|
+
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
9330
|
+
#else
|
|
9331
|
+
#define HWY_NATIVE_DEMOTE_F64_TO_F16
|
|
9332
|
+
#endif
|
|
9333
|
+
|
|
9334
|
+
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
|
|
9335
|
+
HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) {
|
|
9336
|
+
return VFromD<D>{_mm_cvtpd_ph(v.raw)};
|
|
9337
|
+
}
|
|
9338
|
+
|
|
9339
|
+
#endif // HWY_HAVE_FLOAT16
|
|
9340
|
+
|
|
8154
9341
|
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
|
|
8155
9342
|
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
|
|
8156
9343
|
// TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
|
|
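Together with the earlier F16_TO_F64 promotion hunk, this adds direct float16/double conversions when HWY_HAVE_FLOAT16 is 1 (AVX-512 FP16 targets); other targets keep the generic emulation. A minimal round-trip sketch using the portable PromoteTo/DemoteTo API; RoundTripViaF16 is an illustrative name:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Demotes two doubles to float16_t and promotes them back, which loses
// precision but exercises both new code paths.
hn::Vec128<double> RoundTripViaF16(hn::Vec128<double> v) {
  const hn::Full128<double> df64;                          // 2 x double
  const hn::Rebind<hwy::float16_t, decltype(df64)> df16;   // 2 x float16_t
  const auto half = hn::DemoteTo(df16, v);
  return hn::PromoteTo(df64, half);
}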
@@ -8389,7 +9576,7 @@ HWY_API VFromD<D> DemoteTo(D du32, VFromD<Rebind<double, D>> v) {
 #if HWY_TARGET <= HWY_AVX3
   (void)du32;
   return VFromD<D>{
-      _mm_maskz_cvttpd_epu32(
+      _mm_maskz_cvttpd_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
 #else  // AVX2 or earlier
   const Rebind<double, decltype(du32)> df64;
   const RebindToUnsigned<decltype(df64)> du64;
@@ -8512,7 +9699,7 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)>
 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
   return VFromD<D>{
-      _mm_maskz_cvttps_epu64(
+      _mm_maskz_cvttps_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
 }
 #else  // AVX2 or below

@@ -8747,32 +9934,17 @@ HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {

 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)};
 }
 template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int64_t, D>> v) {
-  const
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw);
-#else
-  const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw);
-#endif
+  const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw;
   return VFromD<D>{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)};
 }

@@ -9030,6 +10202,11 @@ HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
   return detail::FixConversionOverflow(
       di, v, VFromD<RebindToSigned<D>>{_mm_cvttph_epi16(v.raw)});
 }
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
+HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToFloat<D>> v) {
+  return VFromD<D>{
+      _mm_maskz_cvttph_epu16(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
+}
 #endif  // HWY_HAVE_FLOAT16

 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
@@ -9048,13 +10225,13 @@ HWY_API VFromD<DI> ConvertTo(DI di, VFromD<RebindToFloat<DI>> v) {
 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U32_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
   return VFromD<DU>{
-      _mm_maskz_cvttps_epu32(
+      _mm_maskz_cvttps_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
 }

 template <class DU, HWY_IF_V_SIZE_LE_D(DU, 16), HWY_IF_U64_D(DU)>
 HWY_API VFromD<DU> ConvertTo(DU /*du*/, VFromD<RebindToFloat<DU>> v) {
   return VFromD<DU>{
-      _mm_maskz_cvttpd_epu64(
+      _mm_maskz_cvttpd_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)};
 }

 #else  // AVX2 or below
@@ -9445,6 +10622,13 @@ HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {

 #if HWY_TARGET <= HWY_AVX3

+// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
+#ifdef HWY_NATIVE_ISINF
+#undef HWY_NATIVE_ISINF
+#else
+#define HWY_NATIVE_ISINF
+#endif
+
 template <size_t N>
 HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
   return Mask128<float, N>{_mm_fpclass_ps_mask(
@@ -9472,35 +10656,6 @@ HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
                               HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
 }

-#else
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // Shift left to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater). MSVC seems to generate
-  // incorrect code if we instead add vu + vu.
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
 #endif  // HWY_TARGET <= HWY_AVX3

 // ================================================== CRYPTO
@@ -9586,10 +10741,9 @@ HWY_INLINE MFromD<D> LoadMaskBits128(D d, uint64_t mask_bits) {
                               1, 1, 1, 1, 1, 1, 1, 1};
   const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
 #endif
-
-
-
-  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
+  const VFromD<decltype(du)> bit = Dup128VecFromValues(
+      du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+  return RebindMask(d, TestBit(rep8, bit));
 }

 template <class D, HWY_IF_T_SIZE_D(D, 2)>
@@ -9644,6 +10798,20 @@ HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
 #endif
 }

+// ------------------------------ Dup128MaskFromMaskBits
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
+HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 8) mask_bits &= (1u << kN) - 1;
+
+#if HWY_TARGET <= HWY_AVX3
+  return MFromD<D>::FromBits(mask_bits);
+#else
+  return detail::LoadMaskBits128(d, mask_bits);
+#endif
+}
+
 template <typename T>
 struct CompressIsPartition {
 #if HWY_TARGET <= HWY_AVX3
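Dup128MaskFromMaskBits is a new op in this release: bit i of mask_bits selects lane i within each 128-bit block, using AVX-512 mask registers when available and the LoadMaskBits128 fallback otherwise. A minimal usage sketch on a 128-bit target; BlendLanes02 is an illustrative name:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Builds a mask selecting lanes 0 and 2 (bits 0b0101) and merges two vectors.
hn::Vec128<float> BlendLanes02(hn::Vec128<float> yes, hn::Vec128<float> no) {
  const hn::Full128<float> d;
  const auto m = hn::Dup128MaskFromMaskBits(d, 0b0101u);
  return hn::IfThenElse(m, yes, no);
}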
@@ -10779,243 +11947,74 @@ HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {

 // ------------------------------ Reductions

-
+// Nothing fully native, generic_ops-inl defines SumOfLanes and ReduceSum.

-  //
-
-
-
-
-
-
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MaxOfLanes(Vec128<T, 1> v) {
-  return v;
-}
+// We provide specializations of u8x8 and u8x16, so exclude those.
+#undef HWY_IF_SUM_OF_LANES_D
+#define HWY_IF_SUM_OF_LANES_D(D)                                        \
+  HWY_IF_LANES_GT_D(D, 1),                                              \
+      hwy::EnableIf<!hwy::IsSame<TFromD<D>, uint8_t>() ||               \
+                    (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \
+          nullptr

-
-
-
-  const DFromV<decltype(v10)> d;
-  return Add(v10, Reverse2(d, v10));
+template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 8)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  return Set(d, static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF));
 }
-template <
-
-const
-
-
-
-HWY_INLINE Vec128<T, 2> MaxOfLanes(Vec128<T, 2> v10) {
-  const DFromV<decltype(v10)> d;
-  return Max(v10, Reverse2(d, v10));
-}
-
-// N=4 (only 16/32-bit, else >128-bit)
-template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
-HWY_INLINE Vec128<T, 4> SumOfLanes(Vec128<T, 4> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Add(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Add(v03_12_12_03, v12_03_03_12);
-}
-template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
-HWY_INLINE Vec128<T, 4> MinOfLanes(Vec128<T, 4> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Min(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Min(v03_12_12_03, v12_03_03_12);
-}
-template <typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
-HWY_INLINE Vec128<T, 4> MaxOfLanes(Vec128<T, 4> v3210) {
-  using V = decltype(v3210);
-  const DFromV<V> d;
-  const V v0123 = Reverse4(d, v3210);
-  const V v03_12_12_03 = Max(v3210, v0123);
-  const V v12_03_03_12 = Reverse2(d, v03_12_12_03);
-  return Max(v03_12_12_03, v12_03_03_12);
+template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_D(D, 16)>
+HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
+  const Repartition<uint64_t, decltype(d)> d64;
+  VFromD<decltype(d64)> sums = SumsOf8(v);
+  sums = SumOfLanes(d64, sums);
+  return Broadcast<0>(BitCast(d, sums));
 }

-#undef HWY_X86_IF_NOT_MINPOS
 #if HWY_TARGET <= HWY_SSE4
-//
-#
-
+// We provide specializations of u8x8, u8x16, and u16x8, so exclude those.
+#undef HWY_IF_MINMAX_OF_LANES_D
+#define HWY_IF_MINMAX_OF_LANES_D(D)                                         \
+  HWY_IF_LANES_GT_D(D, 1),                                                  \
+      hwy::EnableIf<(!hwy::IsSame<TFromD<D>, uint8_t>() ||                  \
+                     ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) &&  \
+                    (!hwy::IsSame<TFromD<D>, uint16_t>() ||                 \
+                     (HWY_V_SIZE_D(D) != 16))>* = nullptr

-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
+HWY_API Vec128<uint16_t> MinOfLanes(D /* tag */, Vec128<uint16_t> v) {
   return Broadcast<0>(Vec128<uint16_t>{_mm_minpos_epu16(v.raw)});
 }

-
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
+HWY_API Vec128<uint16_t> MaxOfLanes(D d, Vec128<uint16_t> v) {
   const Vec128<uint16_t> max = Set(d, LimitsMax<uint16_t>());
-  return max - MinOfLanes(max - v);
-}
-#else
-#define HWY_X86_IF_NOT_MINPOS(T) hwy::EnableIf<true>* = nullptr
-#endif  // HWY_TARGET <= HWY_SSE4
-
-// N=8 (only 16-bit, else >128-bit)
-template <typename T, HWY_IF_T_SIZE(T, 2)>
-HWY_INLINE Vec128<T, 8> SumOfLanes(Vec128<T, 8> v76543210) {
-  using V = decltype(v76543210);
-  const DFromV<V> d;
-  // The upper half is reversed from the lower half; omit for brevity.
-  const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210));
-  const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07));
-  return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
-}
-template <typename T, HWY_IF_T_SIZE(T, 2), HWY_X86_IF_NOT_MINPOS(T)>
-HWY_INLINE Vec128<T, 8> MinOfLanes(Vec128<T, 8> v76543210) {
-  using V = decltype(v76543210);
-  const DFromV<V> d;
-  // The upper half is reversed from the lower half; omit for brevity.
-  const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210));
-  const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07));
-  return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
-}
-template <typename T, HWY_IF_T_SIZE(T, 2), HWY_X86_IF_NOT_MINPOS(T)>
-HWY_INLINE Vec128<T, 8> MaxOfLanes(Vec128<T, 8> v76543210) {
-  using V = decltype(v76543210);
-  const DFromV<V> d;
-  // The upper half is reversed from the lower half; omit for brevity.
-  const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210));
-  const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07));
-  return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
-}
-
-template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
-HWY_INLINE T ReduceSum(Vec128<T, N> v) {
-  return GetLane(SumOfLanes(v));
-}
-
-// u8, N=8, N=16:
-HWY_INLINE uint8_t ReduceSum(Vec64<uint8_t> v) {
-  return static_cast<uint8_t>(GetLane(SumsOf8(v)) & 0xFF);
-}
-HWY_INLINE Vec64<uint8_t> SumOfLanes(Vec64<uint8_t> v) {
-  const Full64<uint8_t> d;
-  return Set(d, ReduceSum(v));
-}
-HWY_INLINE uint8_t ReduceSum(Vec128<uint8_t> v) {
-  uint64_t sums = ReduceSum(SumsOf8(v));
-  return static_cast<uint8_t>(sums & 0xFF);
-}
-HWY_INLINE Vec128<uint8_t> SumOfLanes(Vec128<uint8_t> v) {
-  const DFromV<decltype(v)> d;
-  return Set(d, ReduceSum(v));
-}
-template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
-HWY_INLINE int8_t ReduceSum(const Vec128<int8_t, N> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const auto is_neg = v < Zero(d);
-
-  // Sum positive and negative lanes separately, then combine to get the result.
-  const auto positive = SumsOf8(BitCast(du, IfThenZeroElse(is_neg, v)));
-  const auto negative = SumsOf8(BitCast(du, IfThenElseZero(is_neg, Abs(v))));
-  return static_cast<int8_t>(ReduceSum(positive - negative) & 0xFF);
-}
-template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
-HWY_INLINE Vec128<int8_t, N> SumOfLanes(const Vec128<int8_t, N> v) {
-  const DFromV<decltype(v)> d;
-  return Set(d, ReduceSum(v));
+  return max - MinOfLanes(d, max - v);
 }

-
-
-  const DFromV<decltype(v)> d;
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
+HWY_API Vec64<uint8_t> MinOfLanes(D d, Vec64<uint8_t> v) {
   const Rebind<uint16_t, decltype(d)> d16;
-  return TruncateTo(d, MinOfLanes(PromoteTo(d16, v)));
+  return TruncateTo(d, MinOfLanes(d16, PromoteTo(d16, v)));
 }
-
-
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
+HWY_API Vec128<uint8_t> MinOfLanes(D d, Vec128<uint8_t> v) {
+  const Half<decltype(d)> dh;
   Vec64<uint8_t> result =
-      Min(MinOfLanes(UpperHalf(
-  return Combine(
+      Min(MinOfLanes(dh, UpperHalf(dh, v)), MinOfLanes(dh, LowerHalf(dh, v)));
+  return Combine(d, result, result);
 }

-
-
-
-
-HWY_INLINE Vec128<uint8_t> MaxOfLanes(Vec128<uint8_t> v) {
-  const Vec128<uint8_t> m(Set(DFromV<decltype(v)>(), LimitsMax<uint8_t>()));
-  return m - MinOfLanes(m - v);
-}
-#elif HWY_TARGET >= HWY_SSSE3
-template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
-HWY_API Vec128<uint8_t, N> MaxOfLanes(Vec128<uint8_t, N> v) {
-  const DFromV<decltype(v)> d;
-  const RepartitionToWide<decltype(d)> d16;
-  const RepartitionToWide<decltype(d16)> d32;
-  Vec128<uint8_t, N> vm = Max(v, Reverse2(d, v));
-  vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
-  vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
-  if (N > 8) {
-    const RepartitionToWide<decltype(d32)> d64;
-    vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
-  }
-  return vm;
-}
-
-template <size_t N, HWY_IF_V_SIZE_GT(uint8_t, N, 4)>
-HWY_API Vec128<uint8_t, N> MinOfLanes(Vec128<uint8_t, N> v) {
-  const DFromV<decltype(v)> d;
-  const RepartitionToWide<decltype(d)> d16;
-  const RepartitionToWide<decltype(d16)> d32;
-  Vec128<uint8_t, N> vm = Min(v, Reverse2(d, v));
-  vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm))));
-  vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm))));
-  if (N > 8) {
-    const RepartitionToWide<decltype(d32)> d64;
-    vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm))));
-  }
-  return vm;
-}
-#endif
-
-// Implement min/max of i8 in terms of u8 by toggling the sign bit.
-template <size_t N, HWY_IF_V_SIZE_GT(int8_t, N, 4)>
-HWY_INLINE Vec128<int8_t, N> MinOfLanes(Vec128<int8_t, N> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const auto mask = SignBit(du);
-  const auto vu = Xor(BitCast(du, v), mask);
-  return BitCast(d, Xor(MinOfLanes(vu), mask));
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
+HWY_API Vec64<uint8_t> MaxOfLanes(D d, Vec64<uint8_t> v) {
+  const Vec64<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
+  return m - MinOfLanes(d, m - v);
 }
-template <
-
-const
-
-  const auto mask = SignBit(du);
-  const auto vu = Xor(BitCast(du, v), mask);
-  return BitCast(d, Xor(MaxOfLanes(vu), mask));
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
+HWY_API Vec128<uint8_t> MaxOfLanes(D d, Vec128<uint8_t> v) {
+  const Vec128<uint8_t> m(Set(d, LimitsMax<uint8_t>()));
+  return m - MinOfLanes(d, m - v);
 }

-
-
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::SumOfLanes(v);
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v) {
-  return detail::ReduceSum(v);
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::MinOfLanes(v);
-}
-template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
-  return detail::MaxOfLanes(v);
-}
+#endif  // HWY_TARGET <= HWY_SSE4

 // ------------------------------ Lt128

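This hunk removes the per-width SumOfLanes/MinOfLanes/MaxOfLanes ladder and keeps only the u8 and u16 specializations, deferring the rest to generic_ops-inl.h via the HWY_IF_SUM_OF_LANES_D and HWY_IF_MINMAX_OF_LANES_D exclusion macros. The public API is unchanged, as in this minimal sketch; SumBytes is an illustrative name and the input is assumed to be at least one full vector:

#include <stdint.h>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Sums 16 bytes; the SumsOf8-based specializations above handle u8 vectors.
uint8_t SumBytes(const uint8_t* bytes /* >= 16 readable */) {
  const hn::Full128<uint8_t> d;
  const auto v = hn::LoadU(d, bytes);
  return hn::ReduceSum(d, v);  // scalar sum, modulo 256 for uint8_t
}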
@@ -11168,6 +12167,8 @@ HWY_API V LeadingZeroCount(V v) {
 }  // namespace hwy
 HWY_AFTER_NAMESPACE();

+#undef HWY_X86_IF_EMULATED_D
+
 // Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
 // the warning seems to be issued at the call site of intrinsics, i.e. our code.
 HWY_DIAGNOSTICS(pop)