@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +8 -8
@@ -16,8 +16,21 @@
 // RISC-V V vectors (length not known at compile time).
 // External include guard in highway.h - see comment there.

+#pragma push_macro("__riscv_v_elen")
+
+// Workaround that ensures that all of the __riscv_vsetvl_* and
+// __riscv_vsetvlmax_* macros in riscv_vector.h are defined when compiling with
+// Clang 20 with dynamic dispatch and a baseline target of SCALAR or EMU128
+#if HWY_COMPILER_CLANG >= 2000 && HWY_COMPILER_CLANG < 2100 && \
+    (!defined(__riscv_v_elen) || __riscv_v_elen < 64)
+#undef __riscv_v_elen
+#define __riscv_v_elen 64
+#endif
+
 #include <riscv_vector.h>

+#pragma pop_macro("__riscv_v_elen")
+
 #include "hwy/ops/shared-inl.h"

 HWY_BEFORE_NAMESPACE();
@@ -127,6 +140,26 @@ namespace detail { // for code folding
   X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
   X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

+#define HWY_RVV_FOREACH_08_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
+
+#define HWY_RVV_FOREACH_16_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
+  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
+  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
+
+#define HWY_RVV_FOREACH_32_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
+  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
+  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
+
+#define HWY_RVV_FOREACH_64_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
+  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
+  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
+
 // LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
 #define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
   X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
@@ -275,6 +308,35 @@ namespace detail { // for code folding
   HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
   HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

+// GET/SET + VIRT
+#define HWY_RVV_FOREACH_08_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)
+
+#define HWY_RVV_FOREACH_16_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
+  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)
+
+#define HWY_RVV_FOREACH_32_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)
+
+#define HWY_RVV_FOREACH_64_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
+
+// For the smallest LMUL for each SEW, similar to the LowerHalf operator, we
+// provide the Get and Set operator that returns the same vector type.
+#define HWY_RVV_FOREACH_08_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP)
+
+#define HWY_RVV_FOREACH_16_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP)
+
+#define HWY_RVV_FOREACH_32_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP)
+
+#define HWY_RVV_FOREACH_64_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
+  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP)
+
 // EXT + VIRT
 #define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
   HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
@@ -341,9 +403,13 @@ namespace detail { // for code folding
   HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
 // Only BF16 is emulated.
 #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+#define HWY_RVV_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
 #else
 #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
 #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+#define HWY_RVV_IF_NOT_EMULATED_D(D) HWY_IF_NOT_SPECIAL_FLOAT_D(D)
 #endif
 #define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
   HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
@@ -1114,6 +1180,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)

 // ------------------------------ AverageRound

+#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
+#undef HWY_NATIVE_AVERAGE_ROUND_UI32
+#else
+#define HWY_NATIVE_AVERAGE_ROUND_UI32
+#endif
+
+#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
+#undef HWY_NATIVE_AVERAGE_ROUND_UI64
+#else
+#define HWY_NATIVE_AVERAGE_ROUND_UI64
+#endif
+
 // Define this to opt-out of the default behavior, which is AVOID on certain
 // compiler versions. You can define only this to use VXRM, or define both this
 // and HWY_RVV_AVOID_VXRM to always avoid VXRM.
@@ -1123,9 +1201,9 @@ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
 #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
 #define HWY_RVV_AVOID_VXRM
 // Clang 16 with __riscv_v_intrinsic == 11000 may either require VXRM or avoid.
-// Assume earlier
+// Assume that Clang 16 and earlier avoid VXRM.
 #elif HWY_COMPILER_CLANG && \
-    (HWY_COMPILER_CLANG <
+    (HWY_COMPILER_CLANG < 1700 || __riscv_v_intrinsic < 11000)
 #define HWY_RVV_AVOID_VXRM
 #endif

@@ -1153,8 +1231,8 @@ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
       a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
   }

-
-
+HWY_RVV_FOREACH_I(HWY_RVV_RETV_AVERAGE, AverageRound, aadd, _ALL)
+HWY_RVV_FOREACH_U(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL)

 #undef HWY_RVV_RETV_AVERAGE

@@ -1183,6 +1261,35 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)

 #undef HWY_RVV_SHIFT

+// ------------------------------ RoundingShiftRight[Same]
+
+#ifdef HWY_NATIVE_ROUNDING_SHR
+#undef HWY_NATIVE_ROUNDING_SHR
+#else
+#define HWY_NATIVE_ROUNDING_SHR
+#endif
+
+// Intrinsics do not define .vi forms, so use .vx instead.
+#define HWY_RVV_ROUNDING_SHR(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                             SHIFT, MLEN, NAME, OP) \
+  template <int kBits> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL( \
+        v, kBits, \
+        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
+  } \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
+    return __riscv_v##OP##_vx_##CHAR##SEW##LMUL( \
+        v, static_cast<uint8_t>(bits), \
+        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
+  }
+
+HWY_RVV_FOREACH_U(HWY_RVV_ROUNDING_SHR, RoundingShiftRight, ssrl, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_ROUNDING_SHR, RoundingShiftRight, ssra, _ALL)
+
+#undef HWY_RVV_ROUNDING_SHR
+
 // ------------------------------ SumsOf8 (ShiftRight, Add)
 template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
 HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
@@ -1276,6 +1383,33 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)
 #undef HWY_RVV_SHIFT_II
 #undef HWY_RVV_SHIFT_VV

+// ------------------------------ RoundingShr
+#define HWY_RVV_ROUNDING_SHR_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+                                LMULH, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
+        v, bits, \
+        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
+  }
+
+HWY_RVV_FOREACH_U(HWY_RVV_ROUNDING_SHR_VV, RoundingShr, ssrl, _ALL)
+
+#define HWY_RVV_ROUNDING_SHR_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+                                LMULH, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
+    const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du; \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
+        v, BitCast(du, bits), \
+        HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
+  }
+
+HWY_RVV_FOREACH_I(HWY_RVV_ROUNDING_SHR_II, RoundingShr, ssra, _ALL)
+
+#undef HWY_RVV_ROUNDING_SHR_VV
+#undef HWY_RVV_ROUNDING_SHR_II
+
 // ------------------------------ Min

 namespace detail {
@@ -1450,6 +1584,20 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)

 // ================================================== COMPARE

+// ------------------------------ MClear
+
+// mask = f()
+#define HWY_RVV_RETM(SEW, SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_M(MLEN) NAME##MLEN() { \
+    return __riscv_vm##OP##_m_b##MLEN(HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
+namespace detail {
+HWY_RVV_FOREACH_B(HWY_RVV_RETM, MClear, clr)  // with ##MLEN suffix
+}  // namespace detail
+
+#undef HWY_RVV_RETM
+
 // Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
 // vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
 // of all bits; SEW=8 / LMUL=4 = half of all bits.
@@ -1463,6 +1611,16 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
         a, b, HWY_RVV_AVL(SEW, SHIFT)); \
   }

+// mask = f(mask, vector, vector)
+#define HWY_RVV_RETM_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                            SHIFT, MLEN, NAME, OP) \
+  HWY_API HWY_RVV_M(MLEN) \
+  NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) a, \
+       HWY_RVV_V(BASE, SEW, LMUL) b) { \
+    return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN##_mu( \
+        m, detail::MClear##MLEN(), a, b, HWY_RVV_AVL(SEW, SHIFT)); \
+  }
+
 // mask = f(vector, scalar)
 #define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP) \
@@ -1472,9 +1630,17 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
         a, b, HWY_RVV_AVL(SEW, SHIFT)); \
   }

+#ifdef HWY_NATIVE_MASKED_COMP
+#undef HWY_NATIVE_MASKED_COMP
+#else
+#define HWY_NATIVE_MASKED_COMP
+#endif
+
 // ------------------------------ Eq
 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGMVV, MaskedEq, mseq, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedEq, mfeq, _ALL)

 namespace detail {
 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
@@ -1484,6 +1650,8 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
 // ------------------------------ Ne
 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
+HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGMVV, MaskedNe, msne, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedNe, mfne, _ALL)

 namespace detail {
 HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
@@ -1494,6 +1662,9 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
+HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGMVV, MaskedLt, msltu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGMVV, MaskedLt, mslt, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedLt, mflt, _ALL)

 namespace detail {
 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
@@ -1505,20 +1676,43 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
 HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Le, msleu, _ALL)
 HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Le, msle, _ALL)
 HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
+HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGMVV, MaskedLe, msleu, _ALL)
+HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGMVV, MaskedLe, msle, _ALL)
+HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedLe, mfle, _ALL)
+
+template <class D>
+using MFromD = decltype(Eq(Zero(D()), Zero(D())));

+template <class V, class M, class D = DFromV<V>>
+HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
+  return MaskedNe(m, v, v);
+}
+
+#undef HWY_RVV_RETM_ARGMVV
 #undef HWY_RVV_RETM_ARGVV
 #undef HWY_RVV_RETM_ARGVS

-// ------------------------------ Gt/Ge
+// ------------------------------ Gt/Ge (Lt, Le)
+
+// Swap args to reverse comparisons:
+template <class V>
+HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
+  return Lt(b, a);
+}

 template <class V>
 HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
   return Le(b, a);
 }

-template <class V
-HWY_API
-return
+template <class V, class M, class D = DFromV<V>>
+HWY_API MFromD<D> MaskedGt(M m, V a, V b) {
+  return MaskedLt(m, b, a);
+}
+
+template <class V, class M, class D = DFromV<V>>
+HWY_API MFromD<D> MaskedGe(M m, V a, V b) {
+  return MaskedLe(m, b, a);
 }

 // ------------------------------ TestBit
@@ -1592,10 +1786,6 @@ HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
 #undef HWY_RVV_IF_THEN_ZERO_ELSE

 // ------------------------------ MaskFromVec
-
-template <class D>
-using MFromD = decltype(Eq(Zero(D()), Zero(D())));
-
 template <class V>
 HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
   return detail::NeS(v, 0);
@@ -2963,6 +3153,32 @@ HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
 HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
 #undef HWY_RVV_NEAREST

+template <size_t N>
+HWY_API vint32mf2_t DemoteToNearestInt(Simd<int32_t, N, -2> d,
+                                       const vfloat64m1_t v) {
+  return __riscv_vfncvt_x_f_w_i32mf2(v, Lanes(d));
+}
+template <size_t N>
+HWY_API vint32mf2_t DemoteToNearestInt(Simd<int32_t, N, -1> d,
+                                       const vfloat64m1_t v) {
+  return __riscv_vfncvt_x_f_w_i32mf2(v, Lanes(d));
+}
+template <size_t N>
+HWY_API vint32m1_t DemoteToNearestInt(Simd<int32_t, N, 0> d,
+                                      const vfloat64m2_t v) {
+  return __riscv_vfncvt_x_f_w_i32m1(v, Lanes(d));
+}
+template <size_t N>
+HWY_API vint32m2_t DemoteToNearestInt(Simd<int32_t, N, 1> d,
+                                      const vfloat64m4_t v) {
+  return __riscv_vfncvt_x_f_w_i32m2(v, Lanes(d));
+}
+template <size_t N>
+HWY_API vint32m4_t DemoteToNearestInt(Simd<int32_t, N, 2> d,
+                                      const vfloat64m8_t v) {
+  return __riscv_vfncvt_x_f_w_i32m4(v, Lanes(d));
+}
+
 // ================================================== COMBINE

 namespace detail {
@@ -3025,6 +3241,151 @@ HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL)
 #undef HWY_RVV_SLIDE_UP
 #undef HWY_RVV_SLIDE_DOWN

+#define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                    MLEN, NAME, OP) \
+  template <size_t kIndex> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \
+        v, kIndex); /* no AVL */ \
+  }
+#define HWY_RVV_GET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                         SHIFT, MLEN, NAME, OP) \
+  template <size_t kIndex> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \
+    HWY_IF_CONSTEXPR(kIndex == 0) { return Trunc(v); } \
+    HWY_IF_CONSTEXPR(kIndex != 0) { \
+      return Trunc(SlideDown( \
+          v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \
+                             SHIFT - 1){}))); \
+    } \
+  }
+#define HWY_RVV_GET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                             SHIFT, MLEN, NAME, OP) \
+  template <size_t kIndex> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \
+    HWY_IF_CONSTEXPR(kIndex == 0) { return v; } \
+    HWY_IF_CONSTEXPR(kIndex != 0) { \
+      return SlideDown( \
+          v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \
+                             SHIFT){}) / \
+                 2); \
+    } \
+  }
+HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _GET_SET)
+HWY_RVV_FOREACH(HWY_RVV_GET_VIRT, Get, get, _GET_SET_VIRT)
+HWY_RVV_FOREACH(HWY_RVV_GET_SMALLEST, Get, get, _GET_SET_SMALLEST)
+#undef HWY_RVV_GET
+#undef HWY_RVV_GET_VIRT
+#undef HWY_RVV_GET_SMALLEST
+
+template <size_t kIndex, class D>
+static HWY_INLINE HWY_MAYBE_UNUSED VFromD<AdjustSimdTagToMinVecPow2<Half<D>>>
+Get(D d, VFromD<D> v) {
+  static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1");
+  HWY_IF_CONSTEXPR(kIndex == 0 || detail::IsFull(d)) { return Get<kIndex>(v); }
+  HWY_IF_CONSTEXPR(kIndex != 0 && !detail::IsFull(d)) {
+    const AdjustSimdTagToMinVecPow2<Half<decltype(d)>> dh;
+    const size_t slide_down_amt =
+        (dh.Pow2() < DFromV<decltype(v)>().Pow2()) ? Lanes(dh) : (Lanes(d) / 2);
+    return ResizeBitCast(dh, SlideDown(v, slide_down_amt));
+  }
+}
+
+#define HWY_RVV_PARTIAL_VEC_SET_HALF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+                                     LMULH, SHIFT, MLEN, NAME, OP) \
+  template <size_t kIndex> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v, \
+       size_t half_N) { \
+    static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \
+    const DFromV<decltype(dest)> d; \
+    HWY_IF_CONSTEXPR(kIndex == 0) { \
+      return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, Ext(d, v), \
+                                                        half_N); \
+    } \
+    HWY_IF_CONSTEXPR(kIndex != 0) { return SlideUp(dest, Ext(d, v), half_N); } \
+  }
+#define HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST( \
+    BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
+  template <size_t kIndex> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v, \
+       size_t half_N) { \
+    static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \
+    HWY_IF_CONSTEXPR(kIndex == 0) { \
+      return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, v, half_N); \
+    } \
+    HWY_IF_CONSTEXPR(kIndex != 0) { return SlideUp(dest, v, half_N); } \
+  }
+HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv, _GET_SET)
+HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv,
+                _GET_SET_VIRT)
+HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST, PartialVecSetHalf, mv,
+                _GET_SET_SMALLEST)
+#undef HWY_RVV_PARTIAL_VEC_SET_HALF
+#undef HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST
+
+#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                    MLEN, NAME, OP) \
+  template <size_t kIndex, size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
+       HWY_RVV_V(BASE, SEW, LMULH) v) { \
+    HWY_IF_CONSTEXPR(detail::IsFull(d)) { \
+      return __riscv_v##OP##_v_##CHAR##SEW##LMULH##_##CHAR##SEW##LMUL( \
+          dest, kIndex, v); /* no AVL */ \
+    } \
+    HWY_IF_CONSTEXPR(!detail::IsFull(d)) { \
+      const Half<decltype(d)> dh; \
+      return PartialVecSetHalf<kIndex>(dest, v, Lanes(dh)); \
+    } \
+  }
+#define HWY_RVV_SET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                         SHIFT, MLEN, NAME, OP) \
+  template <size_t kIndex, size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
+       HWY_RVV_V(BASE, SEW, LMULH) v) { \
+    const Half<decltype(d)> dh; \
+    return PartialVecSetHalf<kIndex>(dest, v, Lanes(dh)); \
+  }
+#define HWY_RVV_SET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+                             SHIFT, MLEN, NAME, OP) \
+  template <size_t kIndex, size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
+       HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return PartialVecSetHalf<kIndex>(dest, v, Lanes(d) / 2); \
+  }
+#define HWY_RVV_SET_SMALLEST_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+                                  LMULH, SHIFT, MLEN, NAME, OP) \
+  template <size_t kIndex, size_t N> \
+  HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+  NAME(HWY_RVV_D(BASE, SEW, N, SHIFT - 1) d, \
+       HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v) { \
+    return PartialVecSetHalf<kIndex>(dest, v, Lanes(d) / 2); \
+  }
+HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _GET_SET)
+HWY_RVV_FOREACH(HWY_RVV_SET_VIRT, Set, set, _GET_SET_VIRT)
+HWY_RVV_FOREACH(HWY_RVV_SET_SMALLEST, Set, set, _GET_SET_SMALLEST)
+HWY_RVV_FOREACH_UI163264(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST)
+HWY_RVV_FOREACH_F(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST)
+#undef HWY_RVV_SET
+#undef HWY_RVV_SET_VIRT
+#undef HWY_RVV_SET_SMALLEST
+#undef HWY_RVV_SET_SMALLEST_VIRT
+
+template <size_t kIndex, class D, HWY_RVV_IF_EMULATED_D(D)>
+static HWY_INLINE HWY_MAYBE_UNUSED VFromD<D> Set(
+    D d, VFromD<D> dest, VFromD<AdjustSimdTagToMinVecPow2<Half<D>>> v) {
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, Set<kIndex>(du, BitCast(du, dest),
+                     BitCast(RebindToUnsigned<DFromV<decltype(v)>>(), v)));
+}
+
 }  // namespace detail

 // ------------------------------ SlideUpLanes
@@ -3047,39 +3408,36 @@ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
 // ------------------------------ ConcatUpperLower
 template <class D, class V>
 HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
-  const
-
-  return detail::SlideUp(lo, hi_down, half);
+  const auto lo_lower = detail::Get<0>(d, lo);
+  return detail::Set<0>(d, hi, lo_lower);
 }

 // ------------------------------ ConcatLowerLower
 template <class D, class V>
 HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
-
+  const auto hi_lower = detail::Get<0>(d, hi);
+  return detail::Set<1>(d, lo, hi_lower);
 }

 // ------------------------------ ConcatUpperUpper
 template <class D, class V>
 HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
-  const
-
-  const V lo_down = detail::SlideDown(lo, half);
-  return detail::SlideUp(lo_down, hi_down, half);
+  const auto lo_upper = detail::Get<1>(d, lo);
+  return detail::Set<0>(d, hi, lo_upper);
 }

 // ------------------------------ ConcatLowerUpper
 template <class D, class V>
 HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
-  const
-  const
-  return detail::
+  const auto lo_upper = detail::Get<1>(d, lo);
+  const auto hi_lower = detail::Get<0>(d, hi);
+  return detail::Set<1>(d, ResizeBitCast(d, lo_upper), hi_lower);
 }

 // ------------------------------ Combine
 template <class D2, class V>
 HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
-  return detail::
-      Lanes(d2) / 2);
+  return detail::Set<1>(d2, ResizeBitCast(d2, lo), hi);
 }

 // ------------------------------ ZeroExtendVector
@@ -3126,8 +3484,9 @@ HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
 }

 template <class DH>
-HWY_API VFromD<DH> UpperHalf(const DH d2
-
+HWY_API VFromD<DH> UpperHalf(const DH /*d2*/, const VFromD<Twice<DH>> v) {
+  const Twice<DH> d;
+  return detail::Get<1>(d, v);
 }

 // ================================================== SWIZZLE
@@ -3309,6 +3668,24 @@ HWY_API V SwapAdjacentBlocks(const V v) {
   return OddEvenBlocks(up, down);
 }

+// ------------------------------ InterleaveEvenBlocks
+// (SlideUpLanes, OddEvenBlocks)
+
+template <class D, class V = VFromD<D>>
+HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
+  const size_t lpb = detail::LanesPerBlock(d);
+  return OddEvenBlocks(SlideUpLanes(d, b, lpb), a);
+}
+
+// ------------------------------ InterleaveOddBlocks
+// (SlideDownLanes, OddEvenBlocks)
+
+template <class D, class V = VFromD<D>>
+HWY_API V InterleaveOddBlocks(D d, V a, V b) {
+  const size_t lpb = detail::LanesPerBlock(d);
+  return OddEvenBlocks(b, SlideDownLanes(d, a, lpb));
+}
+
 // ------------------------------ TableLookupLanes

 template <class D, class VI>
@@ -4457,6 +4834,8 @@ HWY_API T ReduceMax(D d, const VFromD<D> v) {

 #undef HWY_RVV_REDUCE

+// TODO: add MaskedReduceSum/Min/Max
+
 // ------------------------------ SumOfLanes

 template <class D, HWY_IF_LANES_GT_D(D, 1)>
@@ -4687,7 +5066,7 @@ HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT)

 #else  // !HWY_HAVE_TUPLE

-template <class D, typename T = TFromD<D
+template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1) {
   const VFromD<D> A = LoadU(d, unaligned);  // v1[1] v0[1] v1[0] v0[0]
@@ -4710,7 +5089,7 @@ HWY_RVV_FOREACH(HWY_RVV_LOAD_STRIDED, LoadStrided, lse, _ALL_VIRT)
 #undef HWY_RVV_LOAD_STRIDED
 }  // namespace detail

-template <class D, typename T = TFromD<D
+template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
 HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
   // Offsets are bytes, and this is not documented.
@@ -4719,7 +5098,7 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
   v2 = detail::LoadStrided(d, unaligned + 2, 3 * sizeof(T));
 }

-template <class D, typename T = TFromD<D
+template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
 HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                               VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                               VFromD<D>& v3) {
@@ -4732,7 +5111,7 @@ HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,

 // Not 64-bit / max LMUL: interleave via promote, slide, OddEven.
 template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
-          HWY_IF_POW2_LE_D(D, 2)>
+          HWY_IF_POW2_LE_D(D, 2), HWY_RVV_IF_NOT_EMULATED_D(D)>
 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   const RebindToUnsigned<D> du;
@@ -4747,7 +5126,7 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,

 // Can promote, max LMUL: two half-length
 template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
-          HWY_IF_POW2_GT_D(D, 2)>
+          HWY_IF_POW2_GT_D(D, 2), HWY_RVV_IF_NOT_EMULATED_D(D)>
 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   const Half<decltype(d)> dh;
@@ -4771,7 +5150,8 @@ HWY_RVV_FOREACH(HWY_RVV_STORE_STRIDED, StoreStrided, sse, _ALL_VIRT)
 }  // namespace detail

 // 64-bit: strided
-template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8)
+template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8),
+          HWY_RVV_IF_NOT_EMULATED_D(D)>
 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                                T* HWY_RESTRICT unaligned) {
   // Offsets are bytes, and this is not documented.
@@ -4779,7 +5159,7 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
   detail::StoreStrided(v1, d, unaligned + 1, 2 * sizeof(T));
 }

-template <class D, typename T = TFromD<D
+template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                                T* HWY_RESTRICT unaligned) {
   // Offsets are bytes, and this is not documented.
@@ -4788,7 +5168,7 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
   detail::StoreStrided(v2, d, unaligned + 2, 3 * sizeof(T));
 }

-template <class D, typename T = TFromD<D
+template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                                VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
   // Offsets are bytes, and this is not documented.
@@ -4800,6 +5180,9 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,

 #endif  // HWY_HAVE_TUPLE

+// Rely on generic Load/StoreInterleaved[234] for any emulated types.
+// Requires HWY_GENERIC_IF_EMULATED_D mirrors HWY_RVV_IF_EMULATED_D.
+
 // ------------------------------ Dup128VecFromValues (ResizeBitCast)

 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
@@ -5176,6 +5559,12 @@ template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
 constexpr unsigned MaxMaskBits() {
   return ~0u;
 }
+
+template <class D>
+constexpr int SufficientPow2ForMask() {
+  return HWY_MAX(
+      D().Pow2() - 3 - static_cast<int>(FloorLog2(sizeof(TFromD<D>))), -3);
+}
 }  // namespace detail

 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
@@ -5202,11 +5591,13 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
-  const ScalableTag<uint8_t> du8;
-  const ScalableTag<uint16_t> du16;
+  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
+  const ScalableTag<uint16_t, detail::SufficientPow2ForMask<D>()> du16;
   // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
   return detail::U8MaskBitsVecToMask(
-      d,
+      d, detail::ChangeLMUL(
+             ScalableTag<uint8_t>(),
+             BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)))));
 #else
   // Slow fallback for completeness; the above bits to mask cast is preferred.
   const RebindToUnsigned<decltype(d)> du8;
@@ -5233,10 +5624,11 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();

 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
-  const ScalableTag<uint8_t> du8;
+  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
   // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
-  return detail::U8MaskBitsVecToMask(
-
+  return detail::U8MaskBitsVecToMask(
+      d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
+                            Set(du8, static_cast<uint8_t>(mask_bits))));
 #else
   // Slow fallback for completeness; the above bits to mask cast is preferred.
   const RebindToUnsigned<D> du;
@@ -5252,9 +5644,10 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();

 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
-  const ScalableTag<uint8_t> du8;
+  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
   return detail::U8MaskBitsVecToMask(
-      d,
+      d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
+                            Set(du8, static_cast<uint8_t>(mask_bits * 0x11))));
 #else
   // Slow fallback for completeness; the above bits to mask cast is preferred.
   const RebindToUnsigned<D> du;
@@ -5269,9 +5662,10 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();

 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
-  const ScalableTag<uint8_t> du8;
+  const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
   return detail::U8MaskBitsVecToMask(
-      d,
+      d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
+                            Set(du8, static_cast<uint8_t>(mask_bits * 0x55))));
 #else
   // Slow fallback for completeness; the above bits to mask cast is preferred.
   const RebindToUnsigned<D> du;
@@ -5553,9 +5947,13 @@ HWY_API V64 BitShuffle(V64 values, VI idx) {
 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
           class D = DFromV<V>, class DW = RepartitionToWide<D>>
 HWY_API VFromD<DW> MulEven(const V a, const V b) {
-
-
-
+  constexpr int maskVal = sizeof(TFromD<D>) == 4 ? 5
+                          : sizeof(TFromD<D>) == 2 ? 0x55
+                                                   : 0x5555;
+  const auto mask = Dup128MaskFromMaskBits(D(), maskVal);
+  const auto hi = Slide1Up(D(), MulHigh(a, b));
+  const auto res = MaskedMulOr(hi, mask, a, b);
+  return BitCast(DW(), res);
 }

 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
@@ -5569,9 +5967,9 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
 // There is no 64x64 vwmul.
 template <class V, HWY_IF_T_SIZE_V(V, 8)>
 HWY_INLINE V MulEven(const V a, const V b) {
-  const auto
-  const auto hi = MulHigh(a, b);
-  return
+  const auto mask = Dup128MaskFromMaskBits(DFromV<V>(), 1);
+  const auto hi = Slide1Up(DFromV<V>(), MulHigh(a, b));
+  return MaskedMulOr(hi, mask, a, b);
 }

 template <class V, HWY_IF_T_SIZE_V(V, 8)>
@@ -5915,6 +6313,23 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
 #endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

 // ------------------------------ Lt128Upper
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+template <class D>
+HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+  auto du8mf8 = ScalableTag<uint8_t, -3>{};
+  const vuint8mf8_t ltHL =
+      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
+  const vuint8mf8_t ltHx = detail::AndS(ltHL, 0xaa);
+  const vuint8mf8_t ltxL = ShiftRight<1>(ltHx);
+  auto du8m1 = ScalableTag<uint8_t>{};
+  return detail::U8MaskBitsVecToMask(d,
+                                     detail::ChangeLMUL(du8m1, Or(ltHx, ltxL)));
+}
+
+#else
+
 template <class D>
 HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
   static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5926,7 +6341,27 @@ HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
   return MaskFromVec(OddEven(ltHL, down));
 }

+#endif  // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
 // ------------------------------ Eq128
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+template <class D>
+HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
+  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+  auto du8mf8 = ScalableTag<uint8_t, -3>{};
+  const vuint8mf8_t eqHL =
+      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
+  const vuint8mf8_t eqxH = ShiftRight<1>(eqHL);
+  const vuint8mf8_t result0L = detail::AndS(And(eqHL, eqxH), 0x55);
+  const vuint8mf8_t resultH0 = Add(result0L, result0L);
+  auto du8m1 = ScalableTag<uint8_t>{};
+  return detail::U8MaskBitsVecToMask(
+      d, detail::ChangeLMUL(du8m1, Or(result0L, resultH0)));
+}
+
+#else
+
 template <class D>
 HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
   static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5938,7 +6373,26 @@ HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
   return MaskFromVec(eq);
 }

+#endif
+
 // ------------------------------ Eq128Upper
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+template <class D>
+HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+  auto du8mf8 = ScalableTag<uint8_t, -3>{};
+  const vuint8mf8_t eqHL =
+      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
+  const vuint8mf8_t eqHx = detail::AndS(eqHL, 0xaa);
+  const vuint8mf8_t eqxL = ShiftRight<1>(eqHx);
+  auto du8m1 = ScalableTag<uint8_t>{};
+  return detail::U8MaskBitsVecToMask(d,
+                                     detail::ChangeLMUL(du8m1, Or(eqHx, eqxL)));
+}
+
+#else
+
 template <class D>
 HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
   static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5947,7 +6401,27 @@ HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
   return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
 }

+#endif
+
 // ------------------------------ Ne128
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+template <class D>
+HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
+  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+  auto du8mf8 = ScalableTag<uint8_t, -3>{};
+  const vuint8mf8_t neHL =
+      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Ne(a, b)));
+  const vuint8mf8_t nexH = ShiftRight<1>(neHL);
+  const vuint8mf8_t result0L = detail::AndS(Or(neHL, nexH), 0x55);
+  const vuint8mf8_t resultH0 = Add(result0L, result0L);
+  auto du8m1 = ScalableTag<uint8_t>{};
+  return detail::U8MaskBitsVecToMask(
+      d, detail::ChangeLMUL(du8m1, Or(result0L, resultH0)));
+}
+
+#else
+
 template <class D>
 HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
   static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5958,7 +6432,26 @@ HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
   return MaskFromVec(Or(neHL, neLH));
 }

+#endif
+
 // ------------------------------ Ne128Upper
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+template <class D>
+HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+  auto du8mf8 = ScalableTag<uint8_t, -3>{};
+  const vuint8mf8_t neHL =
+      detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Ne(a, b)));
+  const vuint8mf8_t neHx = detail::AndS(neHL, 0xaa);
+  const vuint8mf8_t nexL = ShiftRight<1>(neHx);
+  auto du8m1 = ScalableTag<uint8_t>{};
+  return detail::U8MaskBitsVecToMask(d,
+                                     detail::ChangeLMUL(du8m1, Or(neHx, nexL)));
+}
+
+#else
+
 template <class D>
 HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
   static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5970,6 +6463,8 @@ HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
   return MaskFromVec(OddEven(neHL, down));
 }

+#endif
+
 // ------------------------------ Min128, Max128 (Lt128)

 template <class D>