@img/sharp-libvips-dev 1.2.1 → 1.2.2-rc.2

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (115)
  1. package/include/aom/aom_decoder.h +1 -1
  2. package/include/aom/aom_encoder.h +2 -0
  3. package/include/aom/aomcx.h +106 -25
  4. package/include/ffi.h +3 -3
  5. package/include/freetype2/freetype/config/ftconfig.h +1 -1
  6. package/include/freetype2/freetype/config/ftheader.h +1 -1
  7. package/include/freetype2/freetype/config/ftoption.h +37 -12
  8. package/include/freetype2/freetype/config/ftstdlib.h +1 -1
  9. package/include/freetype2/freetype/config/integer-types.h +29 -2
  10. package/include/freetype2/freetype/config/mac-support.h +1 -1
  11. package/include/freetype2/freetype/config/public-macros.h +3 -3
  12. package/include/freetype2/freetype/freetype.h +51 -47
  13. package/include/freetype2/freetype/ftadvanc.h +1 -1
  14. package/include/freetype2/freetype/ftbbox.h +1 -1
  15. package/include/freetype2/freetype/ftbdf.h +1 -1
  16. package/include/freetype2/freetype/ftbitmap.h +1 -1
  17. package/include/freetype2/freetype/ftbzip2.h +1 -1
  18. package/include/freetype2/freetype/ftcache.h +1 -1
  19. package/include/freetype2/freetype/ftcid.h +1 -1
  20. package/include/freetype2/freetype/ftcolor.h +13 -4
  21. package/include/freetype2/freetype/ftdriver.h +3 -3
  22. package/include/freetype2/freetype/fterrdef.h +1 -1
  23. package/include/freetype2/freetype/fterrors.h +1 -1
  24. package/include/freetype2/freetype/ftfntfmt.h +1 -1
  25. package/include/freetype2/freetype/ftgasp.h +1 -1
  26. package/include/freetype2/freetype/ftglyph.h +1 -1
  27. package/include/freetype2/freetype/ftgxval.h +1 -1
  28. package/include/freetype2/freetype/ftgzip.h +1 -1
  29. package/include/freetype2/freetype/ftimage.h +6 -2
  30. package/include/freetype2/freetype/ftincrem.h +1 -1
  31. package/include/freetype2/freetype/ftlcdfil.h +1 -1
  32. package/include/freetype2/freetype/ftlist.h +1 -1
  33. package/include/freetype2/freetype/ftlogging.h +184 -0
  34. package/include/freetype2/freetype/ftlzw.h +1 -1
  35. package/include/freetype2/freetype/ftmac.h +1 -1
  36. package/include/freetype2/freetype/ftmm.h +159 -103
  37. package/include/freetype2/freetype/ftmodapi.h +1 -1
  38. package/include/freetype2/freetype/ftmoderr.h +1 -1
  39. package/include/freetype2/freetype/ftotval.h +1 -1
  40. package/include/freetype2/freetype/ftoutln.h +1 -1
  41. package/include/freetype2/freetype/ftparams.h +1 -1
  42. package/include/freetype2/freetype/ftpfr.h +1 -1
  43. package/include/freetype2/freetype/ftrender.h +1 -1
  44. package/include/freetype2/freetype/ftsizes.h +1 -1
  45. package/include/freetype2/freetype/ftsnames.h +1 -1
  46. package/include/freetype2/freetype/ftstroke.h +1 -1
  47. package/include/freetype2/freetype/ftsynth.h +1 -1
  48. package/include/freetype2/freetype/ftsystem.h +1 -1
  49. package/include/freetype2/freetype/fttrigon.h +1 -1
  50. package/include/freetype2/freetype/fttypes.h +1 -1
  51. package/include/freetype2/freetype/ftwinfnt.h +2 -3
  52. package/include/freetype2/freetype/otsvg.h +1 -1
  53. package/include/freetype2/freetype/t1tables.h +1 -1
  54. package/include/freetype2/freetype/ttnameid.h +129 -129
  55. package/include/freetype2/freetype/tttables.h +8 -5
  56. package/include/freetype2/freetype/tttags.h +1 -1
  57. package/include/freetype2/ft2build.h +1 -1
  58. package/include/glib-2.0/gio/gdbuserror.h +9 -8
  59. package/include/glib-2.0/gio/ginetaddress.h +12 -0
  60. package/include/glib-2.0/gio/gioenums.h +9 -2
  61. package/include/glib-2.0/glib/gstring.h +2 -2
  62. package/include/glib-2.0/glib/gunicode.h +1 -1
  63. package/include/glib-2.0/gobject/glib-types.h +1 -1
  64. package/include/glib-2.0/gobject/gparam.h +1 -1
  65. package/include/glib-2.0/gobject/gvalue.h +78 -35
  66. package/include/harfbuzz/hb-script-list.h +12 -0
  67. package/include/harfbuzz/hb-version.h +3 -3
  68. package/include/hwy/abort.h +2 -19
  69. package/include/hwy/aligned_allocator.h +11 -7
  70. package/include/hwy/auto_tune.h +504 -0
  71. package/include/hwy/base.h +425 -104
  72. package/include/hwy/cache_control.h +16 -0
  73. package/include/hwy/detect_compiler_arch.h +32 -1
  74. package/include/hwy/detect_targets.h +251 -67
  75. package/include/hwy/foreach_target.h +35 -0
  76. package/include/hwy/highway.h +185 -76
  77. package/include/hwy/nanobenchmark.h +1 -19
  78. package/include/hwy/ops/arm_neon-inl.h +969 -458
  79. package/include/hwy/ops/arm_sve-inl.h +1137 -359
  80. package/include/hwy/ops/emu128-inl.h +97 -11
  81. package/include/hwy/ops/generic_ops-inl.h +1222 -34
  82. package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
  83. package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
  84. package/include/hwy/ops/ppc_vsx-inl.h +306 -126
  85. package/include/hwy/ops/rvv-inl.h +546 -51
  86. package/include/hwy/ops/scalar-inl.h +77 -22
  87. package/include/hwy/ops/set_macros-inl.h +138 -17
  88. package/include/hwy/ops/shared-inl.h +50 -10
  89. package/include/hwy/ops/wasm_128-inl.h +137 -92
  90. package/include/hwy/ops/x86_128-inl.h +773 -214
  91. package/include/hwy/ops/x86_256-inl.h +712 -255
  92. package/include/hwy/ops/x86_512-inl.h +429 -753
  93. package/include/hwy/ops/x86_avx3-inl.h +501 -0
  94. package/include/hwy/per_target.h +2 -1
  95. package/include/hwy/profiler.h +622 -486
  96. package/include/hwy/targets.h +62 -20
  97. package/include/hwy/timer-inl.h +8 -160
  98. package/include/hwy/timer.h +170 -3
  99. package/include/hwy/x86_cpuid.h +81 -0
  100. package/include/libheif/heif_cxx.h +25 -5
  101. package/include/libheif/heif_regions.h +5 -5
  102. package/include/libheif/heif_version.h +2 -2
  103. package/include/librsvg-2.0/librsvg/rsvg-version.h +3 -3
  104. package/include/libxml2/libxml/valid.h +0 -3
  105. package/include/libxml2/libxml/xmlerror.h +1 -1
  106. package/include/libxml2/libxml/xmlversion.h +4 -4
  107. package/include/pango-1.0/pango/pango-enum-types.h +3 -0
  108. package/include/pango-1.0/pango/pango-features.h +3 -3
  109. package/include/pango-1.0/pango/pango-font.h +30 -0
  110. package/include/pango-1.0/pango/pango-version-macros.h +26 -0
  111. package/include/vips/connection.h +4 -4
  112. package/include/vips/version.h +4 -4
  113. package/include/zlib.h +3 -3
  114. package/package.json +1 -1
  115. package/versions.json +13 -13
@@ -16,8 +16,21 @@
  // RISC-V V vectors (length not known at compile time).
  // External include guard in highway.h - see comment there.

+ #pragma push_macro("__riscv_v_elen")
+
+ // Workaround that ensures that all of the __riscv_vsetvl_* and
+ // __riscv_vsetvlmax_* macros in riscv_vector.h are defined when compiling with
+ // Clang 20 with dynamic dispatch and a baseline target of SCALAR or EMU128
+ #if HWY_COMPILER_CLANG >= 2000 && HWY_COMPILER_CLANG < 2100 && \
+ (!defined(__riscv_v_elen) || __riscv_v_elen < 64)
+ #undef __riscv_v_elen
+ #define __riscv_v_elen 64
+ #endif
+
  #include <riscv_vector.h>

+ #pragma pop_macro("__riscv_v_elen")
+
  #include "hwy/ops/shared-inl.h"

  HWY_BEFORE_NAMESPACE();
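The push_macro/pop_macro pair above confines the forced __riscv_v_elen value to the riscv_vector.h include, so the rest of the translation unit keeps the compiler's original definition. A minimal sketch of the same pattern, with placeholder macro and header names:

    // Editorial sketch; SOME_FEATURE_MACRO and vendor_header.h are hypothetical.
    #pragma push_macro("SOME_FEATURE_MACRO")  // save the current definition, if any
    #undef SOME_FEATURE_MACRO
    #define SOME_FEATURE_MACRO 1              // the value the header expects to see
    #include "vendor_header.h"
    #pragma pop_macro("SOME_FEATURE_MACRO")   // restore the saved definition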
@@ -127,6 +140,26 @@ namespace detail {  // for code folding
  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)

+ #define HWY_RVV_FOREACH_08_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
+
+ #define HWY_RVV_FOREACH_16_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
+
+ #define HWY_RVV_FOREACH_32_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
+ X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
+ X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
+
+ #define HWY_RVV_FOREACH_64_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
+ X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
+ X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
+
  // LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
  #define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
@@ -275,6 +308,35 @@ namespace detail {  // for code folding
  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)

+ // GET/SET + VIRT
+ #define HWY_RVV_FOREACH_08_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)
+
+ #define HWY_RVV_FOREACH_16_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)
+
+ #define HWY_RVV_FOREACH_32_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)
+
+ #define HWY_RVV_FOREACH_64_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
+
+ // For the smallest LMUL for each SEW, similar to the LowerHalf operator, we
+ // provide the Get and Set operator that returns the same vector type.
+ #define HWY_RVV_FOREACH_08_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP)
+
+ #define HWY_RVV_FOREACH_16_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP)
+
+ #define HWY_RVV_FOREACH_32_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP)
+
+ #define HWY_RVV_FOREACH_64_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \
+ X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP)
+
  // EXT + VIRT
  #define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
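In the X-macro tables above, the /*MLEN=*/ column is the RVV mask-type parameter: masks are vbool<MLEN>_t with MLEN = SEW / LMUL. A small editorial check of that relationship (function name is ours, not Highway's):

    // MLEN = SEW / LMUL; fractional LMUL mf2 is num=1, den=2.
    constexpr int Mlen(int sew, int lmul_num, int lmul_den = 1) {
      return sew * lmul_den / lmul_num;
    }
    static_assert(Mlen(8, 2) == 4, "SEW=8, m2 -> vbool4_t");
    static_assert(Mlen(16, 1, 2) == 32, "SEW=16, mf2 -> vbool32_t");
    static_assert(Mlen(64, 1) == 64, "SEW=64, m1 -> vbool64_t");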
@@ -341,9 +403,13 @@ namespace detail {  // for code folding
  HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
  // Only BF16 is emulated.
  #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+ #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
+ #define HWY_RVV_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
  #else
  #define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
  #define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+ #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
+ #define HWY_RVV_IF_NOT_EMULATED_D(D) HWY_IF_NOT_SPECIAL_FLOAT_D(D)
  #endif
  #define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
@@ -1114,6 +1180,18 @@ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)

  // ------------------------------ AverageRound

+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI32
+ #else
+ #define HWY_NATIVE_AVERAGE_ROUND_UI32
+ #endif
+
+ #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
+ #undef HWY_NATIVE_AVERAGE_ROUND_UI64
+ #else
+ #define HWY_NATIVE_AVERAGE_ROUND_UI64
+ #endif
+
  // Define this to opt-out of the default behavior, which is AVOID on certain
  // compiler versions. You can define only this to use VXRM, or define both this
  // and HWY_RVV_AVOID_VXRM to always avoid VXRM.
@@ -1123,9 +1201,9 @@ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
  #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
  #define HWY_RVV_AVOID_VXRM
  // Clang 16 with __riscv_v_intrinsic == 11000 may either require VXRM or avoid.
- // Assume earlier versions avoid.
+ // Assume that Clang 16 and earlier avoid VXRM.
  #elif HWY_COMPILER_CLANG && \
- (HWY_COMPILER_CLANG < 1600 || __riscv_v_intrinsic < 11000)
+ (HWY_COMPILER_CLANG < 1700 || __riscv_v_intrinsic < 11000)
  #define HWY_RVV_AVOID_VXRM
  #endif

@@ -1153,8 +1231,8 @@ HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
  a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
  }

- HWY_RVV_FOREACH_U08(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL)
- HWY_RVV_FOREACH_U16(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETV_AVERAGE, AverageRound, aadd, _ALL)
+ HWY_RVV_FOREACH_U(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL)

  #undef HWY_RVV_RETV_AVERAGE
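The vaadd/vaaddu forms registered above use the fixed-point rounding mode __RISCV_VXRM_RNU (round to nearest, ties up). Assuming those semantics, a scalar reference for one unsigned lane:

    #include <stdint.h>
    // AverageRound(a, b) = (a + b + 1) >> 1, computed without overflow.
    static inline uint32_t AverageRoundU32(uint32_t a, uint32_t b) {
      return (uint32_t)(((uint64_t)a + b + 1) >> 1);
    }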

@@ -1183,6 +1261,35 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)

  #undef HWY_RVV_SHIFT

+ // ------------------------------ RoundingShiftRight[Same]
+
+ #ifdef HWY_NATIVE_ROUNDING_SHR
+ #undef HWY_NATIVE_ROUNDING_SHR
+ #else
+ #define HWY_NATIVE_ROUNDING_SHR
+ #endif
+
+ // Intrinsics do not define .vi forms, so use .vx instead.
+ #define HWY_RVV_ROUNDING_SHR(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ template <int kBits> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ return __riscv_v##OP##_vx_##CHAR##SEW##LMUL( \
+ v, kBits, \
+ HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
+ } \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
+ return __riscv_v##OP##_vx_##CHAR##SEW##LMUL( \
+ v, static_cast<uint8_t>(bits), \
+ HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
+ }
+
+ HWY_RVV_FOREACH_U(HWY_RVV_ROUNDING_SHR, RoundingShiftRight, ssrl, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_ROUNDING_SHR, RoundingShiftRight, ssra, _ALL)
+
+ #undef HWY_RVV_ROUNDING_SHR
+
  // ------------------------------ SumsOf8 (ShiftRight, Add)
  template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
  HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
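Assuming the same VXRM=RNU behavior as above, RoundingShiftRight<kBits> shifts right and rounds to nearest by adding the most significant bit that was shifted out. A scalar reference for one unsigned lane:

    #include <stdint.h>
    // Add the highest dropped bit; for n == 0 nothing is dropped.
    static inline uint32_t RoundingShrU32(uint32_t v, int n) {
      return (n == 0) ? v : (v >> n) + ((v >> (n - 1)) & 1u);
    }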
@@ -1276,6 +1383,33 @@ HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)
  #undef HWY_RVV_SHIFT_II
  #undef HWY_RVV_SHIFT_VV

+ // ------------------------------ RoundingShr
+ #define HWY_RVV_ROUNDING_SHR_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+ LMULH, SHIFT, MLEN, NAME, OP) \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
+ return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
+ v, bits, \
+ HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
+ }
+
+ HWY_RVV_FOREACH_U(HWY_RVV_ROUNDING_SHR_VV, RoundingShr, ssrl, _ALL)
+
+ #define HWY_RVV_ROUNDING_SHR_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+ LMULH, SHIFT, MLEN, NAME, OP) \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
+ const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du; \
+ return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
+ v, BitCast(du, bits), \
+ HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
+ }
+
+ HWY_RVV_FOREACH_I(HWY_RVV_ROUNDING_SHR_II, RoundingShr, ssra, _ALL)
+
+ #undef HWY_RVV_ROUNDING_SHR_VV
+ #undef HWY_RVV_ROUNDING_SHR_II
+
  // ------------------------------ Min

  namespace detail {
@@ -1450,6 +1584,20 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)

  // ================================================== COMPARE

+ // ------------------------------ MClear
+
+ // mask = f()
+ #define HWY_RVV_RETM(SEW, SHIFT, MLEN, NAME, OP) \
+ HWY_API HWY_RVV_M(MLEN) NAME##MLEN() { \
+ return __riscv_vm##OP##_m_b##MLEN(HWY_RVV_AVL(SEW, SHIFT)); \
+ }
+
+ namespace detail {
+ HWY_RVV_FOREACH_B(HWY_RVV_RETM, MClear, clr) // with ##MLEN suffix
+ } // namespace detail
+
+ #undef HWY_RVV_RETM
+
  // Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
  // vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
  // of all bits; SEW=8 / LMUL=4 = half of all bits.
@@ -1463,6 +1611,16 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
  a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

+ // mask = f(mask, vector, vector)
+ #define HWY_RVV_RETM_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ HWY_API HWY_RVV_M(MLEN) \
+ NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) a, \
+ HWY_RVV_V(BASE, SEW, LMUL) b) { \
+ return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN##_mu( \
+ m, detail::MClear##MLEN(), a, b, HWY_RVV_AVL(SEW, SHIFT)); \
+ }
+
  // mask = f(vector, scalar)
  #define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
  SHIFT, MLEN, NAME, OP) \
@@ -1472,9 +1630,17 @@ HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
  a, b, HWY_RVV_AVL(SEW, SHIFT)); \
  }

+ #ifdef HWY_NATIVE_MASKED_COMP
+ #undef HWY_NATIVE_MASKED_COMP
+ #else
+ #define HWY_NATIVE_MASKED_COMP
+ #endif
+
  // ------------------------------ Eq
  HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
  HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
+ HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGMVV, MaskedEq, mseq, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedEq, mfeq, _ALL)

  namespace detail {
  HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
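The _mu (masked, undisturbed) comparison intrinsics used by HWY_RVV_RETM_ARGMVV take a "maskedoff" operand that supplies the inactive lanes; passing the all-false mask from MClear therefore zeroes them. Assuming that reading, the per-lane behavior is:

    // MaskedEq(m, a, b)[i] is true only where the governing mask is true.
    static inline bool MaskedEqLane(bool m, int a, int b) {
      return m && (a == b);
    }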
@@ -1484,6 +1650,8 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
  // ------------------------------ Ne
  HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
  HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
+ HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGMVV, MaskedNe, msne, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedNe, mfne, _ALL)

  namespace detail {
  HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
@@ -1494,6 +1662,9 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
  HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
  HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
  HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
+ HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGMVV, MaskedLt, msltu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGMVV, MaskedLt, mslt, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedLt, mflt, _ALL)

  namespace detail {
  HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
@@ -1505,20 +1676,43 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
  HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Le, msleu, _ALL)
  HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Le, msle, _ALL)
  HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
+ HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGMVV, MaskedLe, msleu, _ALL)
+ HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGMVV, MaskedLe, msle, _ALL)
+ HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGMVV, MaskedLe, mfle, _ALL)
+
+ template <class D>
+ using MFromD = decltype(Eq(Zero(D()), Zero(D())));

+ template <class V, class M, class D = DFromV<V>>
+ HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
+ return MaskedNe(m, v, v);
+ }
+
+ #undef HWY_RVV_RETM_ARGMVV
  #undef HWY_RVV_RETM_ARGVV
  #undef HWY_RVV_RETM_ARGVS

- // ------------------------------ Gt/Ge
+ // ------------------------------ Gt/Ge (Lt, Le)
+
+ // Swap args to reverse comparisons:
+ template <class V>
+ HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
+ return Lt(b, a);
+ }

  template <class V>
  HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
  return Le(b, a);
  }

- template <class V>
- HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
- return Lt(b, a);
+ template <class V, class M, class D = DFromV<V>>
+ HWY_API MFromD<D> MaskedGt(M m, V a, V b) {
+ return MaskedLt(m, b, a);
+ }
+
+ template <class V, class M, class D = DFromV<V>>
+ HWY_API MFromD<D> MaskedGe(M m, V a, V b) {
+ return MaskedLe(m, b, a);
  }

  // ------------------------------ TestBit
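MaskedIsNaN above relies on the IEEE-754 property that NaN is the only value not equal to itself, and Gt/MaskedGt are derived purely by swapping the operands of Lt/MaskedLt. A scalar analogue of both, under those assumptions:

    // NaN check without <cmath>: v != v holds only for NaN.
    static inline bool MaskedIsNaNLane(bool m, double v) { return m && (v != v); }
    // a > b is evaluated as b < a, so only "less than" needs native support.
    static inline bool GtLane(double a, double b) { return b < a; }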
@@ -1592,10 +1786,6 @@ HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
  #undef HWY_RVV_IF_THEN_ZERO_ELSE

  // ------------------------------ MaskFromVec
-
- template <class D>
- using MFromD = decltype(Eq(Zero(D()), Zero(D())));
-
  template <class V>
  HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
  return detail::NeS(v, 0);
@@ -2963,6 +3153,32 @@ HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
  HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
  #undef HWY_RVV_NEAREST

+ template <size_t N>
+ HWY_API vint32mf2_t DemoteToNearestInt(Simd<int32_t, N, -2> d,
+ const vfloat64m1_t v) {
+ return __riscv_vfncvt_x_f_w_i32mf2(v, Lanes(d));
+ }
+ template <size_t N>
+ HWY_API vint32mf2_t DemoteToNearestInt(Simd<int32_t, N, -1> d,
+ const vfloat64m1_t v) {
+ return __riscv_vfncvt_x_f_w_i32mf2(v, Lanes(d));
+ }
+ template <size_t N>
+ HWY_API vint32m1_t DemoteToNearestInt(Simd<int32_t, N, 0> d,
+ const vfloat64m2_t v) {
+ return __riscv_vfncvt_x_f_w_i32m1(v, Lanes(d));
+ }
+ template <size_t N>
+ HWY_API vint32m2_t DemoteToNearestInt(Simd<int32_t, N, 1> d,
+ const vfloat64m4_t v) {
+ return __riscv_vfncvt_x_f_w_i32m2(v, Lanes(d));
+ }
+ template <size_t N>
+ HWY_API vint32m4_t DemoteToNearestInt(Simd<int32_t, N, 2> d,
+ const vfloat64m8_t v) {
+ return __riscv_vfncvt_x_f_w_i32m4(v, Lanes(d));
+ }
+
  // ================================================== COMBINE

  namespace detail {
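DemoteToNearestInt above narrows f64 lanes to i32 in a single vfncvt, which rounds with the current dynamic rounding mode (round-to-nearest-even by default) instead of truncating. A per-lane scalar reference, assuming that default mode:

    #include <stdint.h>
    #include <cmath>
    // Like one lane of __riscv_vfncvt_x_f_w_i32*: round, then narrow.
    static inline int32_t DemoteToNearestIntLane(double v) {
      return (int32_t)std::lrint(v);  // honors the current FP rounding mode
    }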
@@ -3025,6 +3241,151 @@ HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL)
  #undef HWY_RVV_SLIDE_UP
  #undef HWY_RVV_SLIDE_DOWN

+ #define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+ MLEN, NAME, OP) \
+ template <size_t kIndex> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \
+ v, kIndex); /* no AVL */ \
+ }
+ #define HWY_RVV_GET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ template <size_t kIndex> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \
+ HWY_IF_CONSTEXPR(kIndex == 0) { return Trunc(v); } \
+ HWY_IF_CONSTEXPR(kIndex != 0) { \
+ return Trunc(SlideDown( \
+ v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \
+ SHIFT - 1){}))); \
+ } \
+ }
+ #define HWY_RVV_GET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ template <size_t kIndex> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \
+ HWY_IF_CONSTEXPR(kIndex == 0) { return v; } \
+ HWY_IF_CONSTEXPR(kIndex != 0) { \
+ return SlideDown( \
+ v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \
+ SHIFT){}) / \
+ 2); \
+ } \
+ }
+ HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _GET_SET)
+ HWY_RVV_FOREACH(HWY_RVV_GET_VIRT, Get, get, _GET_SET_VIRT)
+ HWY_RVV_FOREACH(HWY_RVV_GET_SMALLEST, Get, get, _GET_SET_SMALLEST)
+ #undef HWY_RVV_GET
+ #undef HWY_RVV_GET_VIRT
+ #undef HWY_RVV_GET_SMALLEST
+
+ template <size_t kIndex, class D>
+ static HWY_INLINE HWY_MAYBE_UNUSED VFromD<AdjustSimdTagToMinVecPow2<Half<D>>>
+ Get(D d, VFromD<D> v) {
+ static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1");
+ HWY_IF_CONSTEXPR(kIndex == 0 || detail::IsFull(d)) { return Get<kIndex>(v); }
+ HWY_IF_CONSTEXPR(kIndex != 0 && !detail::IsFull(d)) {
+ const AdjustSimdTagToMinVecPow2<Half<decltype(d)>> dh;
+ const size_t slide_down_amt =
+ (dh.Pow2() < DFromV<decltype(v)>().Pow2()) ? Lanes(dh) : (Lanes(d) / 2);
+ return ResizeBitCast(dh, SlideDown(v, slide_down_amt));
+ }
+ }
+
+ #define HWY_RVV_PARTIAL_VEC_SET_HALF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+ LMULH, SHIFT, MLEN, NAME, OP) \
+ template <size_t kIndex> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v, \
+ size_t half_N) { \
+ static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \
+ const DFromV<decltype(dest)> d; \
+ HWY_IF_CONSTEXPR(kIndex == 0) { \
+ return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, Ext(d, v), \
+ half_N); \
+ } \
+ HWY_IF_CONSTEXPR(kIndex != 0) { return SlideUp(dest, Ext(d, v), half_N); } \
+ }
+ #define HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST( \
+ BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP) \
+ template <size_t kIndex> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v, \
+ size_t half_N) { \
+ static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \
+ HWY_IF_CONSTEXPR(kIndex == 0) { \
+ return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, v, half_N); \
+ } \
+ HWY_IF_CONSTEXPR(kIndex != 0) { return SlideUp(dest, v, half_N); } \
+ }
+ HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv, _GET_SET)
+ HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv,
+ _GET_SET_VIRT)
+ HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST, PartialVecSetHalf, mv,
+ _GET_SET_SMALLEST)
+ #undef HWY_RVV_PARTIAL_VEC_SET_HALF
+ #undef HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST
+
+ #define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+ MLEN, NAME, OP) \
+ template <size_t kIndex, size_t N> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
+ HWY_RVV_V(BASE, SEW, LMULH) v) { \
+ HWY_IF_CONSTEXPR(detail::IsFull(d)) { \
+ return __riscv_v##OP##_v_##CHAR##SEW##LMULH##_##CHAR##SEW##LMUL( \
+ dest, kIndex, v); /* no AVL */ \
+ } \
+ HWY_IF_CONSTEXPR(!detail::IsFull(d)) { \
+ const Half<decltype(d)> dh; \
+ return PartialVecSetHalf<kIndex>(dest, v, Lanes(dh)); \
+ } \
+ }
+ #define HWY_RVV_SET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ template <size_t kIndex, size_t N> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
+ HWY_RVV_V(BASE, SEW, LMULH) v) { \
+ const Half<decltype(d)> dh; \
+ return PartialVecSetHalf<kIndex>(dest, v, Lanes(dh)); \
+ }
+ #define HWY_RVV_SET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ template <size_t kIndex, size_t N> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \
+ HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ return PartialVecSetHalf<kIndex>(dest, v, Lanes(d) / 2); \
+ }
+ #define HWY_RVV_SET_SMALLEST_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
+ LMULH, SHIFT, MLEN, NAME, OP) \
+ template <size_t kIndex, size_t N> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
+ NAME(HWY_RVV_D(BASE, SEW, N, SHIFT - 1) d, \
+ HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v) { \
+ return PartialVecSetHalf<kIndex>(dest, v, Lanes(d) / 2); \
+ }
+ HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _GET_SET)
+ HWY_RVV_FOREACH(HWY_RVV_SET_VIRT, Set, set, _GET_SET_VIRT)
+ HWY_RVV_FOREACH(HWY_RVV_SET_SMALLEST, Set, set, _GET_SET_SMALLEST)
+ HWY_RVV_FOREACH_UI163264(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST)
+ HWY_RVV_FOREACH_F(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST)
+ #undef HWY_RVV_SET
+ #undef HWY_RVV_SET_VIRT
+ #undef HWY_RVV_SET_SMALLEST
+ #undef HWY_RVV_SET_SMALLEST_VIRT
+
+ template <size_t kIndex, class D, HWY_RVV_IF_EMULATED_D(D)>
+ static HWY_INLINE HWY_MAYBE_UNUSED VFromD<D> Set(
+ D d, VFromD<D> dest, VFromD<AdjustSimdTagToMinVecPow2<Half<D>>> v) {
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(
+ d, Set<kIndex>(du, BitCast(du, dest),
+ BitCast(RebindToUnsigned<DFromV<decltype(v)>>(), v)));
+ }
+
  } // namespace detail

  // ------------------------------ SlideUpLanes
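detail::Get<kIndex>/Set<kIndex> treat a register group as two halves: for full groups they map directly onto the __riscv_vget/__riscv_vset register-group intrinsics (which need no AVL), and otherwise fall back to slides. A toy model of the indexing convention, assuming that reading:

    #include <array>
    #include <cstddef>
    // View an N-lane vector as [half 0 | half 1]; Get<1> returns the upper half.
    template <std::size_t N>
    std::array<int, N / 2> Get1(const std::array<int, N>& v) {
      std::array<int, N / 2> hi{};
      for (std::size_t i = 0; i < N / 2; ++i) hi[i] = v[N / 2 + i];
      return hi;
    }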
@@ -3047,39 +3408,36 @@ HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  // ------------------------------ ConcatUpperLower
  template <class D, class V>
  HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
- const size_t half = Lanes(d) / 2;
- const V hi_down = detail::SlideDown(hi, half);
- return detail::SlideUp(lo, hi_down, half);
+ const auto lo_lower = detail::Get<0>(d, lo);
+ return detail::Set<0>(d, hi, lo_lower);
  }

  // ------------------------------ ConcatLowerLower
  template <class D, class V>
  HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
- return detail::SlideUp(lo, hi, Lanes(d) / 2);
+ const auto hi_lower = detail::Get<0>(d, hi);
+ return detail::Set<1>(d, lo, hi_lower);
  }

  // ------------------------------ ConcatUpperUpper
  template <class D, class V>
  HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
- const size_t half = Lanes(d) / 2;
- const V hi_down = detail::SlideDown(hi, half);
- const V lo_down = detail::SlideDown(lo, half);
- return detail::SlideUp(lo_down, hi_down, half);
+ const auto lo_upper = detail::Get<1>(d, lo);
+ return detail::Set<0>(d, hi, lo_upper);
  }

  // ------------------------------ ConcatLowerUpper
  template <class D, class V>
  HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
- const size_t half = Lanes(d) / 2;
- const V lo_down = detail::SlideDown(lo, half);
- return detail::SlideUp(lo_down, hi, half);
+ const auto lo_upper = detail::Get<1>(d, lo);
+ const auto hi_lower = detail::Get<0>(d, hi);
+ return detail::Set<1>(d, ResizeBitCast(d, lo_upper), hi_lower);
  }

  // ------------------------------ Combine
  template <class D2, class V>
  HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
- return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi),
- Lanes(d2) / 2);
+ return detail::Set<1>(d2, ResizeBitCast(d2, lo), hi);
  }

  // ------------------------------ ZeroExtendVector
@@ -3126,8 +3484,9 @@ HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
  }

  template <class DH>
- HWY_API VFromD<DH> UpperHalf(const DH d2, const VFromD<Twice<DH>> v) {
- return LowerHalf(d2, detail::SlideDown(v, Lanes(d2)));
+ HWY_API VFromD<DH> UpperHalf(const DH /*d2*/, const VFromD<Twice<DH>> v) {
+ const Twice<DH> d;
+ return detail::Get<1>(d, v);
  }

  // ================================================== SWIZZLE
@@ -3309,6 +3668,24 @@ HWY_API V SwapAdjacentBlocks(const V v) {
  return OddEvenBlocks(up, down);
  }

+ // ------------------------------ InterleaveEvenBlocks
+ // (SlideUpLanes, OddEvenBlocks)
+
+ template <class D, class V = VFromD<D>>
+ HWY_API V InterleaveEvenBlocks(D d, V a, V b) {
+ const size_t lpb = detail::LanesPerBlock(d);
+ return OddEvenBlocks(SlideUpLanes(d, b, lpb), a);
+ }
+
+ // ------------------------------ InterleaveOddBlocks
+ // (SlideDownLanes, OddEvenBlocks)
+
+ template <class D, class V = VFromD<D>>
+ HWY_API V InterleaveOddBlocks(D d, V a, V b) {
+ const size_t lpb = detail::LanesPerBlock(d);
+ return OddEvenBlocks(b, SlideDownLanes(d, a, lpb));
+ }
+
  // ------------------------------ TableLookupLanes

  template <class D, class VI>
@@ -4457,6 +4834,8 @@ HWY_API T ReduceMax(D d, const VFromD<D> v) {

  #undef HWY_RVV_REDUCE

+ // TODO: add MaskedReduceSum/Min/Max
+
  // ------------------------------ SumOfLanes

  template <class D, HWY_IF_LANES_GT_D(D, 1)>
@@ -4687,7 +5066,7 @@ HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT)

  #else // !HWY_HAVE_TUPLE

- template <class D, typename T = TFromD<D>>
+ template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
  HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
  VFromD<D>& v0, VFromD<D>& v1) {
  const VFromD<D> A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0]
@@ -4710,7 +5089,7 @@ HWY_RVV_FOREACH(HWY_RVV_LOAD_STRIDED, LoadStrided, lse, _ALL_VIRT)
  #undef HWY_RVV_LOAD_STRIDED
  } // namespace detail

- template <class D, typename T = TFromD<D>>
+ template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
  HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  // Offsets are bytes, and this is not documented.
@@ -4719,7 +5098,7 @@ HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
  v2 = detail::LoadStrided(d, unaligned + 2, 3 * sizeof(T));
  }

- template <class D, typename T = TFromD<D>>
+ template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
  HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
  VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
  VFromD<D>& v3) {
@@ -4732,7 +5111,7 @@ HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,

  // Not 64-bit / max LMUL: interleave via promote, slide, OddEven.
  template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
- HWY_IF_POW2_LE_D(D, 2)>
+ HWY_IF_POW2_LE_D(D, 2), HWY_RVV_IF_NOT_EMULATED_D(D)>
  HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
  T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<D> du;
@@ -4747,7 +5126,7 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,

  // Can promote, max LMUL: two half-length
  template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
- HWY_IF_POW2_GT_D(D, 2)>
+ HWY_IF_POW2_GT_D(D, 2), HWY_RVV_IF_NOT_EMULATED_D(D)>
  HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
  T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
@@ -4771,7 +5150,8 @@ HWY_RVV_FOREACH(HWY_RVV_STORE_STRIDED, StoreStrided, sse, _ALL_VIRT)
  } // namespace detail

  // 64-bit: strided
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8)>
+ template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8),
+ HWY_RVV_IF_NOT_EMULATED_D(D)>
  HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
  T* HWY_RESTRICT unaligned) {
  // Offsets are bytes, and this is not documented.
@@ -4779,7 +5159,7 @@ HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
  detail::StoreStrided(v1, d, unaligned + 1, 2 * sizeof(T));
  }

- template <class D, typename T = TFromD<D>>
+ template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
  HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
  T* HWY_RESTRICT unaligned) {
  // Offsets are bytes, and this is not documented.
@@ -4788,7 +5168,7 @@ HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
  detail::StoreStrided(v2, d, unaligned + 2, 3 * sizeof(T));
  }

- template <class D, typename T = TFromD<D>>
+ template <class D, typename T = TFromD<D>, HWY_RVV_IF_NOT_EMULATED_D(D)>
  HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
  VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
  // Offsets are bytes, and this is not documented.
@@ -4800,6 +5180,9 @@ HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,

  #endif // HWY_HAVE_TUPLE

+ // Rely on generic Load/StoreInterleaved[234] for any emulated types.
+ // Requires HWY_GENERIC_IF_EMULATED_D mirrors HWY_RVV_IF_EMULATED_D.
+
  // ------------------------------ Dup128VecFromValues (ResizeBitCast)

  template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
@@ -5176,6 +5559,12 @@ template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
  constexpr unsigned MaxMaskBits() {
  return ~0u;
  }
+
+ template <class D>
+ constexpr int SufficientPow2ForMask() {
+ return HWY_MAX(
+ D().Pow2() - 3 - static_cast<int>(FloorLog2(sizeof(TFromD<D>))), -3);
+ }
  } // namespace detail

  template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
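SufficientPow2ForMask picks the smallest u8 vector (by Pow2, i.e. LMUL) that can still hold one mask bit per lane of D, which is Lanes(D)/8 bytes, clamped at mf8. A standalone re-derivation of the formula (editorial sketch, not Highway API):

    // q >= Pow2(D) - 3 - log2(sizeof(T)), clamped to -3 (LMUL = mf8).
    constexpr int SufficientPow2(int pow2, int log2_size) {
      const int q = pow2 - 3 - log2_size;
      return q < -3 ? -3 : q;
    }
    static_assert(SufficientPow2(0, 1) == -3, "u16 at m1: mf8 bytes suffice");
    static_assert(SufficientPow2(3, 0) == 0, "u8 at m8 needs m1 bytes");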
@@ -5202,11 +5591,13 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
  HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
- const ScalableTag<uint8_t> du8;
- const ScalableTag<uint16_t> du16;
+ const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
+ const ScalableTag<uint16_t, detail::SufficientPow2ForMask<D>()> du16;
  // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
  return detail::U8MaskBitsVecToMask(
- d, BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))));
+ d, detail::ChangeLMUL(
+ ScalableTag<uint8_t>(),
+ BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)))));
  #else
  // Slow fallback for completeness; the above bits to mask cast is preferred.
  const RebindToUnsigned<decltype(d)> du8;
@@ -5233,10 +5624,11 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();

  #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
- const ScalableTag<uint8_t> du8;
+ const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
- return detail::U8MaskBitsVecToMask(d,
- Set(du8, static_cast<uint8_t>(mask_bits)));
+ return detail::U8MaskBitsVecToMask(
+ d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
+ Set(du8, static_cast<uint8_t>(mask_bits))));
  #else
  // Slow fallback for completeness; the above bits to mask cast is preferred.
  const RebindToUnsigned<D> du;
@@ -5252,9 +5644,10 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();

  #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
- const ScalableTag<uint8_t> du8;
+ const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  return detail::U8MaskBitsVecToMask(
- d, Set(du8, static_cast<uint8_t>(mask_bits * 0x11)));
+ d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
+ Set(du8, static_cast<uint8_t>(mask_bits * 0x11))));
  #else
  // Slow fallback for completeness; the above bits to mask cast is preferred.
  const RebindToUnsigned<D> du;
@@ -5269,9 +5662,10 @@ HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();

  #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
- const ScalableTag<uint8_t> du8;
+ const ScalableTag<uint8_t, detail::SufficientPow2ForMask<D>()> du8;
  return detail::U8MaskBitsVecToMask(
- d, Set(du8, static_cast<uint8_t>(mask_bits * 0x55)));
+ d, detail::ChangeLMUL(ScalableTag<uint8_t>(),
+ Set(du8, static_cast<uint8_t>(mask_bits * 0x55))));
  #else
  // Slow fallback for completeness; the above bits to mask cast is preferred.
  const RebindToUnsigned<D> du;
@@ -5553,9 +5947,13 @@ HWY_API V64 BitShuffle(V64 values, VI idx) {
  template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
  class D = DFromV<V>, class DW = RepartitionToWide<D>>
  HWY_API VFromD<DW> MulEven(const V a, const V b) {
- const auto lo = Mul(a, b);
- const auto hi = MulHigh(a, b);
- return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
+ constexpr int maskVal = sizeof(TFromD<D>) == 4 ? 5
+ : sizeof(TFromD<D>) == 2 ? 0x55
+ : 0x5555;
+ const auto mask = Dup128MaskFromMaskBits(D(), maskVal);
+ const auto hi = Slide1Up(D(), MulHigh(a, b));
+ const auto res = MaskedMulOr(hi, mask, a, b);
+ return BitCast(DW(), res);
  }

  template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
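MulEven above now writes Mul(a, b) into even lanes (selected by the 0x5... mask via MaskedMulOr) and the slid-up MulHigh(a, b) into odd lanes; the BitCast to the wide type then reads each even/odd pair as one full product. Assuming those semantics, a per-pair scalar reference:

    #include <stdint.h>
    // One wide output lane: the full product of the even input lanes.
    static inline uint32_t MulEvenLane(uint16_t a_even, uint16_t b_even) {
      return (uint32_t)a_even * b_even;
    }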
@@ -5569,9 +5967,9 @@ HWY_API VFromD<DW> MulOdd(const V a, const V b) {
  // There is no 64x64 vwmul.
  template <class V, HWY_IF_T_SIZE_V(V, 8)>
  HWY_INLINE V MulEven(const V a, const V b) {
- const auto lo = Mul(a, b);
- const auto hi = MulHigh(a, b);
- return OddEven(detail::Slide1Up(hi), lo);
+ const auto mask = Dup128MaskFromMaskBits(DFromV<V>(), 1);
+ const auto hi = Slide1Up(DFromV<V>(), MulHigh(a, b));
+ return MaskedMulOr(hi, mask, a, b);
  }

  template <class V, HWY_IF_T_SIZE_V(V, 8)>
@@ -5915,6 +6313,23 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
  #endif // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400

  // ------------------------------ Lt128Upper
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+ template <class D>
+ HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+ auto du8mf8 = ScalableTag<uint8_t, -3>{};
+ const vuint8mf8_t ltHL =
+ detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Lt(a, b)));
+ const vuint8mf8_t ltHx = detail::AndS(ltHL, 0xaa);
+ const vuint8mf8_t ltxL = ShiftRight<1>(ltHx);
+ auto du8m1 = ScalableTag<uint8_t>{};
+ return detail::U8MaskBitsVecToMask(d,
+ detail::ChangeLMUL(du8m1, Or(ltHx, ltxL)));
+ }
+
+ #else
+
  template <class D>
  HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
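The fast path above packs per-lane comparison results into bytes of an mf8 u8 vector, where bit 2i holds the low u64 lane of a 128-bit pair and bit 2i+1 the high lane. For Lt128Upper only the high-lane bit matters, so it is isolated with 0xaa and copied down one position. A bit-level sketch of that step, under the same layout assumption:

    #include <stdint.h>
    static inline uint8_t ReplicateUpperBits(uint8_t hl) {
      const uint8_t h = hl & 0xaa;      // keep the high-lane (odd) bits
      return (uint8_t)(h | (h >> 1));   // copy each into its pair's low bit
    }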
@@ -5926,7 +6341,27 @@ HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  return MaskFromVec(OddEven(ltHL, down));
  }

+ #endif // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
  // ------------------------------ Eq128
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+ template <class D>
+ HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+ auto du8mf8 = ScalableTag<uint8_t, -3>{};
+ const vuint8mf8_t eqHL =
+ detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
+ const vuint8mf8_t eqxH = ShiftRight<1>(eqHL);
+ const vuint8mf8_t result0L = detail::AndS(And(eqHL, eqxH), 0x55);
+ const vuint8mf8_t resultH0 = Add(result0L, result0L);
+ auto du8m1 = ScalableTag<uint8_t>{};
+ return detail::U8MaskBitsVecToMask(
+ d, detail::ChangeLMUL(du8m1, Or(result0L, resultH0)));
+ }
+
+ #else
+
  template <class D>
  HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5938,7 +6373,26 @@ HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
  return MaskFromVec(eq);
  }

+ #endif
+
  // ------------------------------ Eq128Upper
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+ template <class D>
+ HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+ auto du8mf8 = ScalableTag<uint8_t, -3>{};
+ const vuint8mf8_t eqHL =
+ detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Eq(a, b)));
+ const vuint8mf8_t eqHx = detail::AndS(eqHL, 0xaa);
+ const vuint8mf8_t eqxL = ShiftRight<1>(eqHx);
+ auto du8m1 = ScalableTag<uint8_t>{};
+ return detail::U8MaskBitsVecToMask(d,
+ detail::ChangeLMUL(du8m1, Or(eqHx, eqxL)));
+ }
+
+ #else
+
  template <class D>
  HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5947,7 +6401,27 @@ HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
  }

+ #endif
+
  // ------------------------------ Ne128
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+ template <class D>
+ HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+ auto du8mf8 = ScalableTag<uint8_t, -3>{};
+ const vuint8mf8_t neHL =
+ detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Ne(a, b)));
+ const vuint8mf8_t nexH = ShiftRight<1>(neHL);
+ const vuint8mf8_t result0L = detail::AndS(Or(neHL, nexH), 0x55);
+ const vuint8mf8_t resultH0 = Add(result0L, result0L);
+ auto du8m1 = ScalableTag<uint8_t>{};
+ return detail::U8MaskBitsVecToMask(
+ d, detail::ChangeLMUL(du8m1, Or(result0L, resultH0)));
+ }
+
+ #else
+
  template <class D>
  HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5958,7 +6432,26 @@ HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
  return MaskFromVec(Or(neHL, neLH));
  }

+ #endif
+
  // ------------------------------ Ne128Upper
+ #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+
+ template <class D>
+ HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
+ auto du8mf8 = ScalableTag<uint8_t, -3>{};
+ const vuint8mf8_t neHL =
+ detail::ChangeLMUL(du8mf8, detail::MaskToU8MaskBitsVec(Ne(a, b)));
+ const vuint8mf8_t neHx = detail::AndS(neHL, 0xaa);
+ const vuint8mf8_t nexL = ShiftRight<1>(neHx);
+ auto du8m1 = ScalableTag<uint8_t>{};
+ return detail::U8MaskBitsVecToMask(d,
+ detail::ChangeLMUL(du8m1, Or(neHx, nexL)));
+ }
+
+ #else
+
  template <class D>
  HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
@@ -5970,6 +6463,8 @@ HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
  return MaskFromVec(OddEven(neHL, down));
  }

+ #endif
+
  // ------------------------------ Min128, Max128 (Lt128)

  template <class D>