@img/sharp-libvips-dev 1.2.0 → 1.2.2-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/include/ffi.h +3 -3
- package/include/harfbuzz/hb-deprecated.h +4 -4
- package/include/harfbuzz/hb-font.h +120 -9
- package/include/harfbuzz/hb-version.h +3 -3
- package/include/hwy/abort.h +2 -19
- package/include/hwy/aligned_allocator.h +11 -7
- package/include/hwy/auto_tune.h +504 -0
- package/include/hwy/base.h +425 -104
- package/include/hwy/cache_control.h +16 -0
- package/include/hwy/detect_compiler_arch.h +32 -1
- package/include/hwy/detect_targets.h +251 -67
- package/include/hwy/foreach_target.h +35 -0
- package/include/hwy/highway.h +185 -76
- package/include/hwy/nanobenchmark.h +1 -19
- package/include/hwy/ops/arm_neon-inl.h +969 -458
- package/include/hwy/ops/arm_sve-inl.h +1137 -359
- package/include/hwy/ops/emu128-inl.h +97 -11
- package/include/hwy/ops/generic_ops-inl.h +1222 -34
- package/include/hwy/ops/loongarch_lasx-inl.h +4664 -0
- package/include/hwy/ops/loongarch_lsx-inl.h +5933 -0
- package/include/hwy/ops/ppc_vsx-inl.h +306 -126
- package/include/hwy/ops/rvv-inl.h +546 -51
- package/include/hwy/ops/scalar-inl.h +77 -22
- package/include/hwy/ops/set_macros-inl.h +138 -17
- package/include/hwy/ops/shared-inl.h +50 -10
- package/include/hwy/ops/wasm_128-inl.h +137 -92
- package/include/hwy/ops/x86_128-inl.h +773 -214
- package/include/hwy/ops/x86_256-inl.h +712 -255
- package/include/hwy/ops/x86_512-inl.h +429 -753
- package/include/hwy/ops/x86_avx3-inl.h +501 -0
- package/include/hwy/per_target.h +2 -1
- package/include/hwy/profiler.h +622 -486
- package/include/hwy/targets.h +62 -20
- package/include/hwy/timer-inl.h +8 -160
- package/include/hwy/timer.h +170 -3
- package/include/hwy/x86_cpuid.h +81 -0
- package/include/libheif/heif_cxx.h +25 -5
- package/include/libheif/heif_regions.h +5 -5
- package/include/libheif/heif_version.h +2 -2
- package/include/librsvg-2.0/librsvg/rsvg-version.h +2 -2
- package/include/libxml2/libxml/xmlversion.h +4 -4
- package/include/pango-1.0/pango/pango-enum-types.h +3 -0
- package/include/pango-1.0/pango/pango-features.h +3 -3
- package/include/pango-1.0/pango/pango-font.h +30 -0
- package/include/pango-1.0/pango/pango-version-macros.h +26 -0
- package/include/pixman-1/pixman-version.h +2 -2
- package/include/webp/decode.h +11 -2
- package/include/webp/demux.h +2 -0
- package/include/webp/encode.h +2 -0
- package/include/webp/mux_types.h +1 -0
- package/include/webp/sharpyuv/sharpyuv.h +1 -1
- package/include/webp/types.h +2 -2
- package/include/zlib.h +3 -3
- package/package.json +1 -1
- package/versions.json +11 -11
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
|
|
17
17
|
#define HIGHWAY_HWY_CACHE_CONTROL_H_
|
|
18
18
|
|
|
19
|
+
#include "hwy/aligned_allocator.h" // HWY_ALIGNMENT
|
|
19
20
|
#include "hwy/base.h"
|
|
20
21
|
|
|
21
22
|
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
|
|
@@ -66,6 +67,21 @@ HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
|
|
|
66
67
|
// TODO(janwas): remove when this function is removed. (See above.)
|
|
67
68
|
#pragma pop_macro("LoadFence")
|
|
68
69
|
|
|
70
|
+
// Overwrites "to" while attempting to bypass the cache (read-for-ownership).
|
|
71
|
+
// Both pointers must be aligned.
|
|
72
|
+
static HWY_INLINE void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
|
|
73
|
+
uint64_t* HWY_RESTRICT to) {
|
|
74
|
+
HWY_DASSERT(IsAligned(from));
|
|
75
|
+
HWY_DASSERT(IsAligned(to));
|
|
76
|
+
#if HWY_COMPILER_CLANG && !defined(HWY_DISABLE_CACHE_CONTROL)
|
|
77
|
+
for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); ++i) {
|
|
78
|
+
__builtin_nontemporal_store(from[i], to + i);
|
|
79
|
+
}
|
|
80
|
+
#else
|
|
81
|
+
hwy::CopyBytes(from, to, HWY_ALIGNMENT);
|
|
82
|
+
#endif
|
|
83
|
+
}
|
|
84
|
+
|
|
69
85
|
// Ensures values written by previous `Stream` calls are visible on the current
|
|
70
86
|
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
|
|
71
87
|
// outputs are to be consumed by other core(s), the producer must publish
|
|
@@ -192,6 +192,18 @@
|
|
|
192
192
|
#define HWY_IF_CONSTEXPR if
|
|
193
193
|
#endif
|
|
194
194
|
|
|
195
|
+
// Use for constexpr variables at namespace scope in headers. Constexpr is
|
|
196
|
+
// separate to allow using `HWY_CXX14_CONSTEXPR` if required.
|
|
197
|
+
#ifndef HWY_INLINE_VAR
|
|
198
|
+
#if __cplusplus > 201402L
|
|
199
|
+
// C++17: mark as COMDAT to ensure linkers de-duplicate it. See
|
|
200
|
+
// https://quuxplusone.github.io/blog/2022/07/08/inline-constexpr/
|
|
201
|
+
#define HWY_INLINE_VAR inline
|
|
202
|
+
#else
|
|
203
|
+
#define HWY_INLINE_VAR
|
|
204
|
+
#endif
|
|
205
|
+
#endif
|
|
206
|
+
|
|
195
207
|
//------------------------------------------------------------------------------
|
|
196
208
|
// Architecture
|
|
197
209
|
|
|
@@ -303,10 +315,29 @@
|
|
|
303
315
|
#define HWY_ARCH_S390X 0
|
|
304
316
|
#endif
|
|
305
317
|
|
|
318
|
+
#if defined(__loongarch64__) || defined(__loongarch64) || \
|
|
319
|
+
(defined(__loongarch_grlen) && __loongarch_grlen == 64)
|
|
320
|
+
#define HWY_ARCH_LOONGARCH_64 1
|
|
321
|
+
#else
|
|
322
|
+
#define HWY_ARCH_LOONGARCH_64 0
|
|
323
|
+
#endif
|
|
324
|
+
|
|
325
|
+
#if defined(__loongarch__) && !HWY_ARCH_LOONGARCH_64
|
|
326
|
+
#define HWY_ARCH_LOONGARCH_32 1
|
|
327
|
+
#else
|
|
328
|
+
#define HWY_ARCH_LOONGARCH_32 0
|
|
329
|
+
#endif
|
|
330
|
+
|
|
331
|
+
#if HWY_ARCH_LOONGARCH_64 || HWY_ARCH_LOONGARCH_32
|
|
332
|
+
#define HWY_ARCH_LOONGARCH 1
|
|
333
|
+
#else
|
|
334
|
+
#define HWY_ARCH_LOONGARCH 0
|
|
335
|
+
#endif
|
|
336
|
+
|
|
306
337
|
// It is an error to detect multiple architectures at the same time, but OK to
|
|
307
338
|
// detect none of the above.
|
|
308
339
|
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
|
|
309
|
-
HWY_ARCH_WASM + HWY_ARCH_RISCV + HWY_ARCH_S390X) > 1
|
|
340
|
+
HWY_ARCH_WASM + HWY_ARCH_RISCV + HWY_ARCH_S390X + HWY_ARCH_LOONGARCH) > 1
|
|
310
341
|
#error "Must not detect more than one architecture"
|
|
311
342
|
#endif
|
|
312
343
|
|
|
@@ -59,20 +59,20 @@
|
|
|
59
59
|
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
|
|
60
60
|
|
|
61
61
|
// --------------------------- x86: 15 targets (+ one fallback)
|
|
62
|
-
// Bits 0..
|
|
62
|
+
// Bits 0..2 reserved (3 targets)
|
|
63
|
+
#define HWY_AVX10_2 (1LL << 3) // AVX10.2 with 512-bit vectors
|
|
63
64
|
#define HWY_AVX3_SPR (1LL << 4)
|
|
64
|
-
// Bit 5 reserved (
|
|
65
|
-
// Currently HWY_AVX3_DL plus AVX512BF16 and a special case for
|
|
66
|
-
// (10x as fast).
|
|
67
|
-
//
|
|
65
|
+
// Bit 5: reserved (1 target)
|
|
66
|
+
// Currently `HWY_AVX3_DL` plus `AVX512BF16` and a special case for
|
|
67
|
+
// `CompressStore` (10x as fast, still useful on Zen5). We may later also use
|
|
68
|
+
// `VPCONFLICT`. Note that `VP2INTERSECT` is available in Zen5.
|
|
68
69
|
#define HWY_AVX3_ZEN4 (1LL << 6) // see HWY_WANT_AVX3_ZEN4 below
|
|
69
70
|
|
|
70
|
-
// Currently satisfiable by Ice Lake (VNNI
|
|
71
|
-
// VAES
|
|
72
|
-
|
|
73
|
-
#define
|
|
74
|
-
#define
|
|
75
|
-
#define HWY_AVX2 (1LL << 9) // HWY_SSE4 plus BMI2 + F16 + FMA
|
|
71
|
+
// Currently satisfiable by Ice Lake (`VNNI`, `VPCLMULQDQ`, `VPOPCNTDQ`,
|
|
72
|
+
// `VBMI`, `VBMI2`, `VAES`, `BITALG`, `GFNI`).
|
|
73
|
+
#define HWY_AVX3_DL (1LL << 7)
|
|
74
|
+
#define HWY_AVX3 (1LL << 8) // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL
|
|
75
|
+
#define HWY_AVX2 (1LL << 9) // HWY_SSE4 plus BMI2 + F16 + FMA
|
|
76
76
|
// Bit 10: reserved
|
|
77
77
|
#define HWY_SSE4 (1LL << 11) // SSE4.2 plus AES + CLMUL
|
|
78
78
|
#define HWY_SSSE3 (1LL << 12) // S-SSE3
|
|
@@ -107,8 +107,14 @@
|
|
|
107
107
|
// Bit 38 reserved
|
|
108
108
|
#define HWY_HIGHEST_TARGET_BIT_RVV 38
|
|
109
109
|
|
|
110
|
-
// ---------------------------
|
|
111
|
-
// Bits 39
|
|
110
|
+
// --------------------------- LoongArch: 3 targets (+ one fallback)
|
|
111
|
+
// Bit 39 reserved (1 target)
|
|
112
|
+
#define HWY_LASX (1LL << 40)
|
|
113
|
+
#define HWY_LSX (1LL << 41)
|
|
114
|
+
#define HWY_HIGHEST_TARGET_BIT_LOONGARCH 41
|
|
115
|
+
|
|
116
|
+
// --------------------------- Future expansion: 1 target
|
|
117
|
+
// Bit 42 reserved
|
|
112
118
|
|
|
113
119
|
// --------------------------- IBM Power/ZSeries: 9 targets (+ one fallback)
|
|
114
120
|
// Bits 43..46 reserved (4 targets)
|
|
@@ -149,10 +155,10 @@
|
|
|
149
155
|
// Broken means excluded from enabled due to known compiler issues. We define
|
|
150
156
|
// separate HWY_BROKEN_* and then OR them together (more than one might apply).
|
|
151
157
|
|
|
158
|
+
#ifndef HWY_BROKEN_CLANG6 // allow override
|
|
152
159
|
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
|
|
153
160
|
// SSE4 codegen (possibly only for msan), so disable all those targets.
|
|
154
161
|
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
|
|
155
|
-
|
|
156
162
|
#define HWY_BROKEN_CLANG6 (HWY_SSE4 | (HWY_SSE4 - 1))
|
|
157
163
|
// This entails a major speed reduction, so warn unless the user explicitly
|
|
158
164
|
// opts in to scalar-only.
|
|
@@ -163,21 +169,32 @@
|
|
|
163
169
|
#else
|
|
164
170
|
#define HWY_BROKEN_CLANG6 0
|
|
165
171
|
#endif
|
|
172
|
+
#endif // HWY_BROKEN_CLANG6
|
|
166
173
|
|
|
174
|
+
#ifndef HWY_BROKEN_32BIT // allow override
|
|
167
175
|
// 32-bit may fail to compile AVX2/3.
|
|
168
176
|
#if HWY_ARCH_X86_32
|
|
177
|
+
// GCC-13 is ok with AVX2:
|
|
178
|
+
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1300)
|
|
179
|
+
#define HWY_BROKEN_32BIT (HWY_AVX3 | (HWY_AVX3 - 1))
|
|
180
|
+
#else
|
|
169
181
|
#define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1))
|
|
182
|
+
#endif
|
|
170
183
|
#else
|
|
171
184
|
#define HWY_BROKEN_32BIT 0
|
|
172
185
|
#endif
|
|
186
|
+
#endif // HWY_BROKEN_32BIT
|
|
173
187
|
|
|
188
|
+
#ifndef HWY_BROKEN_MSVC // allow override
|
|
174
189
|
// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
|
|
175
190
|
#if HWY_COMPILER_MSVC != 0
|
|
176
191
|
#define HWY_BROKEN_MSVC (HWY_AVX3 | (HWY_AVX3 - 1))
|
|
177
192
|
#else
|
|
178
193
|
#define HWY_BROKEN_MSVC 0
|
|
179
194
|
#endif
|
|
195
|
+
#endif // HWY_BROKEN_MSVC
|
|
180
196
|
|
|
197
|
+
#ifndef HWY_BROKEN_AVX3_DL_ZEN4 // allow override
|
|
181
198
|
// AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC
|
|
182
199
|
// 2021.
|
|
183
200
|
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 801) || \
|
|
@@ -186,7 +203,9 @@
|
|
|
186
203
|
#else
|
|
187
204
|
#define HWY_BROKEN_AVX3_DL_ZEN4 0
|
|
188
205
|
#endif
|
|
206
|
+
#endif // HWY_BROKEN_AVX3_DL_ZEN4
|
|
189
207
|
|
|
208
|
+
#ifndef HWY_BROKEN_AVX3_SPR // allow override
|
|
190
209
|
// AVX3_SPR requires clang >= 14, gcc >= 12, or ICC 2021.
|
|
191
210
|
#if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1400) || \
|
|
192
211
|
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \
|
|
@@ -195,25 +214,37 @@
|
|
|
195
214
|
#else
|
|
196
215
|
#define HWY_BROKEN_AVX3_SPR 0
|
|
197
216
|
#endif
|
|
217
|
+
#endif // HWY_BROKEN_AVX3_SPR
|
|
198
218
|
|
|
219
|
+
#ifndef HWY_BROKEN_ARM7_BIG_ENDIAN // allow override
|
|
199
220
|
// armv7be has not been tested and is not yet supported.
|
|
200
221
|
#if HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN
|
|
201
222
|
#define HWY_BROKEN_ARM7_BIG_ENDIAN HWY_ALL_NEON
|
|
202
223
|
#else
|
|
203
224
|
#define HWY_BROKEN_ARM7_BIG_ENDIAN 0
|
|
204
225
|
#endif
|
|
226
|
+
#endif // HWY_BROKEN_ARM7_BIG_ENDIAN
|
|
205
227
|
|
|
228
|
+
#ifdef __ARM_NEON_FP
|
|
229
|
+
#define HWY_HAVE_NEON_FP __ARM_NEON_FP
|
|
230
|
+
#else
|
|
231
|
+
#define HWY_HAVE_NEON_FP 0
|
|
232
|
+
#endif
|
|
233
|
+
|
|
234
|
+
#ifndef HWY_BROKEN_ARM7_WITHOUT_VFP4 // allow override
|
|
206
235
|
// armv7-a without a detected vfpv4 is not supported
|
|
207
236
|
// (for example Cortex-A8, Cortex-A9)
|
|
208
237
|
// vfpv4 always have neon half-float _and_ FMA.
|
|
209
238
|
#if HWY_ARCH_ARM_V7 && (__ARM_ARCH_PROFILE == 'A') && \
|
|
210
239
|
!defined(__ARM_VFPV4__) && \
|
|
211
|
-
!((
|
|
240
|
+
!((HWY_HAVE_NEON_FP & 0x2 /* half-float */) && (__ARM_FEATURE_FMA == 1))
|
|
212
241
|
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 HWY_ALL_NEON
|
|
213
242
|
#else
|
|
214
243
|
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 0
|
|
215
244
|
#endif
|
|
245
|
+
#endif // HWY_BROKEN_ARM7_WITHOUT_VFP4
|
|
216
246
|
|
|
247
|
+
#ifndef HWY_BROKEN_NEON_BF16 // allow override
|
|
217
248
|
// HWY_NEON_BF16 requires recent compilers.
|
|
218
249
|
#if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1700) || \
|
|
219
250
|
(HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 1302)
|
|
@@ -221,15 +252,34 @@
|
|
|
221
252
|
#else
|
|
222
253
|
#define HWY_BROKEN_NEON_BF16 0
|
|
223
254
|
#endif
|
|
255
|
+
#endif // HWY_BROKEN_NEON_BF16
|
|
224
256
|
|
|
225
257
|
// SVE[2] require recent clang or gcc versions.
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
258
|
+
|
|
259
|
+
#ifndef HWY_BROKEN_SVE // allow override
|
|
260
|
+
// GCC 10+. Clang 19 still has many test failures for SVE. No Apple CPU (at
|
|
261
|
+
// least up to and including M4 and A18) has SVE.
|
|
262
|
+
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2000) || \
|
|
263
|
+
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
|
|
264
|
+
HWY_OS_APPLE
|
|
265
|
+
#define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE_256)
|
|
229
266
|
#else
|
|
230
267
|
#define HWY_BROKEN_SVE 0
|
|
231
268
|
#endif
|
|
269
|
+
#endif // HWY_BROKEN_SVE
|
|
270
|
+
|
|
271
|
+
#ifndef HWY_BROKEN_SVE2 // allow override
|
|
272
|
+
// Clang 19 still has many test failures for SVE2.
|
|
273
|
+
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2000) || \
|
|
274
|
+
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
|
|
275
|
+
HWY_OS_APPLE
|
|
276
|
+
#define HWY_BROKEN_SVE2 (HWY_SVE2 | HWY_SVE2_128)
|
|
277
|
+
#else
|
|
278
|
+
#define HWY_BROKEN_SVE2 0
|
|
279
|
+
#endif
|
|
280
|
+
#endif // HWY_BROKEN_SVE2
|
|
232
281
|
|
|
282
|
+
#ifndef HWY_BROKEN_PPC10 // allow override
|
|
233
283
|
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1100)
|
|
234
284
|
// GCC 10 supports the -mcpu=power10 option but does not support the PPC10
|
|
235
285
|
// vector intrinsics
|
|
@@ -257,6 +307,55 @@
|
|
|
257
307
|
#else
|
|
258
308
|
#define HWY_BROKEN_PPC10 0
|
|
259
309
|
#endif
|
|
310
|
+
#endif // HWY_BROKEN_PPC10
|
|
311
|
+
|
|
312
|
+
#ifndef HWY_BROKEN_PPC_32BIT // allow override
|
|
313
|
+
// PPC8/PPC9/PPC10 targets may fail to compile on 32-bit PowerPC
|
|
314
|
+
#if HWY_ARCH_PPC && !HWY_ARCH_PPC_64
|
|
315
|
+
#define HWY_BROKEN_PPC_32BIT (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)
|
|
316
|
+
#else
|
|
317
|
+
#define HWY_BROKEN_PPC_32BIT 0
|
|
318
|
+
#endif
|
|
319
|
+
#endif // HWY_BROKEN_PPC_32BIT
|
|
320
|
+
|
|
321
|
+
#ifndef HWY_BROKEN_RVV // allow override
|
|
322
|
+
// HWY_RVV fails to compile with GCC < 13 or Clang < 16.
|
|
323
|
+
#if HWY_ARCH_RISCV && \
|
|
324
|
+
((HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600) || \
|
|
325
|
+
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300))
|
|
326
|
+
#define HWY_BROKEN_RVV (HWY_RVV)
|
|
327
|
+
#else
|
|
328
|
+
#define HWY_BROKEN_RVV 0
|
|
329
|
+
#endif
|
|
330
|
+
#endif // HWY_BROKEN_RVV
|
|
331
|
+
|
|
332
|
+
#ifndef HWY_BROKEN_LOONGARCH // allow override
|
|
333
|
+
// Using __loongarch_sx and __loongarch_asx macros to
|
|
334
|
+
// check whether LSX/LASX targets are available.
|
|
335
|
+
#if !defined(__loongarch_sx)
|
|
336
|
+
#define HWY_BROKEN_LOONGARCH (HWY_LSX | HWY_LASX)
|
|
337
|
+
#elif !defined(__loongarch_asx)
|
|
338
|
+
#define HWY_BROKEN_LOONGARCH (HWY_LASX)
|
|
339
|
+
#else
|
|
340
|
+
#define HWY_BROKEN_LOONGARCH 0
|
|
341
|
+
#endif
|
|
342
|
+
#endif // HWY_BROKEN_LOONGARCH
|
|
343
|
+
|
|
344
|
+
#ifndef HWY_BROKEN_Z14 // allow override
|
|
345
|
+
#if HWY_ARCH_S390X
|
|
346
|
+
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1900
|
|
347
|
+
// Clang 18 and earlier have bugs with some ZVector intrinsics
|
|
348
|
+
#define HWY_BROKEN_Z14 (HWY_Z14 | HWY_Z15)
|
|
349
|
+
#elif HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
|
|
350
|
+
// Z15 target requires GCC 9 or later
|
|
351
|
+
#define HWY_BROKEN_Z14 (HWY_Z15)
|
|
352
|
+
#else
|
|
353
|
+
#define HWY_BROKEN_Z14 0
|
|
354
|
+
#endif
|
|
355
|
+
#else // !HWY_ARCH_S390X
|
|
356
|
+
#define HWY_BROKEN_Z14 0
|
|
357
|
+
#endif // HWY_ARCH_S390X
|
|
358
|
+
#endif // HWY_BROKEN_Z14
|
|
260
359
|
|
|
261
360
|
// Allow the user to override this without any guarantee of success.
|
|
262
361
|
#ifndef HWY_BROKEN_TARGETS
|
|
@@ -265,7 +364,9 @@
|
|
|
265
364
|
(HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC | \
|
|
266
365
|
HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \
|
|
267
366
|
HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \
|
|
268
|
-
HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE |
|
|
367
|
+
HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_SVE2 | \
|
|
368
|
+
HWY_BROKEN_PPC10 | HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV | \
|
|
369
|
+
HWY_BROKEN_LOONGARCH | HWY_BROKEN_Z14)
|
|
269
370
|
|
|
270
371
|
#endif // HWY_BROKEN_TARGETS
|
|
271
372
|
|
|
@@ -279,7 +380,7 @@
|
|
|
279
380
|
// because it affects the fallback target, which must always be enabled. If 1,
|
|
280
381
|
// we instead choose HWY_SCALAR even without HWY_COMPILE_ONLY_SCALAR being set.
|
|
281
382
|
#if !defined(HWY_BROKEN_EMU128) // allow overriding
|
|
282
|
-
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL <
|
|
383
|
+
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1600) || \
|
|
283
384
|
defined(HWY_NO_LIBCXX)
|
|
284
385
|
#define HWY_BROKEN_EMU128 1
|
|
285
386
|
#else
|
|
@@ -471,19 +572,22 @@
|
|
|
471
572
|
|
|
472
573
|
#endif // non-MSVC
|
|
473
574
|
|
|
474
|
-
#if HWY_ARCH_X86 &&
|
|
575
|
+
#if HWY_ARCH_X86 && \
|
|
576
|
+
((defined(HWY_WANT_SSE2) && HWY_WANT_SSE2) || HWY_CHECK_SSE2)
|
|
475
577
|
#define HWY_BASELINE_SSE2 HWY_SSE2
|
|
476
578
|
#else
|
|
477
579
|
#define HWY_BASELINE_SSE2 0
|
|
478
580
|
#endif
|
|
479
581
|
|
|
480
|
-
#if HWY_ARCH_X86 &&
|
|
582
|
+
#if HWY_ARCH_X86 && \
|
|
583
|
+
((defined(HWY_WANT_SSSE3) && HWY_WANT_SSSE3) || HWY_CHECK_SSSE3)
|
|
481
584
|
#define HWY_BASELINE_SSSE3 HWY_SSSE3
|
|
482
585
|
#else
|
|
483
586
|
#define HWY_BASELINE_SSSE3 0
|
|
484
587
|
#endif
|
|
485
588
|
|
|
486
|
-
#if HWY_ARCH_X86 && (HWY_WANT_SSE4
|
|
589
|
+
#if HWY_ARCH_X86 && ((defined(HWY_WANT_SSE4) && HWY_WANT_SSE4) || \
|
|
590
|
+
(HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
|
|
487
591
|
#define HWY_BASELINE_SSE4 HWY_SSE4
|
|
488
592
|
#else
|
|
489
593
|
#define HWY_BASELINE_SSE4 0
|
|
@@ -497,18 +601,25 @@
|
|
|
497
601
|
#endif
|
|
498
602
|
|
|
499
603
|
// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
|
|
500
|
-
#if HWY_BASELINE_AVX2 != 0 &&
|
|
501
|
-
defined(
|
|
604
|
+
#if HWY_BASELINE_AVX2 != 0 && \
|
|
605
|
+
((defined(__AVX512F__) && defined(__AVX512BW__) && \
|
|
606
|
+
defined(__AVX512DQ__) && defined(__AVX512VL__)) || \
|
|
607
|
+
defined(__AVX10_2__)) && \
|
|
608
|
+
((!HWY_COMPILER_GCC_ACTUAL && !HWY_COMPILER_CLANG) || \
|
|
609
|
+
HWY_COMPILER_GCC_ACTUAL < 1400 || HWY_COMPILER_CLANG < 1800 || \
|
|
610
|
+
defined(__EVEX512__))
|
|
502
611
|
#define HWY_BASELINE_AVX3 HWY_AVX3
|
|
503
612
|
#else
|
|
504
613
|
#define HWY_BASELINE_AVX3 0
|
|
505
614
|
#endif
|
|
506
615
|
|
|
507
616
|
// TODO(janwas): not yet known whether these will be set by MSVC
|
|
508
|
-
#if HWY_BASELINE_AVX3 != 0 &&
|
|
509
|
-
defined(
|
|
510
|
-
|
|
511
|
-
|
|
617
|
+
#if HWY_BASELINE_AVX3 != 0 && \
|
|
618
|
+
((defined(__AVX512VNNI__) && defined(__VAES__) && \
|
|
619
|
+
defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
|
|
620
|
+
defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
|
|
621
|
+
defined(__AVX512BITALG__)) || \
|
|
622
|
+
defined(__AVX10_2__))
|
|
512
623
|
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
|
|
513
624
|
#else
|
|
514
625
|
#define HWY_BASELINE_AVX3_DL 0
|
|
@@ -523,21 +634,41 @@
|
|
|
523
634
|
#define HWY_BASELINE_AVX3_ZEN4 0
|
|
524
635
|
#endif
|
|
525
636
|
|
|
526
|
-
#if HWY_BASELINE_AVX3_DL != 0 &&
|
|
527
|
-
defined(__AVX512FP16__)
|
|
637
|
+
#if HWY_BASELINE_AVX3_DL != 0 && \
|
|
638
|
+
((defined(__AVX512BF16__) && defined(__AVX512FP16__)) || \
|
|
639
|
+
defined(__AVX10_2__))
|
|
528
640
|
#define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR
|
|
529
641
|
#else
|
|
530
642
|
#define HWY_BASELINE_AVX3_SPR 0
|
|
531
643
|
#endif
|
|
532
644
|
|
|
645
|
+
#if HWY_BASELINE_AVX3_SPR != 0 && defined(__AVX10_2__) && \
|
|
646
|
+
(HWY_COMPILER_GCC_ACTUAL >= 1500 || HWY_COMPILER_CLANG >= 2001)
|
|
647
|
+
#define HWY_BASELINE_AVX10_2 HWY_AVX10_2
|
|
648
|
+
#else
|
|
649
|
+
#define HWY_BASELINE_AVX10_2 0
|
|
650
|
+
#endif
|
|
651
|
+
|
|
533
652
|
// RVV requires intrinsics 0.11 or later, see #1156.
|
|
534
|
-
|
|
653
|
+
|
|
654
|
+
// Also check that the __riscv_v macro is defined as GCC or Clang will define
|
|
655
|
+
// the __riscv_v macro if the RISC-V "V" extension is enabled.
|
|
656
|
+
|
|
657
|
+
#if HWY_ARCH_RISCV && defined(__riscv_v) && defined(__riscv_v_intrinsic) && \
|
|
535
658
|
__riscv_v_intrinsic >= 11000
|
|
536
659
|
#define HWY_BASELINE_RVV HWY_RVV
|
|
537
660
|
#else
|
|
538
661
|
#define HWY_BASELINE_RVV 0
|
|
539
662
|
#endif
|
|
540
663
|
|
|
664
|
+
#if HWY_ARCH_LOONGARCH && defined(__loongarch_sx) && defined(__loongarch_asx)
|
|
665
|
+
#define HWY_BASELINE_LOONGARCH (HWY_LSX | HWY_LASX)
|
|
666
|
+
#elif HWY_ARCH_LOONGARCH && defined(__loongarch_sx)
|
|
667
|
+
#define HWY_BASELINE_LOONGARCH (HWY_LSX)
|
|
668
|
+
#else
|
|
669
|
+
#define HWY_BASELINE_LOONGARCH 0
|
|
670
|
+
#endif
|
|
671
|
+
|
|
541
672
|
// Allow the user to override this without any guarantee of success.
|
|
542
673
|
#ifndef HWY_BASELINE_TARGETS
|
|
543
674
|
#define HWY_BASELINE_TARGETS \
|
|
@@ -547,7 +678,7 @@
|
|
|
547
678
|
HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | HWY_BASELINE_SSSE3 | \
|
|
548
679
|
HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
|
|
549
680
|
HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | HWY_BASELINE_AVX3_SPR | \
|
|
550
|
-
HWY_BASELINE_RVV)
|
|
681
|
+
HWY_BASELINE_AVX10_2 | HWY_BASELINE_RVV | HWY_BASELINE_LOONGARCH)
|
|
551
682
|
#endif // HWY_BASELINE_TARGETS
|
|
552
683
|
|
|
553
684
|
//------------------------------------------------------------------------------
|
|
@@ -577,6 +708,22 @@
|
|
|
577
708
|
#endif
|
|
578
709
|
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
|
|
579
710
|
|
|
711
|
+
#ifndef HWY_HAVE_ASM_HWCAP // allow override
|
|
712
|
+
#ifdef TOOLCHAIN_MISS_ASM_HWCAP_H
|
|
713
|
+
#define HWY_HAVE_ASM_HWCAP 0 // CMake failed to find the header
|
|
714
|
+
#elif defined(__has_include) // note: wrapper macro fails on Clang ~17
|
|
715
|
+
// clang-format off
|
|
716
|
+
#if __has_include(<asm/hwcap.h>)
|
|
717
|
+
// clang-format on
|
|
718
|
+
#define HWY_HAVE_ASM_HWCAP 1 // header present
|
|
719
|
+
#else
|
|
720
|
+
#define HWY_HAVE_ASM_HWCAP 0 // header not present
|
|
721
|
+
#endif // __has_include
|
|
722
|
+
#else // compiler lacks __has_include
|
|
723
|
+
#define HWY_HAVE_ASM_HWCAP 0
|
|
724
|
+
#endif
|
|
725
|
+
#endif // HWY_HAVE_ASM_HWCAP
|
|
726
|
+
|
|
580
727
|
#ifndef HWY_HAVE_AUXV // allow override
|
|
581
728
|
#ifdef TOOLCHAIN_MISS_SYS_AUXV_H
|
|
582
729
|
#define HWY_HAVE_AUXV 0 // CMake failed to find the header
|
|
@@ -587,7 +734,7 @@
|
|
|
587
734
|
// clang-format off
|
|
588
735
|
#if __has_include(<sys/auxv.h>)
|
|
589
736
|
// clang-format on
|
|
590
|
-
#define HWY_HAVE_AUXV 1
|
|
737
|
+
#define HWY_HAVE_AUXV 1 // header present
|
|
591
738
|
#else
|
|
592
739
|
#define HWY_HAVE_AUXV 0 // header not present
|
|
593
740
|
#endif // __has_include
|
|
@@ -596,33 +743,57 @@
|
|
|
596
743
|
#endif
|
|
597
744
|
#endif // HWY_HAVE_AUXV
|
|
598
745
|
|
|
746
|
+
#ifndef HWY_HAVE_RUNTIME_DISPATCH_RVV // allow override
|
|
747
|
+
// The riscv_vector.h in Clang 16-18 requires compiler flags, and 19 still has
|
|
748
|
+
// some missing intrinsics, see
|
|
749
|
+
// https://github.com/llvm/llvm-project/issues/56592. GCC 13.3 also has an
|
|
750
|
+
// #error check, whereas 14.1 fails with "argument type 'vuint16m8_t' requires
|
|
751
|
+
// the V ISA extension": https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115325.
|
|
752
|
+
#if HWY_ARCH_RISCV && HWY_COMPILER_CLANG >= 1900 && 0
|
|
753
|
+
#define HWY_HAVE_RUNTIME_DISPATCH_RVV 1
|
|
754
|
+
#else
|
|
755
|
+
#define HWY_HAVE_RUNTIME_DISPATCH_RVV 0
|
|
756
|
+
#endif
|
|
757
|
+
#endif // HWY_HAVE_RUNTIME_DISPATCH_RVV
|
|
758
|
+
|
|
759
|
+
#ifndef HWY_HAVE_RUNTIME_DISPATCH_APPLE // allow override
|
|
760
|
+
#if HWY_ARCH_ARM_A64 && HWY_OS_APPLE && \
|
|
761
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700)
|
|
762
|
+
#define HWY_HAVE_RUNTIME_DISPATCH_APPLE 1
|
|
763
|
+
#else
|
|
764
|
+
#define HWY_HAVE_RUNTIME_DISPATCH_APPLE 0
|
|
765
|
+
#endif
|
|
766
|
+
#endif // HWY_HAVE_RUNTIME_DISPATCH_APPLE
|
|
767
|
+
|
|
768
|
+
#ifndef HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH // allow override
|
|
769
|
+
#if HWY_ARCH_LOONGARCH && HWY_HAVE_AUXV && (defined(__loongarch_sx) || \
|
|
770
|
+
defined(__loongarch_asx))
|
|
771
|
+
#define HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH 1
|
|
772
|
+
#else
|
|
773
|
+
#define HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH 0
|
|
774
|
+
#endif
|
|
775
|
+
#endif // HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH
|
|
776
|
+
|
|
777
|
+
#ifndef HWY_HAVE_RUNTIME_DISPATCH_LINUX // allow override
|
|
778
|
+
#if (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X) && HWY_OS_LINUX && \
|
|
779
|
+
(HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700) && HWY_HAVE_AUXV
|
|
780
|
+
#define HWY_HAVE_RUNTIME_DISPATCH_LINUX 1
|
|
781
|
+
#else
|
|
782
|
+
#define HWY_HAVE_RUNTIME_DISPATCH_LINUX 0
|
|
783
|
+
#endif
|
|
784
|
+
#endif // HWY_HAVE_RUNTIME_DISPATCH_LINUX
|
|
785
|
+
|
|
599
786
|
// Allow opting out, and without a guarantee of success, opting-in.
|
|
600
787
|
#ifndef HWY_HAVE_RUNTIME_DISPATCH
|
|
601
|
-
// Clang, GCC and MSVC allow runtime dispatch on x86.
|
|
602
|
-
#if HWY_ARCH_X86
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
// to detect CPU capabilities.
|
|
606
|
-
#elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || HWY_ARCH_RISCV) && \
|
|
607
|
-
(HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700) && HWY_OS_LINUX && \
|
|
608
|
-
HWY_HAVE_AUXV
|
|
609
|
-
#define HWY_HAVE_RUNTIME_DISPATCH 1
|
|
610
|
-
#elif HWY_ARCH_ARM_A64 && HWY_OS_APPLE && \
|
|
611
|
-
(HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700)
|
|
788
|
+
// Clang, GCC and MSVC allow OS-independent runtime dispatch on x86.
|
|
789
|
+
#if HWY_ARCH_X86 || HWY_HAVE_RUNTIME_DISPATCH_RVV || \
|
|
790
|
+
HWY_HAVE_RUNTIME_DISPATCH_APPLE || HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH || \
|
|
791
|
+
HWY_HAVE_RUNTIME_DISPATCH_LINUX
|
|
612
792
|
#define HWY_HAVE_RUNTIME_DISPATCH 1
|
|
613
793
|
#else
|
|
614
794
|
#define HWY_HAVE_RUNTIME_DISPATCH 0
|
|
615
|
-
#endif // HWY_ARCH_*
|
|
616
|
-
#endif // HWY_HAVE_RUNTIME_DISPATCH
|
|
617
|
-
|
|
618
|
-
// AVX3_DL is not widely available yet. To reduce code size and compile time,
|
|
619
|
-
// only include it in the set of attainable targets (for dynamic dispatch) if
|
|
620
|
-
// the user opts in, OR it is in the baseline (we check whether enabled below).
|
|
621
|
-
#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
|
|
622
|
-
#define HWY_ATTAINABLE_AVX3_DL (HWY_AVX3_DL)
|
|
623
|
-
#else
|
|
624
|
-
#define HWY_ATTAINABLE_AVX3_DL 0
|
|
625
795
|
#endif
|
|
796
|
+
#endif // HWY_HAVE_RUNTIME_DISPATCH
|
|
626
797
|
|
|
627
798
|
#if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH
|
|
628
799
|
#define HWY_ATTAINABLE_NEON HWY_ALL_NEON
|
|
@@ -675,24 +846,34 @@
|
|
|
675
846
|
#endif
|
|
676
847
|
|
|
677
848
|
#if HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
|
|
678
|
-
#define HWY_ATTAINABLE_RISCV
|
|
849
|
+
#define HWY_ATTAINABLE_RISCV HWY_RVV
|
|
679
850
|
#else
|
|
680
|
-
#define HWY_ATTAINABLE_RISCV
|
|
851
|
+
#define HWY_ATTAINABLE_RISCV HWY_BASELINE_RVV
|
|
681
852
|
#endif
|
|
682
853
|
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
#
|
|
686
|
-
#
|
|
854
|
+
#if HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH
|
|
855
|
+
#define HWY_ATTAINABLE_LOONGARCH (HWY_LSX | HWY_LASX)
|
|
856
|
+
#else
|
|
857
|
+
#define HWY_ATTAINABLE_LOONGARCH HWY_BASELINE_LOONGARCH
|
|
858
|
+
#endif
|
|
859
|
+
|
|
860
|
+
#ifndef HWY_ATTAINABLE_TARGETS_X86 // allow override
|
|
861
|
+
#if HWY_COMPILER_MSVC && defined(HWY_SLOW_MSVC)
|
|
687
862
|
// Fewer targets for faster builds.
|
|
688
|
-
#define
|
|
863
|
+
#define HWY_ATTAINABLE_TARGETS_X86 \
|
|
689
864
|
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_STATIC_TARGET | HWY_AVX2)
|
|
690
865
|
#else // !HWY_COMPILER_MSVC
|
|
691
|
-
#define
|
|
692
|
-
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 |
|
|
693
|
-
HWY_AVX2 | HWY_AVX3 |
|
|
694
|
-
HWY_AVX3_SPR)
|
|
866
|
+
#define HWY_ATTAINABLE_TARGETS_X86 \
|
|
867
|
+
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \
|
|
868
|
+
HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_ZEN4 | \
|
|
869
|
+
HWY_AVX3_SPR | HWY_AVX10_2)
|
|
695
870
|
#endif // !HWY_COMPILER_MSVC
|
|
871
|
+
#endif // HWY_ATTAINABLE_TARGETS_X86
|
|
872
|
+
|
|
873
|
+
// Attainable means enabled and the compiler allows intrinsics (even when not
|
|
874
|
+
// allowed to auto-vectorize). Used in 3 and 4.
|
|
875
|
+
#if HWY_ARCH_X86
|
|
876
|
+
#define HWY_ATTAINABLE_TARGETS HWY_ATTAINABLE_TARGETS_X86
|
|
696
877
|
#elif HWY_ARCH_ARM
|
|
697
878
|
#define HWY_ATTAINABLE_TARGETS \
|
|
698
879
|
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_NEON | HWY_ATTAINABLE_SVE | \
|
|
@@ -703,9 +884,12 @@
|
|
|
703
884
|
#elif HWY_ARCH_S390X
|
|
704
885
|
#define HWY_ATTAINABLE_TARGETS \
|
|
705
886
|
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_S390X)
|
|
706
|
-
#elif
|
|
887
|
+
#elif HWY_ARCH_RISCV
|
|
707
888
|
#define HWY_ATTAINABLE_TARGETS \
|
|
708
889
|
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_RISCV)
|
|
890
|
+
#elif HWY_ARCH_LOONGARCH
|
|
891
|
+
#define HWY_ATTAINABLE_TARGETS \
|
|
892
|
+
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_LOONGARCH)
|
|
709
893
|
#else
|
|
710
894
|
#define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
|
|
711
895
|
#endif // HWY_ARCH_*
|
|
@@ -143,6 +143,17 @@
|
|
|
143
143
|
#endif
|
|
144
144
|
#endif
|
|
145
145
|
|
|
146
|
+
#if (HWY_TARGETS & HWY_AVX10_2) && (HWY_STATIC_TARGET != HWY_AVX10_2)
|
|
147
|
+
#undef HWY_TARGET
|
|
148
|
+
#define HWY_TARGET HWY_AVX10_2
|
|
149
|
+
#include HWY_TARGET_INCLUDE
|
|
150
|
+
#ifdef HWY_TARGET_TOGGLE
|
|
151
|
+
#undef HWY_TARGET_TOGGLE
|
|
152
|
+
#else
|
|
153
|
+
#define HWY_TARGET_TOGGLE
|
|
154
|
+
#endif
|
|
155
|
+
#endif
|
|
156
|
+
|
|
146
157
|
// ------------------------------ HWY_ARCH_ARM
|
|
147
158
|
|
|
148
159
|
#if (HWY_TARGETS & HWY_NEON_WITHOUT_AES) && \
|
|
@@ -319,6 +330,30 @@
|
|
|
319
330
|
#endif
|
|
320
331
|
#endif
|
|
321
332
|
|
|
333
|
+
// ------------------------------ HWY_ARCH_LOONGARCH
|
|
334
|
+
|
|
335
|
+
#if (HWY_TARGETS & HWY_LSX) && (HWY_STATIC_TARGET != HWY_LSX)
|
|
336
|
+
#undef HWY_TARGET
|
|
337
|
+
#define HWY_TARGET HWY_LSX
|
|
338
|
+
#include HWY_TARGET_INCLUDE
|
|
339
|
+
#ifdef HWY_TARGET_TOGGLE
|
|
340
|
+
#undef HWY_TARGET_TOGGLE
|
|
341
|
+
#else
|
|
342
|
+
#define HWY_TARGET_TOGGLE
|
|
343
|
+
#endif
|
|
344
|
+
#endif
|
|
345
|
+
|
|
346
|
+
#if (HWY_TARGETS & HWY_LASX) && (HWY_STATIC_TARGET != HWY_LASX)
|
|
347
|
+
#undef HWY_TARGET
|
|
348
|
+
#define HWY_TARGET HWY_LASX
|
|
349
|
+
#include HWY_TARGET_INCLUDE
|
|
350
|
+
#ifdef HWY_TARGET_TOGGLE
|
|
351
|
+
#undef HWY_TARGET_TOGGLE
|
|
352
|
+
#else
|
|
353
|
+
#define HWY_TARGET_TOGGLE
|
|
354
|
+
#endif
|
|
355
|
+
#endif
|
|
356
|
+
|
|
322
357
|
// ------------------------------ Scalar
|
|
323
358
|
|
|
324
359
|
#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
|