numkong 7.4.2 → 7.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -84
- package/c/numkong.c +1 -1
- package/include/numkong/attention/sapphireamx.h +2 -2
- package/include/numkong/attention/sme.h +2 -2
- package/include/numkong/capabilities.h +47 -47
- package/include/numkong/cast/diamond.h +2 -2
- package/include/numkong/cast/haswell.h +2 -2
- package/include/numkong/cast/icelake.h +2 -2
- package/include/numkong/cast/loongsonasx.h +2 -2
- package/include/numkong/cast/neon.h +2 -2
- package/include/numkong/cast/powervsx.h +2 -2
- package/include/numkong/cast/rvv.h +2 -2
- package/include/numkong/cast/sapphire.h +2 -2
- package/include/numkong/cast/skylake.h +2 -2
- package/include/numkong/curved/genoa.h +2 -2
- package/include/numkong/curved/haswell.h +2 -2
- package/include/numkong/curved/neon.h +2 -2
- package/include/numkong/curved/neonbfdot.h +2 -2
- package/include/numkong/curved/rvv.h +2 -2
- package/include/numkong/curved/skylake.h +2 -2
- package/include/numkong/curved/smef64.h +2 -2
- package/include/numkong/dot/alder.h +2 -2
- package/include/numkong/dot/diamond.h +2 -2
- package/include/numkong/dot/genoa.h +2 -2
- package/include/numkong/dot/haswell.h +2 -2
- package/include/numkong/dot/icelake.h +2 -2
- package/include/numkong/dot/loongsonasx.h +2 -2
- package/include/numkong/dot/neon.h +2 -2
- package/include/numkong/dot/neonbfdot.h +2 -2
- package/include/numkong/dot/neonfhm.h +2 -2
- package/include/numkong/dot/neonfp8.h +2 -2
- package/include/numkong/dot/neonsdot.h +2 -2
- package/include/numkong/dot/rvv.h +2 -2
- package/include/numkong/dot/rvvbb.h +2 -2
- package/include/numkong/dot/rvvbf16.h +2 -2
- package/include/numkong/dot/rvvhalf.h +2 -2
- package/include/numkong/dot/sapphire.h +2 -2
- package/include/numkong/dot/sierra.h +2 -2
- package/include/numkong/dot/skylake.h +2 -2
- package/include/numkong/dot/sve.h +2 -2
- package/include/numkong/dot/svebfdot.h +2 -2
- package/include/numkong/dot/svehalf.h +2 -2
- package/include/numkong/dot/svesdot.h +2 -2
- package/include/numkong/dots/alder.h +2 -2
- package/include/numkong/dots/diamond.h +2 -2
- package/include/numkong/dots/genoa.h +2 -2
- package/include/numkong/dots/haswell.h +2 -2
- package/include/numkong/dots/icelake.h +2 -2
- package/include/numkong/dots/loongsonasx.h +2 -2
- package/include/numkong/dots/neon.h +2 -2
- package/include/numkong/dots/neonbfdot.h +2 -2
- package/include/numkong/dots/neonfhm.h +2 -2
- package/include/numkong/dots/neonfp8.h +2 -2
- package/include/numkong/dots/neonsdot.h +2 -2
- package/include/numkong/dots/powervsx.h +2 -2
- package/include/numkong/dots/rvv.h +2 -2
- package/include/numkong/dots/sapphireamx.h +2 -2
- package/include/numkong/dots/sierra.h +2 -2
- package/include/numkong/dots/skylake.h +2 -2
- package/include/numkong/dots/sme.h +10 -10
- package/include/numkong/dots/smebi32.h +2 -2
- package/include/numkong/dots/smef64.h +2 -2
- package/include/numkong/dots/smehalf.h +2 -2
- package/include/numkong/each/haswell.h +2 -2
- package/include/numkong/each/icelake.h +2 -2
- package/include/numkong/each/neon.h +2 -2
- package/include/numkong/each/neonbfdot.h +2 -2
- package/include/numkong/each/neonhalf.h +2 -2
- package/include/numkong/each/rvv.h +2 -2
- package/include/numkong/each/sapphire.h +2 -2
- package/include/numkong/each/skylake.h +2 -2
- package/include/numkong/geospatial/haswell.h +2 -2
- package/include/numkong/geospatial/neon.h +2 -2
- package/include/numkong/geospatial/rvv.h +2 -2
- package/include/numkong/geospatial/skylake.h +2 -2
- package/include/numkong/maxsim/alder.h +2 -2
- package/include/numkong/maxsim/genoa.h +2 -2
- package/include/numkong/maxsim/haswell.h +2 -2
- package/include/numkong/maxsim/icelake.h +2 -2
- package/include/numkong/maxsim/neonsdot.h +2 -2
- package/include/numkong/maxsim/sapphireamx.h +2 -2
- package/include/numkong/maxsim/sme.h +2 -2
- package/include/numkong/mesh/haswell.h +2 -2
- package/include/numkong/mesh/neon.h +2 -2
- package/include/numkong/mesh/neonbfdot.h +2 -2
- package/include/numkong/mesh/rvv.h +2 -2
- package/include/numkong/mesh/skylake.h +2 -2
- package/include/numkong/numkong.h +1 -1
- package/include/numkong/probability/haswell.h +2 -2
- package/include/numkong/probability/neon.h +2 -2
- package/include/numkong/probability/rvv.h +2 -2
- package/include/numkong/probability/skylake.h +2 -2
- package/include/numkong/reduce/alder.h +2 -2
- package/include/numkong/reduce/genoa.h +2 -2
- package/include/numkong/reduce/haswell.h +2 -2
- package/include/numkong/reduce/icelake.h +2 -2
- package/include/numkong/reduce/neon.h +2 -2
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +2 -2
- package/include/numkong/reduce/neonsdot.h +2 -2
- package/include/numkong/reduce/rvv.h +2 -2
- package/include/numkong/reduce/sierra.h +2 -2
- package/include/numkong/reduce/skylake.h +2 -2
- package/include/numkong/scalar/haswell.h +2 -2
- package/include/numkong/scalar/loongsonasx.h +2 -2
- package/include/numkong/scalar/neon.h +2 -2
- package/include/numkong/scalar/neonhalf.h +2 -2
- package/include/numkong/scalar/powervsx.h +2 -2
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +2 -2
- package/include/numkong/set/haswell.h +2 -2
- package/include/numkong/set/icelake.h +2 -2
- package/include/numkong/set/loongsonasx.h +2 -2
- package/include/numkong/set/neon.h +2 -2
- package/include/numkong/set/powervsx.h +2 -2
- package/include/numkong/set/rvv.h +2 -2
- package/include/numkong/set/rvvbb.h +2 -2
- package/include/numkong/set/sve.h +2 -2
- package/include/numkong/sets/haswell.h +2 -2
- package/include/numkong/sets/icelake.h +2 -2
- package/include/numkong/sets/loongsonasx.h +2 -2
- package/include/numkong/sets/neon.h +2 -2
- package/include/numkong/sets/powervsx.h +2 -2
- package/include/numkong/sets/smebi32.h +2 -2
- package/include/numkong/sparse/icelake.h +2 -2
- package/include/numkong/sparse/neon.h +2 -2
- package/include/numkong/sparse/sve2.h +2 -2
- package/include/numkong/sparse/turin.h +2 -2
- package/include/numkong/spatial/alder.h +2 -2
- package/include/numkong/spatial/diamond.h +2 -2
- package/include/numkong/spatial/genoa.h +2 -2
- package/include/numkong/spatial/haswell.h +2 -2
- package/include/numkong/spatial/icelake.h +2 -2
- package/include/numkong/spatial/loongsonasx.h +2 -2
- package/include/numkong/spatial/neon.h +2 -2
- package/include/numkong/spatial/neonbfdot.h +2 -2
- package/include/numkong/spatial/neonfp8.h +2 -2
- package/include/numkong/spatial/neonsdot.h +2 -2
- package/include/numkong/spatial/powervsx.h +2 -2
- package/include/numkong/spatial/rvv.h +2 -2
- package/include/numkong/spatial/rvvbf16.h +2 -2
- package/include/numkong/spatial/rvvhalf.h +2 -2
- package/include/numkong/spatial/sierra.h +2 -2
- package/include/numkong/spatial/skylake.h +2 -2
- package/include/numkong/spatial/sve.h +2 -2
- package/include/numkong/spatial/svebfdot.h +2 -2
- package/include/numkong/spatial/svehalf.h +2 -2
- package/include/numkong/spatial/svesdot.h +2 -2
- package/include/numkong/spatials/alder.h +2 -2
- package/include/numkong/spatials/diamond.h +2 -2
- package/include/numkong/spatials/genoa.h +2 -2
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/icelake.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +2 -2
- package/include/numkong/spatials/neon.h +2 -2
- package/include/numkong/spatials/neonbfdot.h +2 -2
- package/include/numkong/spatials/neonfhm.h +2 -2
- package/include/numkong/spatials/neonfp8.h +2 -2
- package/include/numkong/spatials/neonsdot.h +2 -2
- package/include/numkong/spatials/powervsx.h +2 -2
- package/include/numkong/spatials/rvv.h +2 -2
- package/include/numkong/spatials/sapphireamx.h +2 -2
- package/include/numkong/spatials/sierra.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +2 -2
- package/include/numkong/spatials/smef64.h +2 -2
- package/include/numkong/trigonometry/haswell.h +2 -2
- package/include/numkong/trigonometry/neon.h +2 -2
- package/include/numkong/trigonometry/rvv.h +2 -2
- package/include/numkong/trigonometry/skylake.h +2 -2
- package/include/numkong/types.h +88 -80
- package/package.json +7 -7
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
#ifndef NK_TRIGONOMETRY_NEON_H
|
|
33
33
|
#define NK_TRIGONOMETRY_NEON_H
|
|
34
34
|
|
|
35
|
-
#if
|
|
35
|
+
#if NK_TARGET_ARM64_
|
|
36
36
|
#if NK_TARGET_NEON
|
|
37
37
|
|
|
38
38
|
#include "numkong/types.h"
|
|
@@ -634,5 +634,5 @@ NK_PUBLIC void nk_each_atan_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t
|
|
|
634
634
|
#endif
|
|
635
635
|
|
|
636
636
|
#endif // NK_TARGET_NEON
|
|
637
|
-
#endif //
|
|
637
|
+
#endif // NK_TARGET_ARM64_
|
|
638
638
|
#endif // NK_TRIGONOMETRY_NEON_H
|
|
@@ -37,7 +37,7 @@
|
|
|
37
37
|
#ifndef NK_TRIGONOMETRY_RVV_H
|
|
38
38
|
#define NK_TRIGONOMETRY_RVV_H
|
|
39
39
|
|
|
40
|
-
#if
|
|
40
|
+
#if NK_TARGET_RISCV64_
|
|
41
41
|
#if NK_TARGET_RVV
|
|
42
42
|
|
|
43
43
|
#include "numkong/types.h"
|
|
@@ -696,5 +696,5 @@ NK_PUBLIC void nk_each_atan_f16_rvv(nk_f16_t const *ins, nk_size_t n, nk_f16_t *
|
|
|
696
696
|
#endif
|
|
697
697
|
|
|
698
698
|
#endif // NK_TARGET_RVV
|
|
699
|
-
#endif //
|
|
699
|
+
#endif // NK_TARGET_RISCV64_
|
|
700
700
|
#endif // NK_TRIGONOMETRY_RVV_H
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
#ifndef NK_TRIGONOMETRY_SKYLAKE_H
|
|
24
24
|
#define NK_TRIGONOMETRY_SKYLAKE_H
|
|
25
25
|
|
|
26
|
-
#if
|
|
26
|
+
#if NK_TARGET_X8664_
|
|
27
27
|
#if NK_TARGET_SKYLAKE
|
|
28
28
|
|
|
29
29
|
#include "numkong/types.h"
|
|
@@ -721,5 +721,5 @@ NK_PUBLIC void nk_each_atan_f16_skylake(nk_f16_t const *ins, nk_size_t n, nk_f16
|
|
|
721
721
|
#endif
|
|
722
722
|
|
|
723
723
|
#endif // NK_TARGET_SKYLAKE
|
|
724
|
-
#endif //
|
|
724
|
+
#endif // NK_TARGET_X8664_
|
|
725
725
|
#endif // NK_TRIGONOMETRY_SKYLAKE_H
|
package/include/numkong/types.h
CHANGED
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* Defines:
|
|
8
8
|
*
|
|
9
9
|
* - Sized aliases for numeric types, like: `nk_i32_t` and `nk_f64_t`.
|
|
10
|
-
* - Macros for internal compiler/hardware checks, like: `
|
|
10
|
+
* - Macros for internal compiler/hardware checks, like: `NK_TARGET_ARM64_`.
|
|
11
11
|
* - Macros for feature controls, like: `NK_TARGET_NEON`
|
|
12
12
|
*
|
|
13
13
|
* @section fp8_types FP8 Numeric Types
|
|
@@ -126,52 +126,52 @@
|
|
|
126
126
|
#define NK_ALLOW_ISA_REDIRECT 1
|
|
127
127
|
#endif
|
|
128
128
|
|
|
129
|
-
// Compiling for Arm:
|
|
129
|
+
// Compiling for 64-bit Arm: NK_TARGET_ARM64_
|
|
130
130
|
// https://arm-software.github.io/acle/main/acle.html
|
|
131
|
-
#if !defined(
|
|
131
|
+
#if !defined(NK_TARGET_ARM64_)
|
|
132
132
|
#if defined(__aarch64__) || defined(_M_ARM64)
|
|
133
|
-
#define
|
|
133
|
+
#define NK_TARGET_ARM64_ 1
|
|
134
134
|
#else
|
|
135
|
-
#define
|
|
135
|
+
#define NK_TARGET_ARM64_ 0
|
|
136
136
|
#endif // defined(__aarch64__) || defined(_M_ARM64)
|
|
137
|
-
#endif // !defined(
|
|
137
|
+
#endif // !defined(NK_TARGET_ARM64_)
|
|
138
138
|
|
|
139
|
-
// Compiling for x86:
|
|
139
|
+
// Compiling for x86: NK_TARGET_X8664_
|
|
140
140
|
// https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/developer-guide-reference/2024-2/additional-predefined-macros.html
|
|
141
|
-
#if !defined(
|
|
141
|
+
#if !defined(NK_TARGET_X8664_)
|
|
142
142
|
#if defined(__x86_64__) || defined(_M_X64)
|
|
143
|
-
#define
|
|
143
|
+
#define NK_TARGET_X8664_ 1
|
|
144
144
|
#else
|
|
145
|
-
#define
|
|
145
|
+
#define NK_TARGET_X8664_ 0
|
|
146
146
|
#endif // defined(__x86_64__) || defined(_M_X64)
|
|
147
|
-
#endif // !defined(
|
|
147
|
+
#endif // !defined(NK_TARGET_X8664_)
|
|
148
148
|
|
|
149
|
-
// Compiling for RISC-V:
|
|
150
|
-
#if !defined(
|
|
149
|
+
// Compiling for RISC-V: NK_TARGET_RISCV64_
|
|
150
|
+
#if !defined(NK_TARGET_RISCV64_)
|
|
151
151
|
#if defined(__riscv) && (__riscv_xlen == 64)
|
|
152
|
-
#define
|
|
152
|
+
#define NK_TARGET_RISCV64_ 1
|
|
153
153
|
#else
|
|
154
|
-
#define
|
|
154
|
+
#define NK_TARGET_RISCV64_ 0
|
|
155
155
|
#endif // defined(__riscv) && (__riscv_xlen == 64)
|
|
156
|
-
#endif // !defined(
|
|
156
|
+
#endif // !defined(NK_TARGET_RISCV64_)
|
|
157
157
|
|
|
158
|
-
// Compiling for LoongArch:
|
|
159
|
-
#if !defined(
|
|
158
|
+
// Compiling for LoongArch: NK_TARGET_LOONGARCH64_
|
|
159
|
+
#if !defined(NK_TARGET_LOONGARCH64_)
|
|
160
160
|
#if defined(__loongarch__)
|
|
161
|
-
#define
|
|
161
|
+
#define NK_TARGET_LOONGARCH64_ 1
|
|
162
162
|
#else
|
|
163
|
-
#define
|
|
163
|
+
#define NK_TARGET_LOONGARCH64_ 0
|
|
164
164
|
#endif // defined(__loongarch__)
|
|
165
|
-
#endif // !defined(
|
|
165
|
+
#endif // !defined(NK_TARGET_LOONGARCH64_)
|
|
166
166
|
|
|
167
|
-
// Compiling for Power:
|
|
168
|
-
#if !defined(
|
|
167
|
+
// Compiling for Power: NK_TARGET_POWER64_
|
|
168
|
+
#if !defined(NK_TARGET_POWER64_)
|
|
169
169
|
#if defined(__powerpc64__) || defined(__ppc64__) || defined(_ARCH_PPC64)
|
|
170
|
-
#define
|
|
170
|
+
#define NK_TARGET_POWER64_ 1
|
|
171
171
|
#else
|
|
172
|
-
#define
|
|
172
|
+
#define NK_TARGET_POWER64_ 0
|
|
173
173
|
#endif // defined(__powerpc64__) || defined(__ppc64__) || defined(_ARCH_PPC64)
|
|
174
|
-
#endif // !defined(
|
|
174
|
+
#endif // !defined(NK_TARGET_POWER64_)
|
|
175
175
|
|
|
176
176
|
// Compiling for WASM: NK_TARGET_WASM_
|
|
177
177
|
#if !defined(NK_TARGET_WASM_)
|
|
@@ -203,7 +203,7 @@
|
|
|
203
203
|
#endif // !defined(NK_TARGET_V128RELAXED) || ...
|
|
204
204
|
|
|
205
205
|
// Compiling for RISC-V Vector: NK_TARGET_RVV
|
|
206
|
-
#if !defined(NK_TARGET_RVV) || (NK_TARGET_RVV && !
|
|
206
|
+
#if !defined(NK_TARGET_RVV) || (NK_TARGET_RVV && !NK_TARGET_RISCV64_)
|
|
207
207
|
#if defined(__riscv_v) && (__riscv_v >= 1000000)
|
|
208
208
|
#define NK_TARGET_RVV 1
|
|
209
209
|
#else
|
|
@@ -248,7 +248,7 @@
|
|
|
248
248
|
// Compiling for LoongArch LASX (256-bit SIMD): NK_TARGET_LOONGSONASX
|
|
249
249
|
// LASX provides 32 × 256-bit vector registers, widening integer multiply-accumulate,
|
|
250
250
|
// and f32-to-f64 conversion (xvfcvtl_d_s / xvfcvth_d_s) but no widening FMA.
|
|
251
|
-
#if !defined(NK_TARGET_LOONGSONASX) || (NK_TARGET_LOONGSONASX && !
|
|
251
|
+
#if !defined(NK_TARGET_LOONGSONASX) || (NK_TARGET_LOONGSONASX && !NK_TARGET_LOONGARCH64_)
|
|
252
252
|
#if defined(__loongarch_asx)
|
|
253
253
|
#define NK_TARGET_LOONGSONASX 1
|
|
254
254
|
#else
|
|
@@ -261,7 +261,7 @@
|
|
|
261
261
|
// VSX provides 64 × 128-bit registers, FMA (vec_madd), vec_msum (multiply-sum), hardware f16
|
|
262
262
|
// conversion (vec_extract_fp32_from_shorth/l), length-limited loads (vec_xl_len), per-byte
|
|
263
263
|
// popcount (vec_popcnt), and vec_cmpne. Requires POWER9 (ISA 3.0) or newer.
|
|
264
|
-
#if !defined(NK_TARGET_POWERVSX) || (NK_TARGET_POWERVSX && !
|
|
264
|
+
#if !defined(NK_TARGET_POWERVSX) || (NK_TARGET_POWERVSX && !NK_TARGET_POWER64_)
|
|
265
265
|
#if defined(__VSX__) && defined(__POWER9_VECTOR__)
|
|
266
266
|
#define NK_TARGET_POWERVSX 1
|
|
267
267
|
#else
|
|
@@ -270,19 +270,20 @@
|
|
|
270
270
|
#endif // defined(__VSX__)
|
|
271
271
|
#endif // !defined(NK_TARGET_POWERVSX) || ...
|
|
272
272
|
|
|
273
|
-
// Compiling for Arm: NK_TARGET_NEON
|
|
274
|
-
#if !defined(NK_TARGET_NEON) || (NK_TARGET_NEON && !
|
|
275
|
-
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
|
|
273
|
+
// Compiling for Arm: NK_TARGET_NEON (AArch64 only, AArch32 NEON is not supported)
|
|
274
|
+
#if !defined(NK_TARGET_NEON) || (NK_TARGET_NEON && !NK_TARGET_ARM64_)
|
|
275
|
+
#if (defined(__ARM_NEON) && defined(__aarch64__)) || (defined(_MSC_VER) && defined(_M_ARM64))
|
|
276
276
|
#define NK_TARGET_NEON 1
|
|
277
277
|
#else
|
|
278
278
|
#undef NK_TARGET_NEON
|
|
279
279
|
#define NK_TARGET_NEON 0
|
|
280
|
-
#endif // defined(__ARM_NEON) || ...
|
|
280
|
+
#endif // (defined(__ARM_NEON) && defined(__aarch64__)) || ...
|
|
281
281
|
#endif // !defined(NK_TARGET_NEON) || ...
|
|
282
282
|
|
|
283
|
-
// Compiling for Arm: NK_TARGET_NEONSDOT (FEAT_DotProd,
|
|
284
|
-
#if !defined(NK_TARGET_NEONSDOT) || (NK_TARGET_NEONSDOT && !
|
|
285
|
-
#if defined(__ARM_FEATURE_DOTPROD)
|
|
283
|
+
// Compiling for Arm: NK_TARGET_NEONSDOT (FEAT_DotProd, AArch64 only)
|
|
284
|
+
#if !defined(NK_TARGET_NEONSDOT) || (NK_TARGET_NEONSDOT && !NK_TARGET_ARM64_)
|
|
285
|
+
#if (defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)) || \
|
|
286
|
+
(defined(_MSC_VER) && defined(_M_ARM64) && __ARM_ARCH >= 804)
|
|
286
287
|
#define NK_TARGET_NEONSDOT 1
|
|
287
288
|
#else
|
|
288
289
|
#undef NK_TARGET_NEONSDOT
|
|
@@ -290,9 +291,10 @@
|
|
|
290
291
|
#endif
|
|
291
292
|
#endif // !defined(NK_TARGET_NEONSDOT) || ...
|
|
292
293
|
|
|
293
|
-
// Compiling for Arm: NK_TARGET_NEONHALF (FEAT_FP16,
|
|
294
|
-
#if !defined(NK_TARGET_NEONHALF) || (NK_TARGET_NEONHALF && !
|
|
295
|
-
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
|
294
|
+
// Compiling for Arm: NK_TARGET_NEONHALF (FEAT_FP16, AArch64 only)
|
|
295
|
+
#if !defined(NK_TARGET_NEONHALF) || (NK_TARGET_NEONHALF && !NK_TARGET_ARM64_)
|
|
296
|
+
#if (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__)) || \
|
|
297
|
+
(defined(_MSC_VER) && defined(_M_ARM64) && __ARM_ARCH >= 802)
|
|
296
298
|
#define NK_TARGET_NEONHALF 1
|
|
297
299
|
#else
|
|
298
300
|
#undef NK_TARGET_NEONHALF
|
|
@@ -300,9 +302,10 @@
|
|
|
300
302
|
#endif
|
|
301
303
|
#endif // !defined(NK_TARGET_NEONHALF) || ...
|
|
302
304
|
|
|
303
|
-
// Compiling for Arm: NK_TARGET_NEONFHM (FEAT_FHM,
|
|
304
|
-
#if !defined(NK_TARGET_NEONFHM) || (NK_TARGET_NEONFHM && !
|
|
305
|
-
#if defined(__ARM_FEATURE_FP16_FML)
|
|
305
|
+
// Compiling for Arm: NK_TARGET_NEONFHM (FEAT_FHM, AArch64 only)
|
|
306
|
+
#if !defined(NK_TARGET_NEONFHM) || (NK_TARGET_NEONFHM && !NK_TARGET_ARM64_)
|
|
307
|
+
#if (defined(__ARM_FEATURE_FP16_FML) && defined(__aarch64__)) || \
|
|
308
|
+
(defined(_MSC_VER) && defined(_M_ARM64) && __ARM_ARCH >= 804)
|
|
306
309
|
#define NK_TARGET_NEONFHM 1
|
|
307
310
|
#else
|
|
308
311
|
#undef NK_TARGET_NEONFHM
|
|
@@ -310,9 +313,10 @@
|
|
|
310
313
|
#endif
|
|
311
314
|
#endif // !defined(NK_TARGET_NEONFHM) || ...
|
|
312
315
|
|
|
313
|
-
// Compiling for Arm: NK_TARGET_NEONBFDOT (FEAT_BF16,
|
|
314
|
-
#if !defined(NK_TARGET_NEONBFDOT) || (NK_TARGET_NEONBFDOT && !
|
|
315
|
-
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)
|
|
316
|
+
// Compiling for Arm: NK_TARGET_NEONBFDOT (FEAT_BF16, AArch64 only)
|
|
317
|
+
#if !defined(NK_TARGET_NEONBFDOT) || (NK_TARGET_NEONBFDOT && !NK_TARGET_ARM64_)
|
|
318
|
+
#if (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && defined(__aarch64__)) || \
|
|
319
|
+
(defined(_MSC_VER) && defined(_M_ARM64) && __ARM_ARCH >= 806)
|
|
316
320
|
#define NK_TARGET_NEONBFDOT 1
|
|
317
321
|
#else
|
|
318
322
|
#undef NK_TARGET_NEONBFDOT
|
|
@@ -323,8 +327,8 @@
|
|
|
323
327
|
// Compiling for Arm: NK_TARGET_NEONFP8 (NEON FP8 extensions, FEAT_FP8DOT4)
|
|
324
328
|
// ACLE macro __ARM_FEATURE_FP8DOT4 defined by GCC 15+ and Clang 21+ when +fp8dot4 is enabled.
|
|
325
329
|
// Older compilers lack mfloat8x16_t and the fp8dot4 target attribute entirely.
|
|
326
|
-
#if !defined(NK_TARGET_NEONFP8) || (NK_TARGET_NEONFP8 && !
|
|
327
|
-
#if defined(__ARM_FEATURE_FP8DOT4)
|
|
330
|
+
#if !defined(NK_TARGET_NEONFP8) || (NK_TARGET_NEONFP8 && !NK_TARGET_ARM64_)
|
|
331
|
+
#if defined(__ARM_FEATURE_FP8DOT4) && defined(__aarch64__)
|
|
328
332
|
#define NK_TARGET_NEONFP8 1
|
|
329
333
|
#else
|
|
330
334
|
#undef NK_TARGET_NEONFP8
|
|
@@ -333,7 +337,7 @@
|
|
|
333
337
|
#endif // !defined(NK_TARGET_NEONFP8) || ...
|
|
334
338
|
|
|
335
339
|
// Compiling for Arm: NK_TARGET_SVE
|
|
336
|
-
#if !defined(NK_TARGET_SVE) || (NK_TARGET_SVE && !
|
|
340
|
+
#if !defined(NK_TARGET_SVE) || (NK_TARGET_SVE && !NK_TARGET_ARM64_)
|
|
337
341
|
#if defined(__ARM_FEATURE_SVE)
|
|
338
342
|
#define NK_TARGET_SVE 1
|
|
339
343
|
#else
|
|
@@ -343,7 +347,7 @@
|
|
|
343
347
|
#endif // !defined(NK_TARGET_SVE) || ...
|
|
344
348
|
|
|
345
349
|
// Compiling for Arm: NK_TARGET_SVESDOT
|
|
346
|
-
#if !defined(NK_TARGET_SVESDOT) || (NK_TARGET_SVESDOT && !
|
|
350
|
+
#if !defined(NK_TARGET_SVESDOT) || (NK_TARGET_SVESDOT && !NK_TARGET_ARM64_)
|
|
347
351
|
#if defined(__ARM_FEATURE_SVE)
|
|
348
352
|
#define NK_TARGET_SVESDOT 1
|
|
349
353
|
#else
|
|
@@ -353,7 +357,7 @@
|
|
|
353
357
|
#endif // !defined(NK_TARGET_SVESDOT) || ...
|
|
354
358
|
|
|
355
359
|
// Compiling for Arm: NK_TARGET_SVEHALF
|
|
356
|
-
#if !defined(NK_TARGET_SVEHALF) || (NK_TARGET_SVEHALF && !
|
|
360
|
+
#if !defined(NK_TARGET_SVEHALF) || (NK_TARGET_SVEHALF && !NK_TARGET_ARM64_)
|
|
357
361
|
#if defined(__ARM_FEATURE_SVE)
|
|
358
362
|
#define NK_TARGET_SVEHALF 1
|
|
359
363
|
#else
|
|
@@ -363,7 +367,7 @@
|
|
|
363
367
|
#endif // !defined(NK_TARGET_SVEHALF) || ...
|
|
364
368
|
|
|
365
369
|
// Compiling for Arm: NK_TARGET_SVEBFDOT
|
|
366
|
-
#if !defined(NK_TARGET_SVEBFDOT) || (NK_TARGET_SVEBFDOT && !
|
|
370
|
+
#if !defined(NK_TARGET_SVEBFDOT) || (NK_TARGET_SVEBFDOT && !NK_TARGET_ARM64_)
|
|
367
371
|
#if defined(__ARM_FEATURE_SVE)
|
|
368
372
|
#define NK_TARGET_SVEBFDOT 1
|
|
369
373
|
#else
|
|
@@ -373,7 +377,7 @@
|
|
|
373
377
|
#endif // !defined(NK_TARGET_SVEBFDOT) || ...
|
|
374
378
|
|
|
375
379
|
// Compiling for Arm: NK_TARGET_SVE2
|
|
376
|
-
#if !defined(NK_TARGET_SVE2) || (NK_TARGET_SVE2 && !
|
|
380
|
+
#if !defined(NK_TARGET_SVE2) || (NK_TARGET_SVE2 && !NK_TARGET_ARM64_)
|
|
377
381
|
#if defined(__ARM_FEATURE_SVE2)
|
|
378
382
|
#define NK_TARGET_SVE2 1
|
|
379
383
|
#else
|
|
@@ -383,13 +387,13 @@
|
|
|
383
387
|
#endif // !defined(NK_TARGET_SVE2) || ...
|
|
384
388
|
|
|
385
389
|
// Compiling for Arm: NK_TARGET_SVE2P1
|
|
386
|
-
#if !defined(NK_TARGET_SVE2P1) || (NK_TARGET_SVE2P1 && !
|
|
390
|
+
#if !defined(NK_TARGET_SVE2P1) || (NK_TARGET_SVE2P1 && !NK_TARGET_ARM64_)
|
|
387
391
|
#undef NK_TARGET_SVE2P1
|
|
388
392
|
#define NK_TARGET_SVE2P1 0
|
|
389
393
|
#endif // !defined(NK_TARGET_SVE2P1) || ...
|
|
390
394
|
|
|
391
395
|
// Compiling for Arm: NK_TARGET_SME (Scalable Matrix Extension)
|
|
392
|
-
#if !defined(NK_TARGET_SME) || (NK_TARGET_SME && !
|
|
396
|
+
#if !defined(NK_TARGET_SME) || (NK_TARGET_SME && !NK_TARGET_ARM64_)
|
|
393
397
|
#if defined(__ARM_FEATURE_SME)
|
|
394
398
|
#define NK_TARGET_SME 1
|
|
395
399
|
#else
|
|
@@ -398,7 +402,7 @@
|
|
|
398
402
|
#endif // defined(__ARM_FEATURE_SME)
|
|
399
403
|
#endif // !defined(NK_TARGET_SME) || ...
|
|
400
404
|
|
|
401
|
-
#if !defined(NK_TARGET_SME2) || (NK_TARGET_SME2 && !
|
|
405
|
+
#if !defined(NK_TARGET_SME2) || (NK_TARGET_SME2 && !NK_TARGET_ARM64_)
|
|
402
406
|
#if defined(__ARM_FEATURE_SME2)
|
|
403
407
|
#define NK_TARGET_SME2 1
|
|
404
408
|
#else
|
|
@@ -409,7 +413,7 @@
|
|
|
409
413
|
|
|
410
414
|
// Compiling for Arm: NK_TARGET_SME2P1 (FEAT_SME2p1)
|
|
411
415
|
// ACLE macro: __ARM_FEATURE_SME2p1 (note lowercase 'p')
|
|
412
|
-
#if !defined(NK_TARGET_SME2P1) || (NK_TARGET_SME2P1 && !
|
|
416
|
+
#if !defined(NK_TARGET_SME2P1) || (NK_TARGET_SME2P1 && !NK_TARGET_ARM64_)
|
|
413
417
|
#if defined(__ARM_FEATURE_SME2p1)
|
|
414
418
|
#define NK_TARGET_SME2P1 1
|
|
415
419
|
#else
|
|
@@ -420,7 +424,7 @@
|
|
|
420
424
|
|
|
421
425
|
// AppleClang 17 exposes SME sub-features through `arm_sme.h` builtin aliases,
|
|
422
426
|
// not dedicated `__ARM_FEATURE_*` predefines for every matrix subtype.
|
|
423
|
-
#if !defined(NK_TARGET_SMEF64) || (NK_TARGET_SMEF64 && !
|
|
427
|
+
#if !defined(NK_TARGET_SMEF64) || (NK_TARGET_SMEF64 && !NK_TARGET_ARM64_)
|
|
424
428
|
#if defined(__ARM_FEATURE_SME_F64F64) || (defined(__has_builtin) && __has_builtin(__builtin_sme_svmopa_za64_f64_m))
|
|
425
429
|
#define NK_TARGET_SMEF64 1
|
|
426
430
|
#else
|
|
@@ -429,7 +433,7 @@
|
|
|
429
433
|
#endif // defined(__ARM_FEATURE_SME_F64F64) || ...
|
|
430
434
|
#endif // !defined(NK_TARGET_SMEF64) || ...
|
|
431
435
|
|
|
432
|
-
#if !defined(NK_TARGET_SMEBI32) || (NK_TARGET_SMEBI32 && !
|
|
436
|
+
#if !defined(NK_TARGET_SMEBI32) || (NK_TARGET_SMEBI32 && !NK_TARGET_ARM64_)
|
|
433
437
|
#if defined(__has_builtin) && __has_builtin(__builtin_sme_svbmopa_za32_u32_m)
|
|
434
438
|
#define NK_TARGET_SMEBI32 1
|
|
435
439
|
#else
|
|
@@ -438,7 +442,7 @@
|
|
|
438
442
|
#endif // defined(__has_builtin) && __has_builtin(__builtin_sme_svbmopa_za32_u32_m)
|
|
439
443
|
#endif // !defined(NK_TARGET_SMEBI32) || ...
|
|
440
444
|
|
|
441
|
-
#if !defined(NK_TARGET_SMEHALF) || (NK_TARGET_SMEHALF && !
|
|
445
|
+
#if !defined(NK_TARGET_SMEHALF) || (NK_TARGET_SMEHALF && !NK_TARGET_ARM64_)
|
|
442
446
|
#if defined(__ARM_FEATURE_SME_F16F16) || (defined(__has_builtin) && __has_builtin(__builtin_sme_svmopa_za32_f16_m))
|
|
443
447
|
#define NK_TARGET_SMEHALF 1
|
|
444
448
|
#else
|
|
@@ -447,7 +451,7 @@
|
|
|
447
451
|
#endif // defined(__has_builtin) && __has_builtin(__builtin_sme_svmopa_za32_f16_m)
|
|
448
452
|
#endif // !defined(NK_TARGET_SMEHALF) || ...
|
|
449
453
|
|
|
450
|
-
#if !defined(NK_TARGET_SMEBF16) || (NK_TARGET_SMEBF16 && !
|
|
454
|
+
#if !defined(NK_TARGET_SMEBF16) || (NK_TARGET_SMEBF16 && !NK_TARGET_ARM64_)
|
|
451
455
|
#if defined(__has_builtin) && __has_builtin(__builtin_sme_svmopa_za32_bf16_m)
|
|
452
456
|
#define NK_TARGET_SMEBF16 1
|
|
453
457
|
#else
|
|
@@ -456,7 +460,7 @@
|
|
|
456
460
|
#endif // defined(__has_builtin) && __has_builtin(__builtin_sme_svmopa_za32_bf16_m)
|
|
457
461
|
#endif // !defined(NK_TARGET_SMEBF16) || ...
|
|
458
462
|
|
|
459
|
-
#if !defined(NK_TARGET_SMELUT2) || (NK_TARGET_SMELUT2 && !
|
|
463
|
+
#if !defined(NK_TARGET_SMELUT2) || (NK_TARGET_SMELUT2 && !NK_TARGET_ARM64_)
|
|
460
464
|
#if defined(__has_builtin) && __has_builtin(__builtin_sme_svluti2_lane_zt_u8)
|
|
461
465
|
#define NK_TARGET_SMELUT2 1
|
|
462
466
|
#else
|
|
@@ -466,7 +470,7 @@
|
|
|
466
470
|
#endif // !defined(NK_TARGET_SMELUT2) || ...
|
|
467
471
|
|
|
468
472
|
// Compiling for Arm: NK_TARGET_SMEFA64 (FEAT_SME_FA64, full SVE2 in streaming mode)
|
|
469
|
-
#if !defined(NK_TARGET_SMEFA64) || (NK_TARGET_SMEFA64 && !
|
|
473
|
+
#if !defined(NK_TARGET_SMEFA64) || (NK_TARGET_SMEFA64 && !NK_TARGET_ARM64_)
|
|
470
474
|
#if defined(__ARM_FEATURE_SME_FA64)
|
|
471
475
|
#define NK_TARGET_SMEFA64 1
|
|
472
476
|
#else
|
|
@@ -491,7 +495,7 @@
|
|
|
491
495
|
// - _MSC_VER >= 1900 (VS 2015+): AVX2/FMA/F16C (Haswell)
|
|
492
496
|
// - _MSC_VER >= 1920 (VS 2019+): AVX-512 base (Skylake, Icelake), AVX-VNNI (Alder)
|
|
493
497
|
// - _MSC_VER >= 1944 (VS 2022 17.14+): BF16, FP16, VP2INTERSECT, VNNI-INT8 (Sierra), AMX
|
|
494
|
-
#if !defined(NK_TARGET_HASWELL) || (NK_TARGET_HASWELL && !
|
|
498
|
+
#if !defined(NK_TARGET_HASWELL) || (NK_TARGET_HASWELL && !NK_TARGET_X8664_)
|
|
495
499
|
#if (defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)) || (defined(_MSC_VER) && _MSC_VER >= 1900)
|
|
496
500
|
#define NK_TARGET_HASWELL 1
|
|
497
501
|
#else
|
|
@@ -507,7 +511,7 @@
|
|
|
507
511
|
// gcc-12 -march=sapphirerapids -dM -E - < /dev/null | egrep "SSE|AVX" | sort
|
|
508
512
|
// On Arm machines you may want to check for other flags:
|
|
509
513
|
// gcc-12 -march=native -dM -E - < /dev/null | egrep "NEON|SVE|FP16|FMA" | sort
|
|
510
|
-
#if !defined(NK_TARGET_SKYLAKE) || (NK_TARGET_SKYLAKE && !
|
|
514
|
+
#if !defined(NK_TARGET_SKYLAKE) || (NK_TARGET_SKYLAKE && !NK_TARGET_X8664_)
|
|
511
515
|
#if (defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && \
|
|
512
516
|
defined(__AVX512BW__)) || \
|
|
513
517
|
(defined(_MSC_VER) && _MSC_VER >= 1920)
|
|
@@ -518,7 +522,7 @@
|
|
|
518
522
|
#endif
|
|
519
523
|
#endif // !defined(NK_TARGET_SKYLAKE) || ...
|
|
520
524
|
|
|
521
|
-
#if !defined(NK_TARGET_ICELAKE) || (NK_TARGET_ICELAKE && !
|
|
525
|
+
#if !defined(NK_TARGET_ICELAKE) || (NK_TARGET_ICELAKE && !NK_TARGET_X8664_)
|
|
522
526
|
#if (defined(__AVX512VNNI__) && defined(__AVX512IFMA__) && defined(__AVX512BITALG__) && defined(__AVX512VBMI__) && \
|
|
523
527
|
defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__)) || \
|
|
524
528
|
(defined(_MSC_VER) && _MSC_VER >= 1920)
|
|
@@ -529,7 +533,7 @@
|
|
|
529
533
|
#endif
|
|
530
534
|
#endif // !defined(NK_TARGET_ICELAKE) || ...
|
|
531
535
|
|
|
532
|
-
#if !defined(NK_TARGET_GENOA) || (NK_TARGET_GENOA && !
|
|
536
|
+
#if !defined(NK_TARGET_GENOA) || (NK_TARGET_GENOA && !NK_TARGET_X8664_)
|
|
533
537
|
#if defined(__AVX512BF16__) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
534
538
|
#define NK_TARGET_GENOA 1
|
|
535
539
|
#else
|
|
@@ -542,7 +546,7 @@
|
|
|
542
546
|
// GCC 14+: defines __AVX10_2__ with -mavx10.2-512
|
|
543
547
|
// Clang 19+: defines __AVX10_2__ with -mavx10.2-512
|
|
544
548
|
// MSVC: defines __AVX10_VER__ >= 2 with /arch:AVX10.2 (VS 2026+, not yet released)
|
|
545
|
-
#if !defined(NK_TARGET_DIAMOND) || (NK_TARGET_DIAMOND && !
|
|
549
|
+
#if !defined(NK_TARGET_DIAMOND) || (NK_TARGET_DIAMOND && !NK_TARGET_X8664_)
|
|
546
550
|
#if defined(__AVX10_2__) || (defined(__AVX10_VER__) && __AVX10_VER__ >= 2)
|
|
547
551
|
#define NK_TARGET_DIAMOND 1
|
|
548
552
|
#else
|
|
@@ -551,7 +555,7 @@
|
|
|
551
555
|
#endif // defined(__AVX10_2__) || ...
|
|
552
556
|
#endif // !defined(NK_TARGET_DIAMOND) || ...
|
|
553
557
|
|
|
554
|
-
#if !defined(NK_TARGET_SAPPHIRE) || (NK_TARGET_SAPPHIRE && !
|
|
558
|
+
#if !defined(NK_TARGET_SAPPHIRE) || (NK_TARGET_SAPPHIRE && !NK_TARGET_X8664_)
|
|
555
559
|
#if defined(__AVX512FP16__) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
556
560
|
#define NK_TARGET_SAPPHIRE 1
|
|
557
561
|
#else
|
|
@@ -560,7 +564,7 @@
|
|
|
560
564
|
#endif
|
|
561
565
|
#endif // !defined(NK_TARGET_SAPPHIRE) || ...
|
|
562
566
|
|
|
563
|
-
#if !defined(NK_TARGET_SAPPHIREAMX) || (NK_TARGET_SAPPHIREAMX && !
|
|
567
|
+
#if !defined(NK_TARGET_SAPPHIREAMX) || (NK_TARGET_SAPPHIREAMX && !NK_TARGET_X8664_)
|
|
564
568
|
#if (defined(__AMX_TILE__) && defined(__AMX_BF16__) && defined(__AMX_INT8__)) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
565
569
|
#define NK_TARGET_SAPPHIREAMX 1
|
|
566
570
|
#else
|
|
@@ -569,7 +573,7 @@
|
|
|
569
573
|
#endif
|
|
570
574
|
#endif // !defined(NK_TARGET_SAPPHIREAMX) || ...
|
|
571
575
|
|
|
572
|
-
#if !defined(NK_TARGET_GRANITEAMX) || (NK_TARGET_GRANITEAMX && !
|
|
576
|
+
#if !defined(NK_TARGET_GRANITEAMX) || (NK_TARGET_GRANITEAMX && !NK_TARGET_X8664_)
|
|
573
577
|
#if (defined(__AMX_TILE__) && defined(__AMX_FP16__)) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
574
578
|
#define NK_TARGET_GRANITEAMX 1
|
|
575
579
|
#else
|
|
@@ -578,7 +582,7 @@
|
|
|
578
582
|
#endif
|
|
579
583
|
#endif // !defined(NK_TARGET_GRANITEAMX) || ...
|
|
580
584
|
|
|
581
|
-
#if !defined(NK_TARGET_TURIN) || (NK_TARGET_TURIN && !
|
|
585
|
+
#if !defined(NK_TARGET_TURIN) || (NK_TARGET_TURIN && !NK_TARGET_X8664_)
|
|
582
586
|
#if defined(__AVX512VP2INTERSECT__) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
583
587
|
#define NK_TARGET_TURIN 1
|
|
584
588
|
#else
|
|
@@ -587,7 +591,7 @@
|
|
|
587
591
|
#endif
|
|
588
592
|
#endif // !defined(NK_TARGET_TURIN) || ...
|
|
589
593
|
|
|
590
|
-
#if !defined(NK_TARGET_ALDER) || (NK_TARGET_ALDER && !
|
|
594
|
+
#if !defined(NK_TARGET_ALDER) || (NK_TARGET_ALDER && !NK_TARGET_X8664_)
|
|
591
595
|
#if defined(__AVXVNNI__) || (defined(_MSC_VER) && _MSC_VER >= 1920)
|
|
592
596
|
#define NK_TARGET_ALDER 1
|
|
593
597
|
#else
|
|
@@ -596,7 +600,7 @@
|
|
|
596
600
|
#endif
|
|
597
601
|
#endif // !defined(NK_TARGET_ALDER) || ...
|
|
598
602
|
|
|
599
|
-
#if !defined(NK_TARGET_SIERRA) || (NK_TARGET_SIERRA && !
|
|
603
|
+
#if !defined(NK_TARGET_SIERRA) || (NK_TARGET_SIERRA && !NK_TARGET_X8664_)
|
|
600
604
|
#if defined(__AVXVNNIINT8__) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
601
605
|
#define NK_TARGET_SIERRA 1
|
|
602
606
|
#else
|
|
@@ -671,7 +675,7 @@
|
|
|
671
675
|
* NK_STREAMING_ marks functions that require streaming SVE mode (e.g. FCVTLT).
|
|
672
676
|
* NK_STREAMING_COMPATIBLE_ marks helpers callable from both streaming and non-streaming mode.
|
|
673
677
|
*/
|
|
674
|
-
#if
|
|
678
|
+
#if NK_TARGET_ARM64_ && NK_TARGET_SME
|
|
675
679
|
#define NK_STREAMING_ __arm_streaming
|
|
676
680
|
#define NK_STREAMING_COMPATIBLE_ __arm_streaming_compatible
|
|
677
681
|
#else
|
|
@@ -684,7 +688,7 @@
|
|
|
684
688
|
* MSVC typedefs `__m512bh`, `__m512h`, `__m256bh` as aliases for `__m512i`/`__m256i`,
|
|
685
689
|
* but rejects C-style casts between them. GCC/Clang define them as distinct types.
|
|
686
690
|
*/
|
|
687
|
-
#if
|
|
691
|
+
#if NK_TARGET_X8664_
|
|
688
692
|
#if defined(_MSC_VER)
|
|
689
693
|
#define nk_m512bh_from_m512i_(x) (x)
|
|
690
694
|
#define nk_m512h_from_m512i_(x) (x)
|
|
@@ -804,7 +808,7 @@ typedef unsigned int nk_u32_t;
|
|
|
804
808
|
/* On LP64 targets (Linux ARM64, RISC-V 64), `long` and `long long` are both 64-bit but distinct types.
|
|
805
809
|
* NEON/RVV intrinsics on Linux expect `long*`, while Apple's NEON intrinsics expect `long long*`.
|
|
806
810
|
* Windows uses LLP64 where `long` is 32-bit, so it must use `long long` for 64-bit types. */
|
|
807
|
-
#if ((
|
|
811
|
+
#if ((NK_TARGET_ARM64_ && !defined(NK_DEFINED_APPLE_)) || NK_TARGET_RISCV64_) && !defined(NK_DEFINED_WINDOWS_)
|
|
808
812
|
/** @brief Signed 64-bit integer. Range: [−2⁶³, +2⁶³−1]. */
|
|
809
813
|
typedef signed long nk_i64_t;
|
|
810
814
|
/** @brief Unsigned 64-bit integer. Range: [0, 2⁶⁴−1]. */
|
|
@@ -821,7 +825,7 @@ typedef float nk_f32_t;
|
|
|
821
825
|
/** @brief Double-precision (64-bit) IEEE 754 float. sign(1) + exponent(11) + mantissa(52), bias=1023. */
|
|
822
826
|
typedef double nk_f64_t;
|
|
823
827
|
|
|
824
|
-
#if
|
|
828
|
+
#if NK_TARGET_X8664_ || NK_TARGET_ARM64_ || NK_TARGET_RISCV64_ || NK_TARGET_POWER64_ || NK_TARGET_LOONGARCH64_
|
|
825
829
|
#define NK_IS_64BIT_ 1
|
|
826
830
|
#else
|
|
827
831
|
#define NK_IS_64BIT_ 0
|
|
@@ -1088,7 +1092,7 @@ typedef unsigned short nk_bf16_t;
|
|
|
1088
1092
|
* Some of those are defined as aliases, so we use `#define` preprocessor
|
|
1089
1093
|
* directives instead of `typedef` to avoid errors.
|
|
1090
1094
|
*/
|
|
1091
|
-
#if
|
|
1095
|
+
#if NK_TARGET_ARM64_
|
|
1092
1096
|
#if defined(_MSC_VER)
|
|
1093
1097
|
#define nk_f16_for_arm_simd_t nk_f16_t
|
|
1094
1098
|
#define nk_bf16_for_arm_simd_t nk_bf16_t
|
|
@@ -1102,7 +1106,7 @@ typedef unsigned short nk_bf16_t;
|
|
|
1102
1106
|
* RISC-V Vector (RVV) intrinsics use `_Float16` for half-precision floats.
|
|
1103
1107
|
* This is the standard C23 type, also available in GCC/Clang with RVV extensions.
|
|
1104
1108
|
*/
|
|
1105
|
-
#if
|
|
1109
|
+
#if NK_TARGET_RISCV64_
|
|
1106
1110
|
#define nk_f16_for_rvv_intrinsics_t _Float16
|
|
1107
1111
|
#endif
|
|
1108
1112
|
|
|
@@ -1237,6 +1241,8 @@ typedef union NK_MAY_ALIAS_ nk_b128_vec_t {
|
|
|
1237
1241
|
int32x4_t i32x4;
|
|
1238
1242
|
int64x2_t i64x2;
|
|
1239
1243
|
float32x4_t f32x4;
|
|
1244
|
+
#endif
|
|
1245
|
+
#if NK_TARGET_NEON && NK_TARGET_ARM64_ // double-precision NEON requires AArch64
|
|
1240
1246
|
float64x2_t f64x2;
|
|
1241
1247
|
#endif
|
|
1242
1248
|
#if NK_TARGET_NEONHALF
|
|
@@ -1294,6 +1300,8 @@ typedef union NK_MAY_ALIAS_ nk_b256_vec_t {
|
|
|
1294
1300
|
int32x4_t i32x4s[2];
|
|
1295
1301
|
int64x2_t i64x2s[2];
|
|
1296
1302
|
float32x4_t f32x4s[2];
|
|
1303
|
+
#endif
|
|
1304
|
+
#if NK_TARGET_NEON && NK_TARGET_ARM64_ // double-precision NEON requires AArch64
|
|
1297
1305
|
float64x2_t f64x2s[2];
|
|
1298
1306
|
#endif
|
|
1299
1307
|
#if NK_TARGET_POWERVSX
|
|
@@ -1588,7 +1596,7 @@ NK_INTERNAL int nk_bf16_is_nan_(nk_bf16_t x) {
|
|
|
1588
1596
|
* SMSTART SM / SMSTOP SM so the calling function's ABI is unchanged.
|
|
1589
1597
|
* Inside `__arm_locally_streaming` functions the plain `svcntXX()` intrinsics are fine.
|
|
1590
1598
|
*/
|
|
1591
|
-
#if
|
|
1599
|
+
#if NK_TARGET_ARM64_ && NK_TARGET_SME
|
|
1592
1600
|
/** @brief Streaming SVL byte-element count (SVL/8) via SMSTART SM bracket. */
|
|
1593
1601
|
NK_INTERNAL nk_size_t nk_sme_cntb_(void) {
|
|
1594
1602
|
nk_u64_t r;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "numkong",
|
|
3
|
-
"version": "7.4.2",
|
|
3
|
+
"version": "7.4.3",
|
|
4
4
|
"description": "Portable mixed-precision math, linear-algebra, & retrieval library with 2000+ SIMD kernels for x86, Arm, RISC-V, LoongArch, Power, & WebAssembly",
|
|
5
5
|
"homepage": "https://github.com/ashvardanian/NumKong",
|
|
6
6
|
"author": "Ash Vardanian",
|
|
@@ -98,11 +98,11 @@
|
|
|
98
98
|
"printWidth": 120
|
|
99
99
|
},
|
|
100
100
|
"optionalDependencies": {
|
|
101
|
-
"@numkong/darwin-arm64": "7.4.2",
|
|
102
|
-
"@numkong/darwin-x64": "7.4.2",
|
|
103
|
-
"@numkong/linux-arm64": "7.4.2",
|
|
104
|
-
"@numkong/linux-x64": "7.4.2",
|
|
105
|
-
"@numkong/win32-arm64": "7.4.2",
|
|
106
|
-
"@numkong/win32-x64": "7.4.2"
|
|
101
|
+
"@numkong/darwin-arm64": "7.4.3",
|
|
102
|
+
"@numkong/darwin-x64": "7.4.3",
|
|
103
|
+
"@numkong/linux-arm64": "7.4.3",
|
|
104
|
+
"@numkong/linux-x64": "7.4.3",
|
|
105
|
+
"@numkong/win32-arm64": "7.4.3",
|
|
106
|
+
"@numkong/win32-x64": "7.4.3"
|
|
107
107
|
}
|
|
108
108
|
}
|