numkong 7.4.2 → 7.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +100 -100
- package/binding.gyp +3 -0
- package/c/numkong.c +1 -1
- package/include/numkong/attention/sapphireamx.h +2 -2
- package/include/numkong/attention/sme.h +2 -2
- package/include/numkong/capabilities.h +47 -47
- package/include/numkong/cast/diamond.h +2 -2
- package/include/numkong/cast/haswell.h +2 -2
- package/include/numkong/cast/icelake.h +2 -2
- package/include/numkong/cast/loongsonasx.h +2 -2
- package/include/numkong/cast/neon.h +2 -2
- package/include/numkong/cast/powervsx.h +2 -2
- package/include/numkong/cast/rvv.h +2 -2
- package/include/numkong/cast/sapphire.h +2 -2
- package/include/numkong/cast/skylake.h +2 -2
- package/include/numkong/curved/genoa.h +2 -2
- package/include/numkong/curved/haswell.h +2 -2
- package/include/numkong/curved/neon.h +2 -2
- package/include/numkong/curved/neonbfdot.h +2 -2
- package/include/numkong/curved/rvv.h +2 -2
- package/include/numkong/curved/skylake.h +2 -2
- package/include/numkong/curved/smef64.h +2 -2
- package/include/numkong/dot/alder.h +2 -2
- package/include/numkong/dot/diamond.h +2 -2
- package/include/numkong/dot/genoa.h +2 -2
- package/include/numkong/dot/haswell.h +2 -2
- package/include/numkong/dot/icelake.h +2 -2
- package/include/numkong/dot/loongsonasx.h +2 -2
- package/include/numkong/dot/neon.h +2 -2
- package/include/numkong/dot/neonbfdot.h +2 -2
- package/include/numkong/dot/neonfhm.h +2 -2
- package/include/numkong/dot/neonfp8.h +2 -2
- package/include/numkong/dot/neonsdot.h +2 -2
- package/include/numkong/dot/rvv.h +2 -2
- package/include/numkong/dot/rvvbb.h +2 -2
- package/include/numkong/dot/rvvbf16.h +2 -2
- package/include/numkong/dot/rvvhalf.h +2 -2
- package/include/numkong/dot/sapphire.h +2 -2
- package/include/numkong/dot/sierra.h +2 -2
- package/include/numkong/dot/skylake.h +2 -2
- package/include/numkong/dot/sve.h +2 -2
- package/include/numkong/dot/svebfdot.h +2 -2
- package/include/numkong/dot/svehalf.h +2 -2
- package/include/numkong/dot/svesdot.h +2 -2
- package/include/numkong/dots/alder.h +2 -2
- package/include/numkong/dots/diamond.h +2 -2
- package/include/numkong/dots/genoa.h +2 -2
- package/include/numkong/dots/haswell.h +2 -2
- package/include/numkong/dots/icelake.h +2 -2
- package/include/numkong/dots/loongsonasx.h +2 -2
- package/include/numkong/dots/neon.h +2 -2
- package/include/numkong/dots/neonbfdot.h +2 -2
- package/include/numkong/dots/neonfhm.h +2 -2
- package/include/numkong/dots/neonfp8.h +2 -2
- package/include/numkong/dots/neonsdot.h +2 -2
- package/include/numkong/dots/powervsx.h +2 -2
- package/include/numkong/dots/rvv.h +2 -2
- package/include/numkong/dots/sapphireamx.h +2 -2
- package/include/numkong/dots/sierra.h +2 -2
- package/include/numkong/dots/skylake.h +2 -2
- package/include/numkong/dots/sme.h +10 -10
- package/include/numkong/dots/smebi32.h +2 -2
- package/include/numkong/dots/smef64.h +2 -2
- package/include/numkong/dots/smehalf.h +2 -2
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +2 -2
- package/include/numkong/each/neon.h +2 -2
- package/include/numkong/each/neonbfdot.h +2 -2
- package/include/numkong/each/neonhalf.h +2 -2
- package/include/numkong/each/rvv.h +2 -2
- package/include/numkong/each/sapphire.h +2 -2
- package/include/numkong/each/skylake.h +2 -2
- package/include/numkong/geospatial/haswell.h +2 -2
- package/include/numkong/geospatial/neon.h +2 -2
- package/include/numkong/geospatial/rvv.h +2 -2
- package/include/numkong/geospatial/skylake.h +2 -2
- package/include/numkong/maxsim/alder.h +2 -2
- package/include/numkong/maxsim/genoa.h +2 -2
- package/include/numkong/maxsim/haswell.h +2 -2
- package/include/numkong/maxsim/icelake.h +2 -2
- package/include/numkong/maxsim/neonsdot.h +2 -2
- package/include/numkong/maxsim/sapphireamx.h +2 -2
- package/include/numkong/maxsim/sme.h +2 -2
- package/include/numkong/mesh/haswell.h +2 -2
- package/include/numkong/mesh/neon.h +2 -2
- package/include/numkong/mesh/neonbfdot.h +2 -2
- package/include/numkong/mesh/rvv.h +2 -2
- package/include/numkong/mesh/skylake.h +2 -2
- package/include/numkong/numkong.h +1 -1
- package/include/numkong/probability/haswell.h +2 -2
- package/include/numkong/probability/neon.h +2 -2
- package/include/numkong/probability/rvv.h +2 -2
- package/include/numkong/probability/skylake.h +2 -2
- package/include/numkong/reduce/alder.h +2 -2
- package/include/numkong/reduce/genoa.h +2 -2
- package/include/numkong/reduce/haswell.h +2 -2
- package/include/numkong/reduce/icelake.h +2 -2
- package/include/numkong/reduce/neon.h +2 -2
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +2 -2
- package/include/numkong/reduce/neonsdot.h +2 -2
- package/include/numkong/reduce/rvv.h +2 -2
- package/include/numkong/reduce/sierra.h +2 -2
- package/include/numkong/reduce/skylake.h +2 -2
- package/include/numkong/scalar/haswell.h +2 -2
- package/include/numkong/scalar/loongsonasx.h +2 -2
- package/include/numkong/scalar/neon.h +2 -2
- package/include/numkong/scalar/neonhalf.h +2 -2
- package/include/numkong/scalar/powervsx.h +2 -2
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +2 -2
- package/include/numkong/set/haswell.h +2 -2
- package/include/numkong/set/icelake.h +2 -2
- package/include/numkong/set/loongsonasx.h +2 -2
- package/include/numkong/set/neon.h +2 -2
- package/include/numkong/set/powervsx.h +2 -2
- package/include/numkong/set/rvv.h +2 -2
- package/include/numkong/set/rvvbb.h +2 -2
- package/include/numkong/set/sve.h +2 -2
- package/include/numkong/sets/haswell.h +2 -2
- package/include/numkong/sets/icelake.h +2 -2
- package/include/numkong/sets/loongsonasx.h +2 -2
- package/include/numkong/sets/neon.h +2 -2
- package/include/numkong/sets/powervsx.h +2 -2
- package/include/numkong/sets/smebi32.h +2 -2
- package/include/numkong/sparse/icelake.h +2 -2
- package/include/numkong/sparse/neon.h +2 -2
- package/include/numkong/sparse/sve2.h +2 -2
- package/include/numkong/sparse/turin.h +2 -2
- package/include/numkong/spatial/alder.h +2 -2
- package/include/numkong/spatial/diamond.h +2 -2
- package/include/numkong/spatial/genoa.h +2 -2
- package/include/numkong/spatial/haswell.h +2 -2
- package/include/numkong/spatial/icelake.h +2 -2
- package/include/numkong/spatial/loongsonasx.h +2 -2
- package/include/numkong/spatial/neon.h +2 -2
- package/include/numkong/spatial/neonbfdot.h +2 -2
- package/include/numkong/spatial/neonfp8.h +2 -2
- package/include/numkong/spatial/neonsdot.h +2 -2
- package/include/numkong/spatial/powervsx.h +2 -2
- package/include/numkong/spatial/rvv.h +2 -2
- package/include/numkong/spatial/rvvbf16.h +2 -2
- package/include/numkong/spatial/rvvhalf.h +2 -2
- package/include/numkong/spatial/sierra.h +2 -2
- package/include/numkong/spatial/skylake.h +2 -2
- package/include/numkong/spatial/sve.h +2 -2
- package/include/numkong/spatial/svebfdot.h +2 -2
- package/include/numkong/spatial/svehalf.h +2 -2
- package/include/numkong/spatial/svesdot.h +2 -2
- package/include/numkong/spatials/alder.h +2 -2
- package/include/numkong/spatials/diamond.h +2 -2
- package/include/numkong/spatials/genoa.h +2 -2
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/icelake.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +2 -2
- package/include/numkong/spatials/neon.h +2 -2
- package/include/numkong/spatials/neonbfdot.h +2 -2
- package/include/numkong/spatials/neonfhm.h +2 -2
- package/include/numkong/spatials/neonfp8.h +2 -2
- package/include/numkong/spatials/neonsdot.h +2 -2
- package/include/numkong/spatials/powervsx.h +2 -2
- package/include/numkong/spatials/rvv.h +2 -2
- package/include/numkong/spatials/sapphireamx.h +2 -2
- package/include/numkong/spatials/sierra.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +2 -2
- package/include/numkong/spatials/smef64.h +2 -2
- package/include/numkong/trigonometry/haswell.h +2 -2
- package/include/numkong/trigonometry/neon.h +2 -2
- package/include/numkong/trigonometry/rvv.h +2 -2
- package/include/numkong/trigonometry/skylake.h +2 -2
- package/include/numkong/types.h +103 -89
- package/numkong.gypi +3 -0
- package/package.json +7 -7
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
#ifndef NK_SPATIALS_SIERRA_H
|
|
10
10
|
#define NK_SPATIALS_SIERRA_H
|
|
11
11
|
|
|
12
|
-
#if
|
|
12
|
+
#if NK_TARGET_X8664_
|
|
13
13
|
#if NK_TARGET_SIERRA
|
|
14
14
|
|
|
15
15
|
#include "numkong/spatial/haswell.h"
|
|
@@ -92,5 +92,5 @@ nk_define_cross_normalized_symmetric_(euclidean, e2m3, sierra, e2m3, f32, /*norm
|
|
|
92
92
|
#endif
|
|
93
93
|
|
|
94
94
|
#endif // NK_TARGET_SIERRA
|
|
95
|
-
#endif //
|
|
95
|
+
#endif // NK_TARGET_X8664_
|
|
96
96
|
#endif // NK_SPATIALS_SIERRA_H
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
#ifndef NK_SPATIALS_SKYLAKE_H
|
|
10
10
|
#define NK_SPATIALS_SKYLAKE_H
|
|
11
11
|
|
|
12
|
-
#if
|
|
12
|
+
#if NK_TARGET_X8664_
|
|
13
13
|
#if NK_TARGET_SKYLAKE
|
|
14
14
|
|
|
15
15
|
#include "numkong/spatial/skylake.h"
|
|
@@ -180,5 +180,5 @@ nk_define_cross_normalized_symmetric_(euclidean, e3m2, skylake, e3m2, f32, /*nor
|
|
|
180
180
|
#endif
|
|
181
181
|
|
|
182
182
|
#endif // NK_TARGET_SKYLAKE
|
|
183
|
-
#endif //
|
|
183
|
+
#endif // NK_TARGET_X8664_
|
|
184
184
|
#endif // NK_SPATIALS_SKYLAKE_H
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
#ifndef NK_SPATIALS_SME_H
|
|
10
10
|
#define NK_SPATIALS_SME_H
|
|
11
11
|
|
|
12
|
-
#if
|
|
12
|
+
#if NK_TARGET_ARM64_
|
|
13
13
|
#if NK_TARGET_SME
|
|
14
14
|
|
|
15
15
|
#include "numkong/dots/serial.h"
|
|
@@ -1882,5 +1882,5 @@ NK_PUBLIC void nk_euclideans_symmetric_u4_sme(
|
|
|
1882
1882
|
#endif
|
|
1883
1883
|
|
|
1884
1884
|
#endif // NK_TARGET_SME
|
|
1885
|
-
#endif //
|
|
1885
|
+
#endif // NK_TARGET_ARM64_
|
|
1886
1886
|
#endif // NK_SPATIALS_SME_H
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
#ifndef NK_SPATIALS_SMEF64_H
|
|
10
10
|
#define NK_SPATIALS_SMEF64_H
|
|
11
11
|
|
|
12
|
-
#if
|
|
12
|
+
#if NK_TARGET_ARM64_
|
|
13
13
|
#if NK_TARGET_SME
|
|
14
14
|
|
|
15
15
|
#include "numkong/dots/serial.h"
|
|
@@ -470,5 +470,5 @@ NK_PUBLIC void nk_euclideans_symmetric_f64_smef64(
|
|
|
470
470
|
#endif
|
|
471
471
|
|
|
472
472
|
#endif // NK_TARGET_SME
|
|
473
|
-
#endif //
|
|
473
|
+
#endif // NK_TARGET_ARM64_
|
|
474
474
|
#endif // NK_SPATIALS_SMEF64_H
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
#ifndef NK_TRIGONOMETRY_HASWELL_H
|
|
24
24
|
#define NK_TRIGONOMETRY_HASWELL_H
|
|
25
25
|
|
|
26
|
-
#if
|
|
26
|
+
#if NK_TARGET_X8664_
|
|
27
27
|
#if NK_TARGET_HASWELL
|
|
28
28
|
|
|
29
29
|
#include "numkong/types.h"
|
|
@@ -649,5 +649,5 @@ NK_PUBLIC void nk_each_atan_f64_haswell(nk_f64_t const *ins, nk_size_t n, nk_f64
|
|
|
649
649
|
#endif
|
|
650
650
|
|
|
651
651
|
#endif // NK_TARGET_HASWELL
|
|
652
|
-
#endif //
|
|
652
|
+
#endif // NK_TARGET_X8664_
|
|
653
653
|
#endif // NK_TRIGONOMETRY_HASWELL_H
|
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
#ifndef NK_TRIGONOMETRY_NEON_H
|
|
33
33
|
#define NK_TRIGONOMETRY_NEON_H
|
|
34
34
|
|
|
35
|
-
#if
|
|
35
|
+
#if NK_TARGET_ARM64_
|
|
36
36
|
#if NK_TARGET_NEON
|
|
37
37
|
|
|
38
38
|
#include "numkong/types.h"
|
|
@@ -634,5 +634,5 @@ NK_PUBLIC void nk_each_atan_f64_neon(nk_f64_t const *ins, nk_size_t n, nk_f64_t
|
|
|
634
634
|
#endif
|
|
635
635
|
|
|
636
636
|
#endif // NK_TARGET_NEON
|
|
637
|
-
#endif //
|
|
637
|
+
#endif // NK_TARGET_ARM64_
|
|
638
638
|
#endif // NK_TRIGONOMETRY_NEON_H
|
|
@@ -37,7 +37,7 @@
|
|
|
37
37
|
#ifndef NK_TRIGONOMETRY_RVV_H
|
|
38
38
|
#define NK_TRIGONOMETRY_RVV_H
|
|
39
39
|
|
|
40
|
-
#if
|
|
40
|
+
#if NK_TARGET_RISCV64_
|
|
41
41
|
#if NK_TARGET_RVV
|
|
42
42
|
|
|
43
43
|
#include "numkong/types.h"
|
|
@@ -696,5 +696,5 @@ NK_PUBLIC void nk_each_atan_f16_rvv(nk_f16_t const *ins, nk_size_t n, nk_f16_t *
|
|
|
696
696
|
#endif
|
|
697
697
|
|
|
698
698
|
#endif // NK_TARGET_RVV
|
|
699
|
-
#endif //
|
|
699
|
+
#endif // NK_TARGET_RISCV64_
|
|
700
700
|
#endif // NK_TRIGONOMETRY_RVV_H
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
#ifndef NK_TRIGONOMETRY_SKYLAKE_H
|
|
24
24
|
#define NK_TRIGONOMETRY_SKYLAKE_H
|
|
25
25
|
|
|
26
|
-
#if
|
|
26
|
+
#if NK_TARGET_X8664_
|
|
27
27
|
#if NK_TARGET_SKYLAKE
|
|
28
28
|
|
|
29
29
|
#include "numkong/types.h"
|
|
@@ -721,5 +721,5 @@ NK_PUBLIC void nk_each_atan_f16_skylake(nk_f16_t const *ins, nk_size_t n, nk_f16
|
|
|
721
721
|
#endif
|
|
722
722
|
|
|
723
723
|
#endif // NK_TARGET_SKYLAKE
|
|
724
|
-
#endif //
|
|
724
|
+
#endif // NK_TARGET_X8664_
|
|
725
725
|
#endif // NK_TRIGONOMETRY_SKYLAKE_H
|
package/include/numkong/types.h
CHANGED
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* Defines:
|
|
8
8
|
*
|
|
9
9
|
* - Sized aliases for numeric types, like: `nk_i32_t` and `nk_f64_t`.
|
|
10
|
-
* - Macros for internal compiler/hardware checks, like: `
|
|
10
|
+
* - Macros for internal compiler/hardware checks, like: `NK_TARGET_ARM64_`.
|
|
11
11
|
* - Macros for feature controls, like: `NK_TARGET_NEON`
|
|
12
12
|
*
|
|
13
13
|
* @section fp8_types FP8 Numeric Types
|
|
@@ -119,6 +119,12 @@
|
|
|
119
119
|
#define NK_MAY_ALIAS_
|
|
120
120
|
#endif
|
|
121
121
|
|
|
122
|
+
#if defined(__has_builtin)
|
|
123
|
+
#define nk_has_builtin_(x) __has_builtin(x)
|
|
124
|
+
#else
|
|
125
|
+
#define nk_has_builtin_(x) 0
|
|
126
|
+
#endif
|
|
127
|
+
|
|
122
128
|
// Allow SIMD kernels to redirect small inputs to serial implementations.
|
|
123
129
|
// Enabled by default for production use. Tests and benchmarks may disable
|
|
124
130
|
// this to isolate SIMD path behavior on small inputs.
|
|
@@ -126,52 +132,52 @@
|
|
|
126
132
|
#define NK_ALLOW_ISA_REDIRECT 1
|
|
127
133
|
#endif
|
|
128
134
|
|
|
129
|
-
// Compiling for Arm:
|
|
135
|
+
// Compiling for 64-bit Arm: NK_TARGET_ARM64_
|
|
130
136
|
// https://arm-software.github.io/acle/main/acle.html
|
|
131
|
-
#if !defined(
|
|
137
|
+
#if !defined(NK_TARGET_ARM64_)
|
|
132
138
|
#if defined(__aarch64__) || defined(_M_ARM64)
|
|
133
|
-
#define
|
|
139
|
+
#define NK_TARGET_ARM64_ 1
|
|
134
140
|
#else
|
|
135
|
-
#define
|
|
141
|
+
#define NK_TARGET_ARM64_ 0
|
|
136
142
|
#endif // defined(__aarch64__) || defined(_M_ARM64)
|
|
137
|
-
#endif // !defined(
|
|
143
|
+
#endif // !defined(NK_TARGET_ARM64_)
|
|
138
144
|
|
|
139
|
-
// Compiling for x86:
|
|
145
|
+
// Compiling for x86: NK_TARGET_X8664_
|
|
140
146
|
// https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/developer-guide-reference/2024-2/additional-predefined-macros.html
|
|
141
|
-
#if !defined(
|
|
147
|
+
#if !defined(NK_TARGET_X8664_)
|
|
142
148
|
#if defined(__x86_64__) || defined(_M_X64)
|
|
143
|
-
#define
|
|
149
|
+
#define NK_TARGET_X8664_ 1
|
|
144
150
|
#else
|
|
145
|
-
#define
|
|
151
|
+
#define NK_TARGET_X8664_ 0
|
|
146
152
|
#endif // defined(__x86_64__) || defined(_M_X64)
|
|
147
|
-
#endif // !defined(
|
|
153
|
+
#endif // !defined(NK_TARGET_X8664_)
|
|
148
154
|
|
|
149
|
-
// Compiling for RISC-V:
|
|
150
|
-
#if !defined(
|
|
155
|
+
// Compiling for RISC-V: NK_TARGET_RISCV64_
|
|
156
|
+
#if !defined(NK_TARGET_RISCV64_)
|
|
151
157
|
#if defined(__riscv) && (__riscv_xlen == 64)
|
|
152
|
-
#define
|
|
158
|
+
#define NK_TARGET_RISCV64_ 1
|
|
153
159
|
#else
|
|
154
|
-
#define
|
|
160
|
+
#define NK_TARGET_RISCV64_ 0
|
|
155
161
|
#endif // defined(__riscv) && (__riscv_xlen == 64)
|
|
156
|
-
#endif // !defined(
|
|
162
|
+
#endif // !defined(NK_TARGET_RISCV64_)
|
|
157
163
|
|
|
158
|
-
// Compiling for LoongArch:
|
|
159
|
-
#if !defined(
|
|
164
|
+
// Compiling for LoongArch: NK_TARGET_LOONGARCH64_
|
|
165
|
+
#if !defined(NK_TARGET_LOONGARCH64_)
|
|
160
166
|
#if defined(__loongarch__)
|
|
161
|
-
#define
|
|
167
|
+
#define NK_TARGET_LOONGARCH64_ 1
|
|
162
168
|
#else
|
|
163
|
-
#define
|
|
169
|
+
#define NK_TARGET_LOONGARCH64_ 0
|
|
164
170
|
#endif // defined(__loongarch__)
|
|
165
|
-
#endif // !defined(
|
|
171
|
+
#endif // !defined(NK_TARGET_LOONGARCH64_)
|
|
166
172
|
|
|
167
|
-
// Compiling for Power:
|
|
168
|
-
#if !defined(
|
|
173
|
+
// Compiling for Power: NK_TARGET_POWER64_
|
|
174
|
+
#if !defined(NK_TARGET_POWER64_)
|
|
169
175
|
#if defined(__powerpc64__) || defined(__ppc64__) || defined(_ARCH_PPC64)
|
|
170
|
-
#define
|
|
176
|
+
#define NK_TARGET_POWER64_ 1
|
|
171
177
|
#else
|
|
172
|
-
#define
|
|
178
|
+
#define NK_TARGET_POWER64_ 0
|
|
173
179
|
#endif // defined(__powerpc64__) || defined(__ppc64__) || defined(_ARCH_PPC64)
|
|
174
|
-
#endif // !defined(
|
|
180
|
+
#endif // !defined(NK_TARGET_POWER64_)
|
|
175
181
|
|
|
176
182
|
// Compiling for WASM: NK_TARGET_WASM_
|
|
177
183
|
#if !defined(NK_TARGET_WASM_)
|
|
@@ -203,7 +209,7 @@
|
|
|
203
209
|
#endif // !defined(NK_TARGET_V128RELAXED) || ...
|
|
204
210
|
|
|
205
211
|
// Compiling for RISC-V Vector: NK_TARGET_RVV
|
|
206
|
-
#if !defined(NK_TARGET_RVV) || (NK_TARGET_RVV && !
|
|
212
|
+
#if !defined(NK_TARGET_RVV) || (NK_TARGET_RVV && !NK_TARGET_RISCV64_)
|
|
207
213
|
#if defined(__riscv_v) && (__riscv_v >= 1000000)
|
|
208
214
|
#define NK_TARGET_RVV 1
|
|
209
215
|
#else
|
|
@@ -248,7 +254,7 @@
|
|
|
248
254
|
// Compiling for LoongArch LASX (256-bit SIMD): NK_TARGET_LOONGSONASX
|
|
249
255
|
// LASX provides 32 × 256-bit vector registers, widening integer multiply-accumulate,
|
|
250
256
|
// and f32-to-f64 conversion (xvfcvtl_d_s / xvfcvth_d_s) but no widening FMA.
|
|
251
|
-
#if !defined(NK_TARGET_LOONGSONASX) || (NK_TARGET_LOONGSONASX && !
|
|
257
|
+
#if !defined(NK_TARGET_LOONGSONASX) || (NK_TARGET_LOONGSONASX && !NK_TARGET_LOONGARCH64_)
|
|
252
258
|
#if defined(__loongarch_asx)
|
|
253
259
|
#define NK_TARGET_LOONGSONASX 1
|
|
254
260
|
#else
|
|
@@ -261,7 +267,7 @@
|
|
|
261
267
|
// VSX provides 64 × 128-bit registers, FMA (vec_madd), vec_msum (multiply-sum), hardware f16
|
|
262
268
|
// conversion (vec_extract_fp32_from_shorth/l), length-limited loads (vec_xl_len), per-byte
|
|
263
269
|
// popcount (vec_popcnt), and vec_cmpne. Requires POWER9 (ISA 3.0) or newer.
|
|
264
|
-
#if !defined(NK_TARGET_POWERVSX) || (NK_TARGET_POWERVSX && !
|
|
270
|
+
#if !defined(NK_TARGET_POWERVSX) || (NK_TARGET_POWERVSX && !NK_TARGET_POWER64_)
|
|
265
271
|
#if defined(__VSX__) && defined(__POWER9_VECTOR__)
|
|
266
272
|
#define NK_TARGET_POWERVSX 1
|
|
267
273
|
#else
|
|
@@ -270,19 +276,20 @@
|
|
|
270
276
|
#endif // defined(__VSX__)
|
|
271
277
|
#endif // !defined(NK_TARGET_POWERVSX) || ...
|
|
272
278
|
|
|
273
|
-
// Compiling for Arm: NK_TARGET_NEON
|
|
274
|
-
#if !defined(NK_TARGET_NEON) || (NK_TARGET_NEON && !
|
|
275
|
-
#if defined(__ARM_NEON) || (defined(_MSC_VER) && defined(_M_ARM64))
|
|
279
|
+
// Compiling for Arm: NK_TARGET_NEON (AArch64 only, AArch32 NEON is not supported)
|
|
280
|
+
#if !defined(NK_TARGET_NEON) || (NK_TARGET_NEON && !NK_TARGET_ARM64_)
|
|
281
|
+
#if (defined(__ARM_NEON) && defined(__aarch64__)) || (defined(_MSC_VER) && defined(_M_ARM64))
|
|
276
282
|
#define NK_TARGET_NEON 1
|
|
277
283
|
#else
|
|
278
284
|
#undef NK_TARGET_NEON
|
|
279
285
|
#define NK_TARGET_NEON 0
|
|
280
|
-
#endif // defined(__ARM_NEON) || ...
|
|
286
|
+
#endif // (defined(__ARM_NEON) && defined(__aarch64__)) || ...
|
|
281
287
|
#endif // !defined(NK_TARGET_NEON) || ...
|
|
282
288
|
|
|
283
|
-
// Compiling for Arm: NK_TARGET_NEONSDOT (FEAT_DotProd,
|
|
284
|
-
#if !defined(NK_TARGET_NEONSDOT) || (NK_TARGET_NEONSDOT && !
|
|
285
|
-
#if defined(__ARM_FEATURE_DOTPROD)
|
|
289
|
+
// Compiling for Arm: NK_TARGET_NEONSDOT (FEAT_DotProd, AArch64 only)
|
|
290
|
+
#if !defined(NK_TARGET_NEONSDOT) || (NK_TARGET_NEONSDOT && !NK_TARGET_ARM64_)
|
|
291
|
+
#if (defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)) || \
|
|
292
|
+
(defined(_MSC_VER) && defined(_M_ARM64) && __ARM_ARCH >= 804)
|
|
286
293
|
#define NK_TARGET_NEONSDOT 1
|
|
287
294
|
#else
|
|
288
295
|
#undef NK_TARGET_NEONSDOT
|
|
@@ -290,9 +297,10 @@
|
|
|
290
297
|
#endif
|
|
291
298
|
#endif // !defined(NK_TARGET_NEONSDOT) || ...
|
|
292
299
|
|
|
293
|
-
// Compiling for Arm: NK_TARGET_NEONHALF (FEAT_FP16,
|
|
294
|
-
#if !defined(NK_TARGET_NEONHALF) || (NK_TARGET_NEONHALF && !
|
|
295
|
-
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
|
300
|
+
// Compiling for Arm: NK_TARGET_NEONHALF (FEAT_FP16, AArch64 only)
|
|
301
|
+
#if !defined(NK_TARGET_NEONHALF) || (NK_TARGET_NEONHALF && !NK_TARGET_ARM64_)
|
|
302
|
+
#if (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__)) || \
|
|
303
|
+
(defined(_MSC_VER) && defined(_M_ARM64) && __ARM_ARCH >= 802)
|
|
296
304
|
#define NK_TARGET_NEONHALF 1
|
|
297
305
|
#else
|
|
298
306
|
#undef NK_TARGET_NEONHALF
|
|
@@ -300,9 +308,10 @@
|
|
|
300
308
|
#endif
|
|
301
309
|
#endif // !defined(NK_TARGET_NEONHALF) || ...
|
|
302
310
|
|
|
303
|
-
// Compiling for Arm: NK_TARGET_NEONFHM (FEAT_FHM,
|
|
304
|
-
#if !defined(NK_TARGET_NEONFHM) || (NK_TARGET_NEONFHM && !
|
|
305
|
-
#if defined(__ARM_FEATURE_FP16_FML)
|
|
311
|
+
// Compiling for Arm: NK_TARGET_NEONFHM (FEAT_FHM, AArch64 only)
|
|
312
|
+
#if !defined(NK_TARGET_NEONFHM) || (NK_TARGET_NEONFHM && !NK_TARGET_ARM64_)
|
|
313
|
+
#if (defined(__ARM_FEATURE_FP16_FML) && defined(__aarch64__)) || \
|
|
314
|
+
(defined(_MSC_VER) && defined(_M_ARM64) && __ARM_ARCH >= 804)
|
|
306
315
|
#define NK_TARGET_NEONFHM 1
|
|
307
316
|
#else
|
|
308
317
|
#undef NK_TARGET_NEONFHM
|
|
@@ -310,9 +319,10 @@
|
|
|
310
319
|
#endif
|
|
311
320
|
#endif // !defined(NK_TARGET_NEONFHM) || ...
|
|
312
321
|
|
|
313
|
-
// Compiling for Arm: NK_TARGET_NEONBFDOT (FEAT_BF16,
|
|
314
|
-
#if !defined(NK_TARGET_NEONBFDOT) || (NK_TARGET_NEONBFDOT && !
|
|
315
|
-
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)
|
|
322
|
+
// Compiling for Arm: NK_TARGET_NEONBFDOT (FEAT_BF16, AArch64 only)
|
|
323
|
+
#if !defined(NK_TARGET_NEONBFDOT) || (NK_TARGET_NEONBFDOT && !NK_TARGET_ARM64_)
|
|
324
|
+
#if (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && defined(__aarch64__)) || \
|
|
325
|
+
(defined(_MSC_VER) && defined(_M_ARM64) && __ARM_ARCH >= 806)
|
|
316
326
|
#define NK_TARGET_NEONBFDOT 1
|
|
317
327
|
#else
|
|
318
328
|
#undef NK_TARGET_NEONBFDOT
|
|
@@ -323,8 +333,8 @@
|
|
|
323
333
|
// Compiling for Arm: NK_TARGET_NEONFP8 (NEON FP8 extensions, FEAT_FP8DOT4)
|
|
324
334
|
// ACLE macro __ARM_FEATURE_FP8DOT4 defined by GCC 15+ and Clang 21+ when +fp8dot4 is enabled.
|
|
325
335
|
// Older compilers lack mfloat8x16_t and the fp8dot4 target attribute entirely.
|
|
326
|
-
#if !defined(NK_TARGET_NEONFP8) || (NK_TARGET_NEONFP8 && !
|
|
327
|
-
#if defined(__ARM_FEATURE_FP8DOT4)
|
|
336
|
+
#if !defined(NK_TARGET_NEONFP8) || (NK_TARGET_NEONFP8 && !NK_TARGET_ARM64_)
|
|
337
|
+
#if defined(__ARM_FEATURE_FP8DOT4) && defined(__aarch64__)
|
|
328
338
|
#define NK_TARGET_NEONFP8 1
|
|
329
339
|
#else
|
|
330
340
|
#undef NK_TARGET_NEONFP8
|
|
@@ -333,7 +343,7 @@
|
|
|
333
343
|
#endif // !defined(NK_TARGET_NEONFP8) || ...
|
|
334
344
|
|
|
335
345
|
// Compiling for Arm: NK_TARGET_SVE
|
|
336
|
-
#if !defined(NK_TARGET_SVE) || (NK_TARGET_SVE && !
|
|
346
|
+
#if !defined(NK_TARGET_SVE) || (NK_TARGET_SVE && !NK_TARGET_ARM64_)
|
|
337
347
|
#if defined(__ARM_FEATURE_SVE)
|
|
338
348
|
#define NK_TARGET_SVE 1
|
|
339
349
|
#else
|
|
@@ -343,7 +353,7 @@
|
|
|
343
353
|
#endif // !defined(NK_TARGET_SVE) || ...
|
|
344
354
|
|
|
345
355
|
// Compiling for Arm: NK_TARGET_SVESDOT
|
|
346
|
-
#if !defined(NK_TARGET_SVESDOT) || (NK_TARGET_SVESDOT && !
|
|
356
|
+
#if !defined(NK_TARGET_SVESDOT) || (NK_TARGET_SVESDOT && !NK_TARGET_ARM64_)
|
|
347
357
|
#if defined(__ARM_FEATURE_SVE)
|
|
348
358
|
#define NK_TARGET_SVESDOT 1
|
|
349
359
|
#else
|
|
@@ -353,7 +363,7 @@
|
|
|
353
363
|
#endif // !defined(NK_TARGET_SVESDOT) || ...
|
|
354
364
|
|
|
355
365
|
// Compiling for Arm: NK_TARGET_SVEHALF
|
|
356
|
-
#if !defined(NK_TARGET_SVEHALF) || (NK_TARGET_SVEHALF && !
|
|
366
|
+
#if !defined(NK_TARGET_SVEHALF) || (NK_TARGET_SVEHALF && !NK_TARGET_ARM64_)
|
|
357
367
|
#if defined(__ARM_FEATURE_SVE)
|
|
358
368
|
#define NK_TARGET_SVEHALF 1
|
|
359
369
|
#else
|
|
@@ -363,7 +373,7 @@
|
|
|
363
373
|
#endif // !defined(NK_TARGET_SVEHALF) || ...
|
|
364
374
|
|
|
365
375
|
// Compiling for Arm: NK_TARGET_SVEBFDOT
|
|
366
|
-
#if !defined(NK_TARGET_SVEBFDOT) || (NK_TARGET_SVEBFDOT && !
|
|
376
|
+
#if !defined(NK_TARGET_SVEBFDOT) || (NK_TARGET_SVEBFDOT && !NK_TARGET_ARM64_)
|
|
367
377
|
#if defined(__ARM_FEATURE_SVE)
|
|
368
378
|
#define NK_TARGET_SVEBFDOT 1
|
|
369
379
|
#else
|
|
@@ -373,7 +383,7 @@
|
|
|
373
383
|
#endif // !defined(NK_TARGET_SVEBFDOT) || ...
|
|
374
384
|
|
|
375
385
|
// Compiling for Arm: NK_TARGET_SVE2
|
|
376
|
-
#if !defined(NK_TARGET_SVE2) || (NK_TARGET_SVE2 && !
|
|
386
|
+
#if !defined(NK_TARGET_SVE2) || (NK_TARGET_SVE2 && !NK_TARGET_ARM64_)
|
|
377
387
|
#if defined(__ARM_FEATURE_SVE2)
|
|
378
388
|
#define NK_TARGET_SVE2 1
|
|
379
389
|
#else
|
|
@@ -383,13 +393,13 @@
|
|
|
383
393
|
#endif // !defined(NK_TARGET_SVE2) || ...
|
|
384
394
|
|
|
385
395
|
// Compiling for Arm: NK_TARGET_SVE2P1
|
|
386
|
-
#if !defined(NK_TARGET_SVE2P1) || (NK_TARGET_SVE2P1 && !
|
|
396
|
+
#if !defined(NK_TARGET_SVE2P1) || (NK_TARGET_SVE2P1 && !NK_TARGET_ARM64_)
|
|
387
397
|
#undef NK_TARGET_SVE2P1
|
|
388
398
|
#define NK_TARGET_SVE2P1 0
|
|
389
399
|
#endif // !defined(NK_TARGET_SVE2P1) || ...
|
|
390
400
|
|
|
391
401
|
// Compiling for Arm: NK_TARGET_SME (Scalable Matrix Extension)
|
|
392
|
-
#if !defined(NK_TARGET_SME) || (NK_TARGET_SME && !
|
|
402
|
+
#if !defined(NK_TARGET_SME) || (NK_TARGET_SME && !NK_TARGET_ARM64_)
|
|
393
403
|
#if defined(__ARM_FEATURE_SME)
|
|
394
404
|
#define NK_TARGET_SME 1
|
|
395
405
|
#else
|
|
@@ -398,7 +408,7 @@
|
|
|
398
408
|
#endif // defined(__ARM_FEATURE_SME)
|
|
399
409
|
#endif // !defined(NK_TARGET_SME) || ...
|
|
400
410
|
|
|
401
|
-
#if !defined(NK_TARGET_SME2) || (NK_TARGET_SME2 && !
|
|
411
|
+
#if !defined(NK_TARGET_SME2) || (NK_TARGET_SME2 && !NK_TARGET_ARM64_)
|
|
402
412
|
#if defined(__ARM_FEATURE_SME2)
|
|
403
413
|
#define NK_TARGET_SME2 1
|
|
404
414
|
#else
|
|
@@ -409,7 +419,7 @@
|
|
|
409
419
|
|
|
410
420
|
// Compiling for Arm: NK_TARGET_SME2P1 (FEAT_SME2p1)
|
|
411
421
|
// ACLE macro: __ARM_FEATURE_SME2p1 (note lowercase 'p')
|
|
412
|
-
#if !defined(NK_TARGET_SME2P1) || (NK_TARGET_SME2P1 && !
|
|
422
|
+
#if !defined(NK_TARGET_SME2P1) || (NK_TARGET_SME2P1 && !NK_TARGET_ARM64_)
|
|
413
423
|
#if defined(__ARM_FEATURE_SME2p1)
|
|
414
424
|
#define NK_TARGET_SME2P1 1
|
|
415
425
|
#else
|
|
@@ -420,8 +430,8 @@
|
|
|
420
430
|
|
|
421
431
|
// AppleClang 17 exposes SME sub-features through `arm_sme.h` builtin aliases,
|
|
422
432
|
// not dedicated `__ARM_FEATURE_*` predefines for every matrix subtype.
|
|
423
|
-
#if !defined(NK_TARGET_SMEF64) || (NK_TARGET_SMEF64 && !
|
|
424
|
-
#if defined(__ARM_FEATURE_SME_F64F64) || (
|
|
433
|
+
#if !defined(NK_TARGET_SMEF64) || (NK_TARGET_SMEF64 && !NK_TARGET_ARM64_)
|
|
434
|
+
#if defined(__ARM_FEATURE_SME_F64F64) || nk_has_builtin_(__builtin_sme_svmopa_za64_f64_m)
|
|
425
435
|
#define NK_TARGET_SMEF64 1
|
|
426
436
|
#else
|
|
427
437
|
#undef NK_TARGET_SMEF64
|
|
@@ -429,44 +439,44 @@
|
|
|
429
439
|
#endif // defined(__ARM_FEATURE_SME_F64F64) || ...
|
|
430
440
|
#endif // !defined(NK_TARGET_SMEF64) || ...
|
|
431
441
|
|
|
432
|
-
#if !defined(NK_TARGET_SMEBI32) || (NK_TARGET_SMEBI32 && !
|
|
433
|
-
#if
|
|
442
|
+
#if !defined(NK_TARGET_SMEBI32) || (NK_TARGET_SMEBI32 && !NK_TARGET_ARM64_)
|
|
443
|
+
#if nk_has_builtin_(__builtin_sme_svbmopa_za32_u32_m)
|
|
434
444
|
#define NK_TARGET_SMEBI32 1
|
|
435
445
|
#else
|
|
436
446
|
#undef NK_TARGET_SMEBI32
|
|
437
447
|
#define NK_TARGET_SMEBI32 0
|
|
438
|
-
#endif //
|
|
448
|
+
#endif // nk_has_builtin_(__builtin_sme_svbmopa_za32_u32_m)
|
|
439
449
|
#endif // !defined(NK_TARGET_SMEBI32) || ...
|
|
440
450
|
|
|
441
|
-
#if !defined(NK_TARGET_SMEHALF) || (NK_TARGET_SMEHALF && !
|
|
442
|
-
#if defined(__ARM_FEATURE_SME_F16F16) || (
|
|
451
|
+
#if !defined(NK_TARGET_SMEHALF) || (NK_TARGET_SMEHALF && !NK_TARGET_ARM64_)
|
|
452
|
+
#if defined(__ARM_FEATURE_SME_F16F16) || nk_has_builtin_(__builtin_sme_svmopa_za32_f16_m)
|
|
443
453
|
#define NK_TARGET_SMEHALF 1
|
|
444
454
|
#else
|
|
445
455
|
#undef NK_TARGET_SMEHALF
|
|
446
456
|
#define NK_TARGET_SMEHALF 0
|
|
447
|
-
#endif //
|
|
457
|
+
#endif // nk_has_builtin_(__builtin_sme_svmopa_za32_f16_m)
|
|
448
458
|
#endif // !defined(NK_TARGET_SMEHALF) || ...
|
|
449
459
|
|
|
450
|
-
#if !defined(NK_TARGET_SMEBF16) || (NK_TARGET_SMEBF16 && !
|
|
451
|
-
#if
|
|
460
|
+
#if !defined(NK_TARGET_SMEBF16) || (NK_TARGET_SMEBF16 && !NK_TARGET_ARM64_)
|
|
461
|
+
#if nk_has_builtin_(__builtin_sme_svmopa_za32_bf16_m)
|
|
452
462
|
#define NK_TARGET_SMEBF16 1
|
|
453
463
|
#else
|
|
454
464
|
#undef NK_TARGET_SMEBF16
|
|
455
465
|
#define NK_TARGET_SMEBF16 0
|
|
456
|
-
#endif //
|
|
466
|
+
#endif // nk_has_builtin_(__builtin_sme_svmopa_za32_bf16_m)
|
|
457
467
|
#endif // !defined(NK_TARGET_SMEBF16) || ...
|
|
458
468
|
|
|
459
|
-
#if !defined(NK_TARGET_SMELUT2) || (NK_TARGET_SMELUT2 && !
|
|
460
|
-
#if
|
|
469
|
+
#if !defined(NK_TARGET_SMELUT2) || (NK_TARGET_SMELUT2 && !NK_TARGET_ARM64_)
|
|
470
|
+
#if nk_has_builtin_(__builtin_sme_svluti2_lane_zt_u8)
|
|
461
471
|
#define NK_TARGET_SMELUT2 1
|
|
462
472
|
#else
|
|
463
473
|
#undef NK_TARGET_SMELUT2
|
|
464
474
|
#define NK_TARGET_SMELUT2 0
|
|
465
|
-
#endif //
|
|
475
|
+
#endif // nk_has_builtin_(__builtin_sme_svluti2_lane_zt_u8)
|
|
466
476
|
#endif // !defined(NK_TARGET_SMELUT2) || ...
|
|
467
477
|
|
|
468
478
|
// Compiling for Arm: NK_TARGET_SMEFA64 (FEAT_SME_FA64, full SVE2 in streaming mode)
|
|
469
|
-
#if !defined(NK_TARGET_SMEFA64) || (NK_TARGET_SMEFA64 && !
|
|
479
|
+
#if !defined(NK_TARGET_SMEFA64) || (NK_TARGET_SMEFA64 && !NK_TARGET_ARM64_)
|
|
470
480
|
#if defined(__ARM_FEATURE_SME_FA64)
|
|
471
481
|
#define NK_TARGET_SMEFA64 1
|
|
472
482
|
#else
|
|
@@ -491,7 +501,7 @@
|
|
|
491
501
|
// - _MSC_VER >= 1900 (VS 2015+): AVX2/FMA/F16C (Haswell)
|
|
492
502
|
// - _MSC_VER >= 1920 (VS 2019+): AVX-512 base (Skylake, Icelake), AVX-VNNI (Alder)
|
|
493
503
|
// - _MSC_VER >= 1944 (VS 2022 17.14+): BF16, FP16, VP2INTERSECT, VNNI-INT8 (Sierra), AMX
|
|
494
|
-
#if !defined(NK_TARGET_HASWELL) || (NK_TARGET_HASWELL && !
|
|
504
|
+
#if !defined(NK_TARGET_HASWELL) || (NK_TARGET_HASWELL && !NK_TARGET_X8664_)
|
|
495
505
|
#if (defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)) || (defined(_MSC_VER) && _MSC_VER >= 1900)
|
|
496
506
|
#define NK_TARGET_HASWELL 1
|
|
497
507
|
#else
|
|
@@ -507,7 +517,7 @@
|
|
|
507
517
|
// gcc-12 -march=sapphirerapids -dM -E - < /dev/null | egrep "SSE|AVX" | sort
|
|
508
518
|
// On Arm machines you may want to check for other flags:
|
|
509
519
|
// gcc-12 -march=native -dM -E - < /dev/null | egrep "NEON|SVE|FP16|FMA" | sort
|
|
510
|
-
#if !defined(NK_TARGET_SKYLAKE) || (NK_TARGET_SKYLAKE && !
|
|
520
|
+
#if !defined(NK_TARGET_SKYLAKE) || (NK_TARGET_SKYLAKE && !NK_TARGET_X8664_)
|
|
511
521
|
#if (defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && \
|
|
512
522
|
defined(__AVX512BW__)) || \
|
|
513
523
|
(defined(_MSC_VER) && _MSC_VER >= 1920)
|
|
@@ -518,7 +528,7 @@
|
|
|
518
528
|
#endif
|
|
519
529
|
#endif // !defined(NK_TARGET_SKYLAKE) || ...
|
|
520
530
|
|
|
521
|
-
#if !defined(NK_TARGET_ICELAKE) || (NK_TARGET_ICELAKE && !
|
|
531
|
+
#if !defined(NK_TARGET_ICELAKE) || (NK_TARGET_ICELAKE && !NK_TARGET_X8664_)
|
|
522
532
|
#if (defined(__AVX512VNNI__) && defined(__AVX512IFMA__) && defined(__AVX512BITALG__) && defined(__AVX512VBMI__) && \
|
|
523
533
|
defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__)) || \
|
|
524
534
|
(defined(_MSC_VER) && _MSC_VER >= 1920)
|
|
@@ -529,7 +539,7 @@
|
|
|
529
539
|
#endif
|
|
530
540
|
#endif // !defined(NK_TARGET_ICELAKE) || ...
|
|
531
541
|
|
|
532
|
-
#if !defined(NK_TARGET_GENOA) || (NK_TARGET_GENOA && !
|
|
542
|
+
#if !defined(NK_TARGET_GENOA) || (NK_TARGET_GENOA && !NK_TARGET_X8664_)
|
|
533
543
|
#if defined(__AVX512BF16__) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
534
544
|
#define NK_TARGET_GENOA 1
|
|
535
545
|
#else
|
|
@@ -542,7 +552,7 @@
|
|
|
542
552
|
// GCC 14+: defines __AVX10_2__ with -mavx10.2-512
|
|
543
553
|
// Clang 19+: defines __AVX10_2__ with -mavx10.2-512
|
|
544
554
|
// MSVC: defines __AVX10_VER__ >= 2 with /arch:AVX10.2 (VS 2026+, not yet released)
|
|
545
|
-
#if !defined(NK_TARGET_DIAMOND) || (NK_TARGET_DIAMOND && !
|
|
555
|
+
#if !defined(NK_TARGET_DIAMOND) || (NK_TARGET_DIAMOND && !NK_TARGET_X8664_)
|
|
546
556
|
#if defined(__AVX10_2__) || (defined(__AVX10_VER__) && __AVX10_VER__ >= 2)
|
|
547
557
|
#define NK_TARGET_DIAMOND 1
|
|
548
558
|
#else
|
|
@@ -551,7 +561,7 @@
|
|
|
551
561
|
#endif // defined(__AVX10_2__) || ...
|
|
552
562
|
#endif // !defined(NK_TARGET_DIAMOND) || ...
|
|
553
563
|
|
|
554
|
-
#if !defined(NK_TARGET_SAPPHIRE) || (NK_TARGET_SAPPHIRE && !
|
|
564
|
+
#if !defined(NK_TARGET_SAPPHIRE) || (NK_TARGET_SAPPHIRE && !NK_TARGET_X8664_)
|
|
555
565
|
#if defined(__AVX512FP16__) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
556
566
|
#define NK_TARGET_SAPPHIRE 1
|
|
557
567
|
#else
|
|
@@ -560,7 +570,7 @@
|
|
|
560
570
|
#endif
|
|
561
571
|
#endif // !defined(NK_TARGET_SAPPHIRE) || ...
|
|
562
572
|
|
|
563
|
-
#if !defined(NK_TARGET_SAPPHIREAMX) || (NK_TARGET_SAPPHIREAMX && !
|
|
573
|
+
#if !defined(NK_TARGET_SAPPHIREAMX) || (NK_TARGET_SAPPHIREAMX && !NK_TARGET_X8664_)
|
|
564
574
|
#if (defined(__AMX_TILE__) && defined(__AMX_BF16__) && defined(__AMX_INT8__)) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
565
575
|
#define NK_TARGET_SAPPHIREAMX 1
|
|
566
576
|
#else
|
|
@@ -569,7 +579,7 @@
|
|
|
569
579
|
#endif
|
|
570
580
|
#endif // !defined(NK_TARGET_SAPPHIREAMX) || ...
|
|
571
581
|
|
|
572
|
-
#if !defined(NK_TARGET_GRANITEAMX) || (NK_TARGET_GRANITEAMX && !
|
|
582
|
+
#if !defined(NK_TARGET_GRANITEAMX) || (NK_TARGET_GRANITEAMX && !NK_TARGET_X8664_)
|
|
573
583
|
#if (defined(__AMX_TILE__) && defined(__AMX_FP16__)) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
574
584
|
#define NK_TARGET_GRANITEAMX 1
|
|
575
585
|
#else
|
|
@@ -578,7 +588,7 @@
|
|
|
578
588
|
#endif
|
|
579
589
|
#endif // !defined(NK_TARGET_GRANITEAMX) || ...
|
|
580
590
|
|
|
581
|
-
#if !defined(NK_TARGET_TURIN) || (NK_TARGET_TURIN && !
|
|
591
|
+
#if !defined(NK_TARGET_TURIN) || (NK_TARGET_TURIN && !NK_TARGET_X8664_)
|
|
582
592
|
#if defined(__AVX512VP2INTERSECT__) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
583
593
|
#define NK_TARGET_TURIN 1
|
|
584
594
|
#else
|
|
@@ -587,7 +597,7 @@
|
|
|
587
597
|
#endif
|
|
588
598
|
#endif // !defined(NK_TARGET_TURIN) || ...
|
|
589
599
|
|
|
590
|
-
#if !defined(NK_TARGET_ALDER) || (NK_TARGET_ALDER && !
|
|
600
|
+
#if !defined(NK_TARGET_ALDER) || (NK_TARGET_ALDER && !NK_TARGET_X8664_)
|
|
591
601
|
#if defined(__AVXVNNI__) || (defined(_MSC_VER) && _MSC_VER >= 1920)
|
|
592
602
|
#define NK_TARGET_ALDER 1
|
|
593
603
|
#else
|
|
@@ -596,7 +606,7 @@
|
|
|
596
606
|
#endif
|
|
597
607
|
#endif // !defined(NK_TARGET_ALDER) || ...
|
|
598
608
|
|
|
599
|
-
#if !defined(NK_TARGET_SIERRA) || (NK_TARGET_SIERRA && !
|
|
609
|
+
#if !defined(NK_TARGET_SIERRA) || (NK_TARGET_SIERRA && !NK_TARGET_X8664_)
|
|
600
610
|
#if defined(__AVXVNNIINT8__) || (defined(_MSC_VER) && _MSC_VER >= 1944)
|
|
601
611
|
#define NK_TARGET_SIERRA 1
|
|
602
612
|
#else
|
|
@@ -671,7 +681,7 @@
|
|
|
671
681
|
* NK_STREAMING_ marks functions that require streaming SVE mode (e.g. FCVTLT).
|
|
672
682
|
* NK_STREAMING_COMPATIBLE_ marks helpers callable from both streaming and non-streaming mode.
|
|
673
683
|
*/
|
|
674
|
-
#if
|
|
684
|
+
#if NK_TARGET_ARM64_ && NK_TARGET_SME
|
|
675
685
|
#define NK_STREAMING_ __arm_streaming
|
|
676
686
|
#define NK_STREAMING_COMPATIBLE_ __arm_streaming_compatible
|
|
677
687
|
#else
|
|
@@ -684,7 +694,7 @@
|
|
|
684
694
|
* MSVC typedefs `__m512bh`, `__m512h`, `__m256bh` as aliases for `__m512i`/`__m256i`,
|
|
685
695
|
* but rejects C-style casts between them. GCC/Clang define them as distinct types.
|
|
686
696
|
*/
|
|
687
|
-
#if
|
|
697
|
+
#if NK_TARGET_X8664_
|
|
688
698
|
#if defined(_MSC_VER)
|
|
689
699
|
#define nk_m512bh_from_m512i_(x) (x)
|
|
690
700
|
#define nk_m512h_from_m512i_(x) (x)
|
|
@@ -804,7 +814,7 @@ typedef unsigned int nk_u32_t;
|
|
|
804
814
|
/* On LP64 targets (Linux ARM64, RISC-V 64), `long` and `long long` are both 64-bit but distinct types.
|
|
805
815
|
* NEON/RVV intrinsics on Linux expect `long*`, while Apple's NEON intrinsics expect `long long*`.
|
|
806
816
|
* Windows uses LLP64 where `long` is 32-bit, so it must use `long long` for 64-bit types. */
|
|
807
|
-
#if ((
|
|
817
|
+
#if ((NK_TARGET_ARM64_ && !defined(NK_DEFINED_APPLE_)) || NK_TARGET_RISCV64_) && !defined(NK_DEFINED_WINDOWS_)
|
|
808
818
|
/** @brief Signed 64-bit integer. Range: [−2⁶³, +2⁶³−1]. */
|
|
809
819
|
typedef signed long nk_i64_t;
|
|
810
820
|
/** @brief Unsigned 64-bit integer. Range: [0, 2⁶⁴−1]. */
|
|
@@ -821,7 +831,7 @@ typedef float nk_f32_t;
|
|
|
821
831
|
/** @brief Double-precision (64-bit) IEEE 754 float. sign(1) + exponent(11) + mantissa(52), bias=1023. */
|
|
822
832
|
typedef double nk_f64_t;
|
|
823
833
|
|
|
824
|
-
#if
|
|
834
|
+
#if NK_TARGET_X8664_ || NK_TARGET_ARM64_ || NK_TARGET_RISCV64_ || NK_TARGET_POWER64_ || NK_TARGET_LOONGARCH64_
|
|
825
835
|
#define NK_IS_64BIT_ 1
|
|
826
836
|
#else
|
|
827
837
|
#define NK_IS_64BIT_ 0
|
|
@@ -1088,7 +1098,7 @@ typedef unsigned short nk_bf16_t;
|
|
|
1088
1098
|
* Some of those are defined as aliases, so we use `#define` preprocessor
|
|
1089
1099
|
* directives instead of `typedef` to avoid errors.
|
|
1090
1100
|
*/
|
|
1091
|
-
#if
|
|
1101
|
+
#if NK_TARGET_ARM64_
|
|
1092
1102
|
#if defined(_MSC_VER)
|
|
1093
1103
|
#define nk_f16_for_arm_simd_t nk_f16_t
|
|
1094
1104
|
#define nk_bf16_for_arm_simd_t nk_bf16_t
|
|
@@ -1102,7 +1112,7 @@ typedef unsigned short nk_bf16_t;
|
|
|
1102
1112
|
* RISC-V Vector (RVV) intrinsics use `_Float16` for half-precision floats.
|
|
1103
1113
|
* This is the standard C23 type, also available in GCC/Clang with RVV extensions.
|
|
1104
1114
|
*/
|
|
1105
|
-
#if
|
|
1115
|
+
#if NK_TARGET_RISCV64_
|
|
1106
1116
|
#define nk_f16_for_rvv_intrinsics_t _Float16
|
|
1107
1117
|
#endif
|
|
1108
1118
|
|
|
@@ -1237,6 +1247,8 @@ typedef union NK_MAY_ALIAS_ nk_b128_vec_t {
|
|
|
1237
1247
|
int32x4_t i32x4;
|
|
1238
1248
|
int64x2_t i64x2;
|
|
1239
1249
|
float32x4_t f32x4;
|
|
1250
|
+
#endif
|
|
1251
|
+
#if NK_TARGET_NEON && NK_TARGET_ARM64_ // double-precision NEON requires AArch64
|
|
1240
1252
|
float64x2_t f64x2;
|
|
1241
1253
|
#endif
|
|
1242
1254
|
#if NK_TARGET_NEONHALF
|
|
@@ -1294,6 +1306,8 @@ typedef union NK_MAY_ALIAS_ nk_b256_vec_t {
|
|
|
1294
1306
|
int32x4_t i32x4s[2];
|
|
1295
1307
|
int64x2_t i64x2s[2];
|
|
1296
1308
|
float32x4_t f32x4s[2];
|
|
1309
|
+
#endif
|
|
1310
|
+
#if NK_TARGET_NEON && NK_TARGET_ARM64_ // double-precision NEON requires AArch64
|
|
1297
1311
|
float64x2_t f64x2s[2];
|
|
1298
1312
|
#endif
|
|
1299
1313
|
#if NK_TARGET_POWERVSX
|
|
@@ -1588,7 +1602,7 @@ NK_INTERNAL int nk_bf16_is_nan_(nk_bf16_t x) {
|
|
|
1588
1602
|
* SMSTART SM / SMSTOP SM so the calling function's ABI is unchanged.
|
|
1589
1603
|
* Inside `__arm_locally_streaming` functions the plain `svcntXX()` intrinsics are fine.
|
|
1590
1604
|
*/
|
|
1591
|
-
#if
|
|
1605
|
+
#if NK_TARGET_ARM64_ && NK_TARGET_SME
|
|
1592
1606
|
/** @brief Streaming SVL byte-element count (SVL/8) via SMSTART SM bracket. */
|
|
1593
1607
|
NK_INTERNAL nk_size_t nk_sme_cntb_(void) {
|
|
1594
1608
|
nk_u64_t r;
|