numkong 7.4.5 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/binding.gyp +99 -5
- package/c/dispatch_e5m2.c +23 -3
- package/c/dispatch_f16.c +23 -0
- package/c/numkong.c +0 -13
- package/include/numkong/attention/sme.h +34 -31
- package/include/numkong/capabilities.h +2 -15
- package/include/numkong/cast/README.md +3 -0
- package/include/numkong/cast/haswell.h +28 -64
- package/include/numkong/cast/neon.h +15 -0
- package/include/numkong/cast/serial.h +17 -0
- package/include/numkong/cast/skylake.h +67 -52
- package/include/numkong/cast.h +1 -0
- package/include/numkong/curved/smef64.h +82 -62
- package/include/numkong/dot/README.md +1 -0
- package/include/numkong/dot/haswell.h +92 -13
- package/include/numkong/dot/rvvbf16.h +1 -1
- package/include/numkong/dot/rvvhalf.h +1 -1
- package/include/numkong/dot/serial.h +15 -0
- package/include/numkong/dot/skylake.h +61 -14
- package/include/numkong/dot/sve.h +6 -5
- package/include/numkong/dot/svebfdot.h +2 -1
- package/include/numkong/dot/svehalf.h +6 -5
- package/include/numkong/dot/svesdot.h +3 -2
- package/include/numkong/dots/README.md +2 -0
- package/include/numkong/dots/graniteamx.h +1167 -0
- package/include/numkong/dots/haswell.h +28 -28
- package/include/numkong/dots/sapphireamx.h +1 -1
- package/include/numkong/dots/serial.h +33 -11
- package/include/numkong/dots/skylake.h +28 -23
- package/include/numkong/dots/sme.h +172 -140
- package/include/numkong/dots/smebi32.h +14 -11
- package/include/numkong/dots/smef64.h +31 -26
- package/include/numkong/dots.h +41 -3
- package/include/numkong/each/serial.h +39 -0
- package/include/numkong/geospatial/haswell.h +1 -1
- package/include/numkong/geospatial/neon.h +1 -1
- package/include/numkong/geospatial/serial.h +15 -4
- package/include/numkong/geospatial/skylake.h +1 -1
- package/include/numkong/maxsim/serial.h +15 -0
- package/include/numkong/maxsim/sme.h +34 -33
- package/include/numkong/mesh/README.md +50 -44
- package/include/numkong/mesh/genoa.h +462 -0
- package/include/numkong/mesh/haswell.h +806 -933
- package/include/numkong/mesh/neon.h +871 -943
- package/include/numkong/mesh/neonbfdot.h +382 -522
- package/include/numkong/mesh/neonfhm.h +676 -0
- package/include/numkong/mesh/rvv.h +404 -319
- package/include/numkong/mesh/serial.h +225 -161
- package/include/numkong/mesh/skylake.h +1029 -1585
- package/include/numkong/mesh/v128relaxed.h +403 -377
- package/include/numkong/mesh.h +38 -0
- package/include/numkong/reduce/neon.h +29 -0
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +4 -4
- package/include/numkong/reduce/serial.h +15 -1
- package/include/numkong/reduce/sve.h +52 -0
- package/include/numkong/reduce.h +4 -0
- package/include/numkong/set/sve.h +6 -5
- package/include/numkong/sets/smebi32.h +35 -30
- package/include/numkong/sparse/serial.h +17 -2
- package/include/numkong/sparse/sve2.h +3 -2
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +98 -56
- package/include/numkong/spatial/serial.h +15 -0
- package/include/numkong/spatial/skylake.h +114 -54
- package/include/numkong/spatial/sve.h +7 -6
- package/include/numkong/spatial/svebfdot.h +7 -4
- package/include/numkong/spatial/svehalf.h +5 -4
- package/include/numkong/spatial/svesdot.h +9 -8
- package/include/numkong/spatial.h +0 -12
- package/include/numkong/spatials/graniteamx.h +301 -0
- package/include/numkong/spatials/serial.h +39 -0
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +391 -350
- package/include/numkong/spatials/smef64.h +79 -70
- package/include/numkong/spatials.h +54 -4
- package/include/numkong/tensor.hpp +107 -23
- package/include/numkong/types.h +59 -0
- package/javascript/dist/cjs/numkong.js +13 -0
- package/javascript/dist/esm/numkong.js +13 -0
- package/javascript/numkong.c +59 -14
- package/javascript/numkong.ts +13 -0
- package/package.json +7 -7
- package/probes/probe.js +2 -2
- package/wasm/numkong.wasm +0 -0
package/README.md
CHANGED
|
@@ -341,6 +341,7 @@ NumKong provides two dispatch mechanisms.
|
|
|
341
341
|
__Compile-time dispatch__ selects the fastest kernel supported by the target platform at build time — thinner binaries, no indirection overhead, but requires knowing your deployment hardware.
|
|
342
342
|
__Run-time dispatch__ compiles every supported kernel into the binary and picks the best one on the target machine via `nk_capabilities()` — one pointer indirection per call, but a single binary runs everywhere.
|
|
343
343
|
The run-time path is common in DBMS products (ClickHouse), web browsers (Chromium), and other upstream projects that ship to heterogeneous fleets.
|
|
344
|
+
Distributed artifacts (Rust crate, Python wheels, JS native modules, shared libs from the default CMake build) pin the translation-unit baseline to each architecture's ABI floor so the library runs on any CPU matching the ABI, not just the build host — see [CONTRIBUTING.md](CONTRIBUTING.md#target-baseline-policy) for the per-arch table and the `NK_MARCH_NATIVE` override used for host-tuned local builds.
|
|
344
345
|
|
|
345
346
|
All kernel names follow the pattern `nk_{operation}_{type}_{backend}`.
|
|
346
347
|
If you need to resolve the best kernel manually, use `nk_find_kernel_punned` with a `nk_kernel_kind_t`, `nk_dtype_t`, and a viable capabilities mask:
|
package/binding.gyp
CHANGED
|
@@ -39,11 +39,13 @@
|
|
|
39
39
|
"defines": [
|
|
40
40
|
"NK_NATIVE_F16=0",
|
|
41
41
|
"NK_NATIVE_BF16=0",
|
|
42
|
-
"NK_DYNAMIC_DISPATCH=1"
|
|
42
|
+
"NK_DYNAMIC_DISPATCH=1",
|
|
43
|
+
"NK_USE_OPENMP=1"
|
|
43
44
|
],
|
|
44
45
|
"cflags": [
|
|
45
46
|
"-std=c11",
|
|
46
47
|
"-O3",
|
|
48
|
+
"-fopenmp",
|
|
47
49
|
"-Wno-unknown-pragmas",
|
|
48
50
|
"-Wno-maybe-uninitialized",
|
|
49
51
|
"-Wno-cast-function-type",
|
|
@@ -52,31 +54,116 @@
|
|
|
52
54
|
"-include",
|
|
53
55
|
"<(module_root_dir)/nk_probes.h",
|
|
54
56
|
],
|
|
57
|
+
"ldflags": [
|
|
58
|
+
"-fopenmp"
|
|
59
|
+
],
|
|
55
60
|
"msvs_settings": {
|
|
56
61
|
"VCCLCompilerTool": {
|
|
57
62
|
"ForcedIncludeFiles": [
|
|
58
63
|
"<(module_root_dir)/nk_probes.h"
|
|
59
64
|
],
|
|
60
65
|
"AdditionalOptions": [
|
|
61
|
-
"/Zc:preprocessor"
|
|
66
|
+
"/Zc:preprocessor",
|
|
67
|
+
"/openmp:llvm"
|
|
62
68
|
],
|
|
63
69
|
},
|
|
64
70
|
},
|
|
65
71
|
"conditions": [
|
|
72
|
+
# Pin TU baseline to each arch's ABI floor; SIMD kernels use per-function pragmas.
|
|
73
|
+
# Keep per-arch table in sync with cmake/nk_compiler_flags.cmake, build.rs, setup.py.
|
|
74
|
+
[
|
|
75
|
+
"OS!='win' and target_arch=='arm64'",
|
|
76
|
+
{
|
|
77
|
+
"cflags": [
|
|
78
|
+
"-march=armv8-a"
|
|
79
|
+
]
|
|
80
|
+
}
|
|
81
|
+
],
|
|
82
|
+
[
|
|
83
|
+
"OS!='win' and target_arch=='x64'",
|
|
84
|
+
{
|
|
85
|
+
"cflags": [
|
|
86
|
+
"-march=x86-64"
|
|
87
|
+
]
|
|
88
|
+
}
|
|
89
|
+
],
|
|
90
|
+
[
|
|
91
|
+
"OS!='win' and target_arch=='riscv64'",
|
|
92
|
+
{
|
|
93
|
+
"cflags": [
|
|
94
|
+
"-march=rv64gc"
|
|
95
|
+
]
|
|
96
|
+
}
|
|
97
|
+
],
|
|
98
|
+
[
|
|
99
|
+
"OS!='win' and target_arch=='ppc64'",
|
|
100
|
+
{
|
|
101
|
+
"cflags": [
|
|
102
|
+
"-mcpu=power8"
|
|
103
|
+
]
|
|
104
|
+
}
|
|
105
|
+
],
|
|
106
|
+
[
|
|
107
|
+
"OS!='win' and target_arch=='loong64'",
|
|
108
|
+
{
|
|
109
|
+
"cflags": [
|
|
110
|
+
"-march=loongarch64",
|
|
111
|
+
"-mlasx"
|
|
112
|
+
]
|
|
113
|
+
}
|
|
114
|
+
],
|
|
115
|
+
# Forbid auto-vectorization so serial fallbacks don't get silently
|
|
116
|
+
# promoted to NEON/SSE2/VSX. SIMD kernels use explicit intrinsics
|
|
117
|
+
# and per-function `target` pragmas; unaffected. MSVC has no
|
|
118
|
+
# command-line vectorizer toggle.
|
|
119
|
+
[
|
|
120
|
+
"OS!='win'",
|
|
121
|
+
{
|
|
122
|
+
"cflags": [
|
|
123
|
+
"-fno-tree-vectorize",
|
|
124
|
+
"-fno-tree-slp-vectorize"
|
|
125
|
+
]
|
|
126
|
+
}
|
|
127
|
+
],
|
|
66
128
|
[
|
|
67
129
|
"OS=='mac'",
|
|
68
130
|
{
|
|
69
131
|
"xcode_settings": {
|
|
70
|
-
"MACOSX_DEPLOYMENT_TARGET": "11.0"
|
|
132
|
+
"MACOSX_DEPLOYMENT_TARGET": "11.0",
|
|
133
|
+
# Apple Clang ships no `omp.h`; the CI step
|
|
134
|
+
# `brew install libomp` makes it keg-only under
|
|
135
|
+
# `/opt/homebrew/opt/libomp` (arm64) or
|
|
136
|
+
# `/usr/local/opt/libomp` (x86_64). Clang silently
|
|
137
|
+
# ignores `-I` / `-L` dirs that don't exist, so
|
|
138
|
+
# listing both keeps the file arch-agnostic.
|
|
139
|
+
"OTHER_CFLAGS": [
|
|
140
|
+
"-Xpreprocessor",
|
|
141
|
+
"-fopenmp",
|
|
142
|
+
"-I/opt/homebrew/opt/libomp/include",
|
|
143
|
+
"-I/usr/local/opt/libomp/include"
|
|
144
|
+
],
|
|
145
|
+
"OTHER_LDFLAGS": [
|
|
146
|
+
"-lomp",
|
|
147
|
+
"-L/opt/homebrew/opt/libomp/lib",
|
|
148
|
+
"-L/usr/local/opt/libomp/lib"
|
|
149
|
+
]
|
|
71
150
|
}
|
|
72
151
|
}
|
|
73
152
|
],
|
|
153
|
+
# MSVC: no per-function target pragma; these match defaults.
|
|
74
154
|
[
|
|
75
155
|
"OS=='win' and target_arch=='arm64'",
|
|
76
156
|
{
|
|
77
157
|
"defines": [
|
|
78
158
|
"_ARM64_"
|
|
79
|
-
]
|
|
159
|
+
],
|
|
160
|
+
"msvs_settings": {
|
|
161
|
+
"VCCLCompilerTool": {
|
|
162
|
+
"AdditionalOptions": [
|
|
163
|
+
"/arch:armv8.0"
|
|
164
|
+
]
|
|
165
|
+
}
|
|
166
|
+
}
|
|
80
167
|
}
|
|
81
168
|
],
|
|
82
169
|
[
|
|
@@ -84,7 +171,14 @@
|
|
|
84
171
|
{
|
|
85
172
|
"defines": [
|
|
86
173
|
"_AMD64_"
|
|
87
|
-
]
|
|
174
|
+
],
|
|
175
|
+
"msvs_settings": {
|
|
176
|
+
"VCCLCompilerTool": {
|
|
177
|
+
"AdditionalOptions": [
|
|
178
|
+
"/arch:SSE2"
|
|
179
|
+
]
|
|
180
|
+
}
|
|
181
|
+
}
|
|
88
182
|
}
|
|
89
183
|
],
|
|
90
184
|
],
|
package/c/dispatch_e5m2.c
CHANGED
|
@@ -113,6 +113,29 @@ void nk_dispatch_e5m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
113
113
|
default: break;
|
|
114
114
|
}
|
|
115
115
|
#endif
|
|
116
|
+
#if NK_TARGET_GRANITEAMX
|
|
117
|
+
if (v & nk_cap_graniteamx_k) switch (k) {
|
|
118
|
+
case nk_kernel_dots_packed_size_k:
|
|
119
|
+
*m = (m_t)&nk_dots_packed_size_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
|
|
120
|
+
return;
|
|
121
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_graniteamx, *c = nk_cap_graniteamx_k; return;
|
|
122
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_graniteamx, *c = nk_cap_graniteamx_k; return;
|
|
123
|
+
case nk_kernel_angulars_packed_k:
|
|
124
|
+
*m = (m_t)&nk_angulars_packed_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
|
|
125
|
+
return;
|
|
126
|
+
case nk_kernel_euclideans_packed_k:
|
|
127
|
+
*m = (m_t)&nk_euclideans_packed_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
|
|
128
|
+
return;
|
|
129
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_graniteamx, *c = nk_cap_graniteamx_k; return;
|
|
130
|
+
case nk_kernel_angulars_symmetric_k:
|
|
131
|
+
*m = (m_t)&nk_angulars_symmetric_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
|
|
132
|
+
return;
|
|
133
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
134
|
+
*m = (m_t)&nk_euclideans_symmetric_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
|
|
135
|
+
return;
|
|
136
|
+
default: break;
|
|
137
|
+
}
|
|
138
|
+
#endif
|
|
116
139
|
#if NK_TARGET_SAPPHIREAMX
|
|
117
140
|
if (v & nk_cap_sapphireamx_k) switch (k) {
|
|
118
141
|
case nk_kernel_dots_packed_size_k:
|
|
@@ -162,9 +185,6 @@ void nk_dispatch_e5m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
|
|
|
162
185
|
#if NK_TARGET_GENOA
|
|
163
186
|
if (v & nk_cap_genoa_k) switch (k) {
|
|
164
187
|
case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_genoa, *c = nk_cap_genoa_k; return;
|
|
165
|
-
case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_genoa, *c = nk_cap_genoa_k; return;
|
|
166
|
-
case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_genoa, *c = nk_cap_genoa_k; return;
|
|
167
|
-
case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_genoa, *c = nk_cap_genoa_k; return;
|
|
168
188
|
case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_genoa, *c = nk_cap_genoa_k; return;
|
|
169
189
|
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_genoa, *c = nk_cap_genoa_k; return;
|
|
170
190
|
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_genoa, *c = nk_cap_genoa_k; return;
|
package/c/dispatch_f16.c
CHANGED
|
@@ -137,6 +137,29 @@ void nk_dispatch_f16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
|
|
|
137
137
|
default: break;
|
|
138
138
|
}
|
|
139
139
|
#endif
|
|
140
|
+
#if NK_TARGET_GRANITEAMX
|
|
141
|
+
if (v & nk_cap_graniteamx_k) switch (k) {
|
|
142
|
+
case nk_kernel_dots_packed_size_k:
|
|
143
|
+
*m = (m_t)&nk_dots_packed_size_f16_graniteamx, *c = nk_cap_graniteamx_k;
|
|
144
|
+
return;
|
|
145
|
+
case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f16_graniteamx, *c = nk_cap_graniteamx_k; return;
|
|
146
|
+
case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f16_graniteamx, *c = nk_cap_graniteamx_k; return;
|
|
147
|
+
case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_f16_graniteamx, *c = nk_cap_graniteamx_k; return;
|
|
148
|
+
case nk_kernel_angulars_packed_k:
|
|
149
|
+
*m = (m_t)&nk_angulars_packed_f16_graniteamx, *c = nk_cap_graniteamx_k;
|
|
150
|
+
return;
|
|
151
|
+
case nk_kernel_angulars_symmetric_k:
|
|
152
|
+
*m = (m_t)&nk_angulars_symmetric_f16_graniteamx, *c = nk_cap_graniteamx_k;
|
|
153
|
+
return;
|
|
154
|
+
case nk_kernel_euclideans_packed_k:
|
|
155
|
+
*m = (m_t)&nk_euclideans_packed_f16_graniteamx, *c = nk_cap_graniteamx_k;
|
|
156
|
+
return;
|
|
157
|
+
case nk_kernel_euclideans_symmetric_k:
|
|
158
|
+
*m = (m_t)&nk_euclideans_symmetric_f16_graniteamx, *c = nk_cap_graniteamx_k;
|
|
159
|
+
return;
|
|
160
|
+
default: break;
|
|
161
|
+
}
|
|
162
|
+
#endif
|
|
140
163
|
#if NK_TARGET_SAPPHIREAMX
|
|
141
164
|
if (v & nk_cap_sapphireamx_k) switch (k) {
|
|
142
165
|
case nk_kernel_maxsim_packed_size_k:
|
package/c/numkong.c
CHANGED
|
@@ -935,19 +935,6 @@ BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved) {
|
|
|
935
935
|
#endif
|
|
936
936
|
#endif
|
|
937
937
|
|
|
938
|
-
// SME ABI runtime stubs — provide the lazy-ZA-save helpers that compiler-rt
|
|
939
|
-
// may not ship (e.g., Apple's toolchain). Called by compiler-generated code
|
|
940
|
-
// in __arm_new("za") prologues/epilogues (used by dots streaming functions).
|
|
941
|
-
//
|
|
942
|
-
// In NumKong, TPIDR2_EL0 is always null at entry because no NK_PUBLIC function
|
|
943
|
-
// carries ZA state. So __arm_tpidr2_save is always a no-op and
|
|
944
|
-
// __arm_tpidr2_restore has nothing to restore.
|
|
945
|
-
// Weak linkage lets a real compiler-rt override these if available.
|
|
946
|
-
#if NK_TARGET_ARM64_ && NK_TARGET_SME
|
|
947
|
-
__attribute__((weak, visibility("default"))) void __arm_tpidr2_save(void) {}
|
|
948
|
-
__attribute__((weak, visibility("default"))) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
|
|
949
|
-
#endif
|
|
950
|
-
|
|
951
938
|
#ifdef __cplusplus
|
|
952
939
|
}
|
|
953
940
|
#endif
|
|
@@ -249,10 +249,9 @@ NK_PUBLIC nk_size_t nk_attention_packed_kv_size_f16_sme(nk_size_t num_kv_heads,
|
|
|
249
249
|
return nk_attention_packed_kv_size_bf16_sme(num_kv_heads, head_dim, max_seq_len);
|
|
250
250
|
}
|
|
251
251
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
nk_size_t v_stride, void *kv_packed) {
|
|
252
|
+
static void nk_attention_pack_kv_bf16_sme_ssve_( //
|
|
253
|
+
nk_bf16_t const *k, nk_bf16_t const *v, nk_size_t num_kv_heads, nk_size_t head_dim, nk_size_t seq_len,
|
|
254
|
+
nk_size_t k_stride, nk_size_t v_stride, void *kv_packed) NK_STREAMING_ {
|
|
256
255
|
|
|
257
256
|
nk_attention_sme_packed_header_t *header = (nk_attention_sme_packed_header_t *)kv_packed;
|
|
258
257
|
nk_size_t head_dim_padded = (head_dim + 31) / 32 * 32;
|
|
@@ -315,16 +314,17 @@ __arm_locally_streaming static void nk_attention_pack_kv_bf16_sme_streaming_(nk_
|
|
|
315
314
|
}
|
|
316
315
|
}
|
|
317
316
|
|
|
318
|
-
NK_PUBLIC void nk_attention_pack_kv_bf16_sme(
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
317
|
+
NK_PUBLIC void nk_attention_pack_kv_bf16_sme( //
|
|
318
|
+
nk_bf16_t const *k, nk_bf16_t const *v, nk_size_t num_kv_heads, nk_size_t head_dim, nk_size_t seq_len,
|
|
319
|
+
nk_size_t k_stride, nk_size_t v_stride, void *kv_packed) {
|
|
320
|
+
nk_sme_start_streaming_();
|
|
321
|
+
nk_attention_pack_kv_bf16_sme_ssve_(k, v, num_kv_heads, head_dim, seq_len, k_stride, v_stride, kv_packed);
|
|
322
|
+
nk_sme_stop_streaming_();
|
|
322
323
|
}
|
|
323
324
|
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
nk_size_t v_stride, void *kv_packed) {
|
|
325
|
+
static void nk_attention_pack_kv_f16_sme_ssve_( //
|
|
326
|
+
nk_f16_t const *k, nk_f16_t const *v, nk_size_t num_kv_heads, nk_size_t head_dim, nk_size_t seq_len,
|
|
327
|
+
nk_size_t k_stride, nk_size_t v_stride, void *kv_packed) NK_STREAMING_ {
|
|
328
328
|
|
|
329
329
|
nk_attention_sme_packed_header_t *header = (nk_attention_sme_packed_header_t *)kv_packed;
|
|
330
330
|
nk_size_t head_dim_padded = (head_dim + 31) / 32 * 32;
|
|
@@ -385,10 +385,12 @@ __arm_locally_streaming static void nk_attention_pack_kv_f16_sme_streaming_(nk_f
|
|
|
385
385
|
}
|
|
386
386
|
}
|
|
387
387
|
|
|
388
|
-
NK_PUBLIC void nk_attention_pack_kv_f16_sme(
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
388
|
+
NK_PUBLIC void nk_attention_pack_kv_f16_sme( //
|
|
389
|
+
nk_f16_t const *k, nk_f16_t const *v, nk_size_t num_kv_heads, nk_size_t head_dim, nk_size_t seq_len,
|
|
390
|
+
nk_size_t k_stride, nk_size_t v_stride, void *kv_packed) {
|
|
391
|
+
nk_sme_start_streaming_();
|
|
392
|
+
nk_attention_pack_kv_f16_sme_ssve_(k, v, num_kv_heads, head_dim, seq_len, k_stride, v_stride, kv_packed);
|
|
393
|
+
nk_sme_stop_streaming_();
|
|
392
394
|
}
|
|
393
395
|
|
|
394
396
|
/**
|
|
@@ -402,13 +404,13 @@ NK_PUBLIC void nk_attention_pack_kv_f16_sme(nk_f16_t const *k, nk_f16_t const *v
|
|
|
402
404
|
* - Correction skip when running max is unchanged
|
|
403
405
|
* - Decode path (valid_query_count==1) remains element-wise SVE (BFMOPA overhead too high)
|
|
404
406
|
*/
|
|
405
|
-
|
|
407
|
+
__arm_new("za") static void nk_attention_bf16_sme_streaming_(
|
|
406
408
|
nk_bf16_t const *q, // [query_len, head_dim]
|
|
407
409
|
nk_bf16_t const *k, // [kv_len, head_dim_padded] BFMOPA-interleaved
|
|
408
410
|
nk_bf16_t const *v_packed, // BFMOPA-interleaved V for this KV head
|
|
409
411
|
nk_bf16_t *output, // [query_len, head_dim]
|
|
410
412
|
nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim, nk_size_t head_dim_padded, nk_size_t dim_tile_count,
|
|
411
|
-
nk_f32_t scale) {
|
|
413
|
+
nk_f32_t scale) NK_STREAMING_ {
|
|
412
414
|
|
|
413
415
|
svbool_t const predicate_all_b32x = svptrue_b32();
|
|
414
416
|
svbool_t const predicate_all_b16x = svptrue_b16();
|
|
@@ -1184,9 +1186,9 @@ __arm_locally_streaming __arm_new("za") static void nk_attention_bf16_sme_stream
|
|
|
1184
1186
|
}
|
|
1185
1187
|
}
|
|
1186
1188
|
|
|
1187
|
-
NK_PUBLIC void nk_attention_bf16_sme(
|
|
1188
|
-
|
|
1189
|
-
|
|
1189
|
+
NK_PUBLIC void nk_attention_bf16_sme( //
|
|
1190
|
+
nk_bf16_t const *q, void const *kv_packed, nk_bf16_t *output, nk_size_t num_heads, nk_size_t num_kv_heads,
|
|
1191
|
+
nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim, nk_f32_t scale) {
|
|
1190
1192
|
|
|
1191
1193
|
nk_attention_sme_packed_header_t const *header = (nk_attention_sme_packed_header_t const *)kv_packed;
|
|
1192
1194
|
nk_size_t head_dim_padded = header->head_dim_padded;
|
|
@@ -1199,6 +1201,7 @@ NK_PUBLIC void nk_attention_bf16_sme(nk_bf16_t const *q, void const *kv_packed,
|
|
|
1199
1201
|
|
|
1200
1202
|
nk_size_t group_size = (num_kv_heads > 0) ? num_heads / num_kv_heads : 1;
|
|
1201
1203
|
|
|
1204
|
+
nk_sme_start_streaming_();
|
|
1202
1205
|
for (nk_size_t q_head = 0; q_head < num_heads; q_head++) {
|
|
1203
1206
|
nk_size_t kv_head = q_head / group_size;
|
|
1204
1207
|
|
|
@@ -1214,15 +1217,13 @@ NK_PUBLIC void nk_attention_bf16_sme(nk_bf16_t const *q, void const *kv_packed,
|
|
|
1214
1217
|
q_block_len, kv_len, head_dim, head_dim_padded, dim_tile_count, scale);
|
|
1215
1218
|
}
|
|
1216
1219
|
}
|
|
1220
|
+
nk_sme_stop_streaming_();
|
|
1217
1221
|
}
|
|
1218
1222
|
|
|
1219
|
-
|
|
1220
|
-
nk_f16_t const *q,
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
nk_f16_t *output, // [query_len, head_dim]
|
|
1224
|
-
nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim, nk_size_t head_dim_padded, nk_size_t dim_tile_count,
|
|
1225
|
-
nk_f32_t scale) {
|
|
1223
|
+
__arm_new("za") static void nk_attention_f16_sme_streaming_( //
|
|
1224
|
+
nk_f16_t const *q, nk_f16_t const *k, nk_f16_t const *v_packed, nk_f16_t *output, nk_size_t query_len,
|
|
1225
|
+
nk_size_t kv_len, nk_size_t head_dim, nk_size_t head_dim_padded, nk_size_t dim_tile_count,
|
|
1226
|
+
nk_f32_t scale) NK_STREAMING_ {
|
|
1226
1227
|
|
|
1227
1228
|
svbool_t const predicate_all_b32x = svptrue_b32();
|
|
1228
1229
|
svbool_t const predicate_all_b16x = svptrue_b16();
|
|
@@ -2008,9 +2009,9 @@ __arm_locally_streaming __arm_new("za") static void nk_attention_f16_sme_streami
|
|
|
2008
2009
|
}
|
|
2009
2010
|
}
|
|
2010
2011
|
|
|
2011
|
-
NK_PUBLIC void nk_attention_f16_sme(
|
|
2012
|
-
|
|
2013
|
-
|
|
2012
|
+
NK_PUBLIC void nk_attention_f16_sme( //
|
|
2013
|
+
nk_f16_t const *q, void const *kv_packed, nk_f16_t *output, nk_size_t num_heads, nk_size_t num_kv_heads,
|
|
2014
|
+
nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim, nk_f32_t scale) {
|
|
2014
2015
|
|
|
2015
2016
|
nk_attention_sme_packed_header_t const *header = (nk_attention_sme_packed_header_t const *)kv_packed;
|
|
2016
2017
|
nk_size_t head_dim_padded = header->head_dim_padded;
|
|
@@ -2024,6 +2025,7 @@ NK_PUBLIC void nk_attention_f16_sme(nk_f16_t const *q, void const *kv_packed, nk
|
|
|
2024
2025
|
|
|
2025
2026
|
nk_size_t group_size = (num_kv_heads > 0) ? num_heads / num_kv_heads : 1;
|
|
2026
2027
|
|
|
2028
|
+
nk_sme_start_streaming_();
|
|
2027
2029
|
for (nk_size_t q_head = 0; q_head < num_heads; q_head++) {
|
|
2028
2030
|
nk_size_t kv_head = q_head / group_size;
|
|
2029
2031
|
|
|
@@ -2039,6 +2041,7 @@ NK_PUBLIC void nk_attention_f16_sme(nk_f16_t const *q, void const *kv_packed, nk
|
|
|
2039
2041
|
q_block_len, kv_len, head_dim, head_dim_padded, dim_tile_count, scale);
|
|
2040
2042
|
}
|
|
2041
2043
|
}
|
|
2044
|
+
nk_sme_stop_streaming_();
|
|
2042
2045
|
}
|
|
2043
2046
|
|
|
2044
2047
|
NK_PUBLIC void nk_attention_causal_bf16_sme(nk_bf16_t const *q, void const *kv_packed, nk_bf16_t *output,
|
|
@@ -95,8 +95,8 @@
|
|
|
95
95
|
#include "numkong/types.h" // `nk_u64_t`, `NK_DEFINED_LINUX_`
|
|
96
96
|
|
|
97
97
|
#define NK_VERSION_MAJOR 7
|
|
98
|
-
#define NK_VERSION_MINOR 4
|
|
99
|
-
#define NK_VERSION_PATCH 5
|
|
98
|
+
#define NK_VERSION_MINOR 6
|
|
99
|
+
#define NK_VERSION_PATCH 0
|
|
100
100
|
|
|
101
101
|
/**
|
|
102
102
|
* @brief Removes compile-time dispatching, and replaces it with runtime dispatching.
|
|
@@ -500,13 +500,6 @@ NK_PUBLIC nk_capability_t nk_capabilities_x8664_(void) {
|
|
|
500
500
|
|
|
501
501
|
#if NK_TARGET_ARM64_
|
|
502
502
|
|
|
503
|
-
#if defined(__clang__)
|
|
504
|
-
#pragma clang attribute push(__attribute__((target("arch=armv8.5-a+sve"))), apply_to = function)
|
|
505
|
-
#elif defined(__GNUC__)
|
|
506
|
-
#pragma GCC push_options
|
|
507
|
-
#pragma GCC target("arch=armv8.5-a+sve")
|
|
508
|
-
#endif
|
|
509
|
-
|
|
510
503
|
#if NK_HAS_POSIX_EXTENSIONS_
|
|
511
504
|
static sigjmp_buf nk_mrs_arm64_jump_buffer_;
|
|
512
505
|
static void nk_mrs_arm64_sigill_handler_(int sig) {
|
|
@@ -716,12 +709,6 @@ NK_PUBLIC nk_capability_t nk_capabilities_arm64_(void) {
|
|
|
716
709
|
#endif
|
|
717
710
|
}
|
|
718
711
|
|
|
719
|
-
#if defined(__clang__)
|
|
720
|
-
#pragma clang attribute pop
|
|
721
|
-
#elif defined(__GNUC__)
|
|
722
|
-
#pragma GCC pop_options
|
|
723
|
-
#endif
|
|
724
|
-
|
|
725
712
|
#endif // NK_TARGET_ARM64_
|
|
726
713
|
|
|
727
714
|
#if NK_TARGET_RISCV64_
|
|
@@ -93,6 +93,9 @@ NEON backend uses `vreinterpretq_u16_u8` + `vzip` for zero-extension; Haswell us
|
|
|
93
93
|
`nk_f16_to_f32_haswell`, `nk_f32_to_f16_haswell` use the F16C extension instructions `VCVTPH2PS` / `VCVTPS2PH` — single-instruction conversion of 8 elements with correct denormal handling, NaN propagation, and RNE rounding.
|
|
94
94
|
The serial fallback (`nk_f16_to_f32_serial`) must handle denormals via explicit exponent/mantissa extraction and conditional re-normalization — ~15 integer ops per element vs 1 instruction with F16C.
|
|
95
95
|
AVX-512 (`nk_cast_skylake`) doubles throughput to 16 elements per instruction.
|
|
96
|
+
F16C also unlocks a cheaper FP8 → F32 path that bypasses i32-lane bit math: `nk_e5m2x16_to_f32x16_skylake_` and `nk_e5m2x8_to_f32x8_haswell_` widen u8 → u16 and left-shift by 8 (E5M2 shares F16's bias 15, so the result is a bit-exact F16 encoding of every input including subnormals and NaN), then feed `VCVTPH2PS` — three ops total.
|
|
97
|
+
E4M3 can't use a plain shift (bias 7 vs 15), but the Giesen-style fake-F16 `((byte & 0x7F) << 7) | ((byte & 0x80) << 8)` gives an F16 whose value differs from the E4M3 magnitude by exactly 2⁸; `nk_e4m3x16_to_f32x16_skylake_` and `nk_e4m3x8_to_f32x8_haswell_` widen through `VCVTPH2PS`, multiply by 256 in F32 to correct, and blend in F32 NaN for the lone `|byte|==0x7F` encoding.
|
|
98
|
+
For E4M3 GEMM specifically, `nk_e4m3x16_to_f16x16_skylake_` produces TRUE F16 (bias-corrected, with a small subnormal LUT and NaN blend) so the packed buffer stores 2 bytes/element instead of 4 — the inner loop reads F16 and widens to F32 once per B-load, trading ~10% compute for 50% pack memory.
|
|
96
99
|
|
|
97
100
|
## Performance
|
|
98
101
|
|
|
@@ -172,72 +172,36 @@ NK_INTERNAL __m128i nk_f32x8_to_u8x8_haswell_(__m256 f32x8) {
|
|
|
172
172
|
return _mm_packus_epi16(u16x8, _mm_setzero_si128());
|
|
173
173
|
}
|
|
174
174
|
|
|
175
|
-
/** @brief Convert 8x e4m3 → 8x f32 via
|
|
176
|
-
* E4M3
|
|
177
|
-
*
|
|
178
|
-
*
|
|
175
|
+
/** @brief Convert 8x e4m3 → 8x f32 via Giesen-style fake-F16 cast (AVX2 + F16C).
|
|
176
|
+
* E4M3 `byte = S EEEE MMM` (bias 7). Shifting the magnitude into F16 positions
|
|
177
|
+
* `((byte & 0x7F) << 7) | ((byte & 0x80) << 8)` yields a fake F16 whose F16 value
|
|
178
|
+
* differs from the true E4M3 magnitude by exactly 2⁸ (bias delta 15 − 7). The
|
|
179
|
+
* fake F16 is widened via `vcvtph2ps` and corrected by ×256 in F32. Subnormal
|
|
180
|
+
* handling falls out for free via F16 subnormal semantics. NaN (|byte|==0x7F)
|
|
181
|
+
* is blended explicitly with F16 quiet-NaN bits. */
|
|
179
182
|
NK_INTERNAL __m256 nk_e4m3x8_to_f32x8_haswell_(__m128i e4m3_i8x8) {
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
_mm256_castps_si256(_mm256_permutevar8x32_ps(subnorm_lut_f32x8, mant_i32x8)), f32_sign_i32x8);
|
|
199
|
-
|
|
200
|
-
// Bitwise select: if exp==0, use subnormal; otherwise use normal
|
|
201
|
-
__m256i exp_zero_mask = _mm256_cmpeq_epi32(exp_i32x8, _mm256_setzero_si256());
|
|
202
|
-
__m256i result_i32x8 = _mm256_or_si256( //
|
|
203
|
-
_mm256_and_si256(exp_zero_mask, subnorm_bits_i32x8), //
|
|
204
|
-
_mm256_andnot_si256(exp_zero_mask, normal_bits_i32x8));
|
|
205
|
-
|
|
206
|
-
// NaN: E4M3FN has NaN only at magnitude 0x7F (exp=15, mant=7)
|
|
207
|
-
__m256i lower7_i32x8 = _mm256_and_si256(e4m3_i32x8, _mm256_set1_epi32(0x7F));
|
|
208
|
-
__m256i is_nan_mask = _mm256_cmpeq_epi32(lower7_i32x8, _mm256_set1_epi32(0x7F));
|
|
209
|
-
__m256i nan_i32x8 = _mm256_or_si256(f32_sign_i32x8, _mm256_set1_epi32(0x7FC00000));
|
|
210
|
-
result_i32x8 = _mm256_or_si256( //
|
|
211
|
-
_mm256_and_si256(is_nan_mask, nan_i32x8), //
|
|
212
|
-
_mm256_andnot_si256(is_nan_mask, result_i32x8));
|
|
213
|
-
return _mm256_castsi256_ps(result_i32x8);
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
/** @brief Convert 8x e5m2 → 8x f32 via bit manipulation (AVX2).
|
|
217
|
-
* E5M2 format: S EEEEE MM (bias=15). F32: sign<<31, (exp+112)<<23, mant<<21.
|
|
218
|
-
* Subnormals (exp=0): value = mantissa × 2⁽¹⁻¹⁵⁾ × 2⁻² = mantissa ÷ 65536. */
|
|
183
|
+
__m128i const magnitude_mask_u16x8 = _mm_set1_epi16(0x7F);
|
|
184
|
+
__m128i const sign_mask_u16x8 = _mm_set1_epi16((short)0x80);
|
|
185
|
+
__m128i const f16_nan_u16x8 = _mm_set1_epi16(0x7E00);
|
|
186
|
+
__m128i word_u16x8 = _mm_cvtepu8_epi16(e4m3_i8x8);
|
|
187
|
+
__m128i magnitude_u16x8 = _mm_and_si128(word_u16x8, magnitude_mask_u16x8);
|
|
188
|
+
__m128i is_nan_u16x8 = _mm_cmpeq_epi16(magnitude_u16x8, magnitude_mask_u16x8);
|
|
189
|
+
__m128i shifted_magnitude_u16x8 = _mm_slli_epi16(magnitude_u16x8, 7);
|
|
190
|
+
__m128i shifted_sign_u16x8 = _mm_slli_epi16(_mm_and_si128(word_u16x8, sign_mask_u16x8), 8);
|
|
191
|
+
__m128i f16_bits_u16x8 = _mm_or_si128(shifted_magnitude_u16x8, shifted_sign_u16x8);
|
|
192
|
+
f16_bits_u16x8 = _mm_blendv_epi8(f16_bits_u16x8, f16_nan_u16x8, is_nan_u16x8);
|
|
193
|
+
__m256 fake_f32x8 = _mm256_cvtph_ps(f16_bits_u16x8);
|
|
194
|
+
return _mm256_mul_ps(fake_f32x8, _mm256_set1_ps(256.0f));
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
/** @brief Convert 8x e5m2 → 8x f32 via free-shift widen (AVX2 + F16C).
|
|
198
|
+
* E5M2 shares F16's exponent bias (15): `(byte << 8)` is the matching F16 bit
|
|
199
|
+
* pattern for every E5M2 value (normals, subnormals, zero, ±Inf, NaN — all
|
|
200
|
+
* bit-exact). Widen u8 → u16, shift, then VCVTPH2PS to F32. Three ops total. */
|
|
219
201
|
NK_INTERNAL __m256 nk_e5m2x8_to_f32x8_haswell_(__m128i e5m2_i8x8) {
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
__m256i exp_i32x8 = _mm256_and_si256(_mm256_srli_epi32(e5m2_i32x8, 2), _mm256_set1_epi32(0x1F));
|
|
224
|
-
__m256i mant_i32x8 = _mm256_and_si256(e5m2_i32x8, _mm256_set1_epi32(0x03));
|
|
225
|
-
|
|
226
|
-
// Build F32 sign bit
|
|
227
|
-
__m256i f32_sign_i32x8 = _mm256_slli_epi32(_mm256_srli_epi32(e5m2_i32x8, 7), 31);
|
|
228
|
-
|
|
229
|
-
// Normal path: sign | ((exp+112)<<23) | (mant<<21)
|
|
230
|
-
__m256i f32_exp_i32x8 = _mm256_slli_epi32(_mm256_add_epi32(exp_i32x8, _mm256_set1_epi32(112)), 23);
|
|
231
|
-
__m256i f32_mant_i32x8 = _mm256_slli_epi32(mant_i32x8, 21);
|
|
232
|
-
__m256i normal_bits_i32x8 = _mm256_or_si256(f32_sign_i32x8, _mm256_or_si256(f32_exp_i32x8, f32_mant_i32x8));
|
|
233
|
-
|
|
234
|
-
// Subnormal path: value = mantissa / 65536.0f, then apply sign
|
|
235
|
-
__m256 subnorm_abs_f32x8 = _mm256_mul_ps(_mm256_cvtepi32_ps(mant_i32x8), _mm256_set1_ps(1.0f / 65536.0f));
|
|
236
|
-
__m256 subnorm_f32x8 = _mm256_or_ps(subnorm_abs_f32x8, _mm256_castsi256_ps(f32_sign_i32x8));
|
|
237
|
-
|
|
238
|
-
// Blend: if exp==0, use subnormal result; otherwise use normal bits
|
|
239
|
-
__m256i exp_zero_mask = _mm256_cmpeq_epi32(exp_i32x8, _mm256_setzero_si256());
|
|
240
|
-
return _mm256_blendv_ps(_mm256_castsi256_ps(normal_bits_i32x8), subnorm_f32x8, _mm256_castsi256_ps(exp_zero_mask));
|
|
202
|
+
__m128i e5m2_u16x8 = _mm_cvtepu8_epi16(e5m2_i8x8);
|
|
203
|
+
__m128i f16_bits_u16x8 = _mm_slli_epi16(e5m2_u16x8, 8);
|
|
204
|
+
return _mm256_cvtph_ps(f16_bits_u16x8);
|
|
241
205
|
}
|
|
242
206
|
|
|
243
207
|
/** @brief Convert 8x f32 → 8x e4m3 via bit manipulation (AVX2).
|
|
@@ -104,6 +104,21 @@ NK_INTERNAL void nk_store_b256_neon_(nk_b256_vec_t const *src, void *dst) {
|
|
|
104
104
|
/** @brief Type-agnostic 64-bit full load (NEON). */
|
|
105
105
|
NK_INTERNAL void nk_load_b64_neon_(void const *src, nk_b64_vec_t *dst) { dst->u8x8 = vld1_u8((nk_u8_t const *)src); }
|
|
106
106
|
|
|
107
|
+
/**
|
|
108
|
+
* @brief 8-lane `uint16x8_t` splat that hides the source from the optimizer.
|
|
109
|
+
*
|
|
110
|
+
* GCC 13 lowers `vdupq_n_u16(X)` to `fmov v.8h, #imm` (a FEAT_FP16 encoding) whenever X matches a
|
|
111
|
+
* representable FP16 immediate, including bf16 bit patterns like 1.0 (`0x3F80`). That fails to
|
|
112
|
+
* assemble under a `+bf16`-only pragma. The empty `__asm__` constraint forces `mov w; dup v.8h, w`
|
|
113
|
+
* instead, valid on plain `armv8-a+simd`. No-op on Clang; skipped on MSVC (neither is affected).
|
|
114
|
+
*/
|
|
115
|
+
NK_INTERNAL uint16x8_t nk_u16x8_splat_(nk_u16_t bits) {
|
|
116
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
117
|
+
__asm__("" : "+r"(bits));
|
|
118
|
+
#endif
|
|
119
|
+
return vdupq_n_u16(bits);
|
|
120
|
+
}
|
|
121
|
+
|
|
107
122
|
#pragma endregion Type Punned Loads and Stores
|
|
108
123
|
|
|
109
124
|
#pragma region Vectorized Conversions
|
|
@@ -13,6 +13,17 @@
|
|
|
13
13
|
extern "C" {
|
|
14
14
|
#endif
|
|
15
15
|
|
|
16
|
+
/* Keep the serial conversions below actually scalar, regardless of build type.
|
|
17
|
+
* Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
|
|
18
|
+
* callers in dispatch_*.c, which wastes binary and breaks the nk_*_serial-as-scalar-oracle
|
|
19
|
+
* contract. See dots/serial.h. */
|
|
20
|
+
#if defined(__clang__)
|
|
21
|
+
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
|
|
22
|
+
#elif defined(__GNUC__)
|
|
23
|
+
#pragma GCC push_options
|
|
24
|
+
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
|
|
25
|
+
#endif
|
|
26
|
+
|
|
16
27
|
#pragma region Type Punned Loads and Stores
|
|
17
28
|
|
|
18
29
|
/** @brief Type-agnostic 32-bit full load (scalar). */
|
|
@@ -2329,6 +2340,12 @@ NK_PUBLIC void nk_e3m2_to_bf16(nk_e3m2_t const *src, nk_bf16_t *dest) {
|
|
|
2329
2340
|
|
|
2330
2341
|
#pragma endregion Public API
|
|
2331
2342
|
|
|
2343
|
+
#if defined(__clang__)
|
|
2344
|
+
#pragma clang attribute pop
|
|
2345
|
+
#elif defined(__GNUC__)
|
|
2346
|
+
#pragma GCC pop_options
|
|
2347
|
+
#endif
|
|
2348
|
+
|
|
2332
2349
|
#if defined(__cplusplus)
|
|
2333
2350
|
} // extern "C"
|
|
2334
2351
|
#endif
|