numkong 7.4.4 → 7.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +1 -0
  2. package/binding.gyp +81 -5
  3. package/c/dispatch_f16.c +23 -0
  4. package/c/numkong.c +0 -13
  5. package/include/numkong/attention/sme.h +34 -31
  6. package/include/numkong/capabilities.h +2 -15
  7. package/include/numkong/cast/neon.h +15 -0
  8. package/include/numkong/curved/smef64.h +82 -62
  9. package/include/numkong/dot/rvvbf16.h +1 -1
  10. package/include/numkong/dot/rvvhalf.h +1 -1
  11. package/include/numkong/dot/sve.h +6 -5
  12. package/include/numkong/dot/svebfdot.h +2 -1
  13. package/include/numkong/dot/svehalf.h +6 -5
  14. package/include/numkong/dot/svesdot.h +3 -2
  15. package/include/numkong/dots/graniteamx.h +733 -0
  16. package/include/numkong/dots/serial.h +11 -4
  17. package/include/numkong/dots/sme.h +172 -140
  18. package/include/numkong/dots/smebi32.h +14 -11
  19. package/include/numkong/dots/smef64.h +31 -26
  20. package/include/numkong/dots.h +29 -3
  21. package/include/numkong/each/serial.h +22 -0
  22. package/include/numkong/geospatial/haswell.h +1 -1
  23. package/include/numkong/geospatial/neon.h +1 -1
  24. package/include/numkong/geospatial/serial.h +1 -1
  25. package/include/numkong/geospatial/skylake.h +1 -1
  26. package/include/numkong/maxsim/sme.h +94 -55
  27. package/include/numkong/mesh/README.md +13 -27
  28. package/include/numkong/mesh/haswell.h +25 -122
  29. package/include/numkong/mesh/neon.h +21 -110
  30. package/include/numkong/mesh/neonbfdot.h +4 -43
  31. package/include/numkong/mesh/rvv.h +7 -82
  32. package/include/numkong/mesh/serial.h +48 -53
  33. package/include/numkong/mesh/skylake.h +7 -123
  34. package/include/numkong/mesh/v128relaxed.h +9 -93
  35. package/include/numkong/mesh.h +2 -2
  36. package/include/numkong/mesh.hpp +35 -96
  37. package/include/numkong/reduce/neon.h +29 -0
  38. package/include/numkong/reduce/neonbfdot.h +2 -2
  39. package/include/numkong/reduce/neonfhm.h +4 -4
  40. package/include/numkong/reduce/sve.h +52 -0
  41. package/include/numkong/reduce.h +4 -0
  42. package/include/numkong/set/sve.h +6 -5
  43. package/include/numkong/sets/smebi32.h +35 -30
  44. package/include/numkong/sparse/sve2.h +3 -2
  45. package/include/numkong/spatial/sve.h +7 -6
  46. package/include/numkong/spatial/svebfdot.h +7 -4
  47. package/include/numkong/spatial/svehalf.h +5 -4
  48. package/include/numkong/spatial/svesdot.h +9 -8
  49. package/include/numkong/spatials/graniteamx.h +173 -0
  50. package/include/numkong/spatials/serial.h +22 -0
  51. package/include/numkong/spatials/sme.h +391 -350
  52. package/include/numkong/spatials/smef64.h +79 -70
  53. package/include/numkong/spatials.h +37 -4
  54. package/include/numkong/types.h +59 -0
  55. package/javascript/dist/cjs/numkong.js +13 -0
  56. package/javascript/dist/esm/numkong.js +13 -0
  57. package/javascript/numkong.c +56 -12
  58. package/javascript/numkong.ts +13 -0
  59. package/package.json +7 -7
  60. package/probes/probe.js +2 -2
  61. package/wasm/numkong.wasm +0 -0
package/README.md CHANGED
@@ -341,6 +341,7 @@ NumKong provides two dispatch mechanisms.
341
341
  __Compile-time dispatch__ selects the fastest kernel supported by the target platform at build time — thinner binaries, no indirection overhead, but requires knowing your deployment hardware.
342
342
  __Run-time dispatch__ compiles every supported kernel into the binary and picks the best one on the target machine via `nk_capabilities()` — one pointer indirection per call, but a single binary runs everywhere.
343
343
  The run-time path is common in DBMS products (ClickHouse), web browsers (Chromium), and other upstream projects that ship to heterogeneous fleets.
344
+ Distributed artifacts (Rust crate, Python wheels, JS native modules, shared libs from the default CMake build) pin the translation-unit baseline to each architecture's ABI floor so the library runs on any CPU matching the ABI, not just the build host — see [CONTRIBUTING.md](CONTRIBUTING.md#target-baseline-policy) for the per-arch table and the `NK_MARCH_NATIVE` override used for host-tuned local builds.
344
345
 
345
346
  All kernel names follow the pattern `nk_{operation}_{type}_{backend}`.
346
347
  If you need to resolve the best kernel manually, use `nk_find_kernel_punned` with a `nk_kernel_kind_t`, `nk_dtype_t`, and a viable capabilities mask:
package/binding.gyp CHANGED
@@ -39,11 +39,13 @@
39
39
  "defines": [
40
40
  "NK_NATIVE_F16=0",
41
41
  "NK_NATIVE_BF16=0",
42
- "NK_DYNAMIC_DISPATCH=1"
42
+ "NK_DYNAMIC_DISPATCH=1",
43
+ "NK_USE_OPENMP=1"
43
44
  ],
44
45
  "cflags": [
45
46
  "-std=c11",
46
47
  "-O3",
48
+ "-fopenmp",
47
49
  "-Wno-unknown-pragmas",
48
50
  "-Wno-maybe-uninitialized",
49
51
  "-Wno-cast-function-type",
@@ -52,31 +54,98 @@
52
54
  "-include",
53
55
  "<(module_root_dir)/nk_probes.h",
54
56
  ],
57
+ "ldflags": [
58
+ "-fopenmp"
59
+ ],
55
60
  "msvs_settings": {
56
61
  "VCCLCompilerTool": {
57
62
  "ForcedIncludeFiles": [
58
63
  "<(module_root_dir)/nk_probes.h"
59
64
  ],
60
65
  "AdditionalOptions": [
61
- "/Zc:preprocessor"
66
+ "/Zc:preprocessor",
67
+ "/openmp:llvm"
62
68
  ],
63
69
  },
64
70
  },
65
71
  "conditions": [
72
+ # Pin TU baseline to each arch's ABI floor; SIMD kernels use per-function pragmas.
73
+ [
74
+ "OS!='win' and target_arch=='arm64'",
75
+ {
76
+ "cflags": [
77
+ "-march=armv8-a"
78
+ ]
79
+ }
80
+ ],
81
+ [
82
+ "OS!='win' and target_arch=='x64'",
83
+ {
84
+ "cflags": [
85
+ "-march=x86-64"
86
+ ]
87
+ }
88
+ ],
89
+ [
90
+ "OS!='win' and target_arch=='riscv64'",
91
+ {
92
+ "cflags": [
93
+ "-march=rv64gc"
94
+ ]
95
+ }
96
+ ],
97
+ # Forbid auto-vectorization so serial fallbacks don't get silently
98
+ # promoted to NEON/SSE2/VSX. SIMD kernels use explicit intrinsics
99
+ # and per-function `target` pragmas; unaffected. MSVC has no
100
+ # command-line vectorizer toggle.
101
+ [
102
+ "OS!='win'",
103
+ {
104
+ "cflags": [
105
+ "-fno-tree-vectorize",
106
+ "-fno-tree-slp-vectorize"
107
+ ]
108
+ }
109
+ ],
66
110
  [
67
111
  "OS=='mac'",
68
112
  {
69
113
  "xcode_settings": {
70
- "MACOSX_DEPLOYMENT_TARGET": "11.0"
114
+ "MACOSX_DEPLOYMENT_TARGET": "11.0",
115
+ # Apple Clang ships no `omp.h`; the CI step
116
+ # `brew install libomp` makes it keg-only under
117
+ # `/opt/homebrew/opt/libomp` (arm64) or
118
+ # `/usr/local/opt/libomp` (x86_64). Clang silently
119
+ # ignores `-I` / `-L` dirs that don't exist, so
120
+ # listing both keeps the file arch-agnostic.
121
+ "OTHER_CFLAGS": [
122
+ "-Xpreprocessor",
123
+ "-fopenmp",
124
+ "-I/opt/homebrew/opt/libomp/include",
125
+ "-I/usr/local/opt/libomp/include"
126
+ ],
127
+ "OTHER_LDFLAGS": [
128
+ "-lomp",
129
+ "-L/opt/homebrew/opt/libomp/lib",
130
+ "-L/usr/local/opt/libomp/lib"
131
+ ]
71
132
  }
72
133
  }
73
134
  ],
135
+ # MSVC: no per-function target pragma; these match defaults.
74
136
  [
75
137
  "OS=='win' and target_arch=='arm64'",
76
138
  {
77
139
  "defines": [
78
140
  "_ARM64_"
79
- ]
141
+ ],
142
+ "msvs_settings": {
143
+ "VCCLCompilerTool": {
144
+ "AdditionalOptions": [
145
+ "/arch:armv8.0"
146
+ ]
147
+ }
148
+ }
80
149
  }
81
150
  ],
82
151
  [
@@ -84,7 +153,14 @@
84
153
  {
85
154
  "defines": [
86
155
  "_AMD64_"
87
- ]
156
+ ],
157
+ "msvs_settings": {
158
+ "VCCLCompilerTool": {
159
+ "AdditionalOptions": [
160
+ "/arch:SSE2"
161
+ ]
162
+ }
163
+ }
88
164
  }
89
165
  ],
90
166
  ],
package/c/dispatch_f16.c CHANGED
@@ -137,6 +137,29 @@ void nk_dispatch_f16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punn
137
137
  default: break;
138
138
  }
139
139
  #endif
140
+ #if NK_TARGET_GRANITEAMX
141
+ if (v & nk_cap_graniteamx_k) switch (k) {
142
+ case nk_kernel_dots_packed_size_k:
143
+ *m = (m_t)&nk_dots_packed_size_f16_graniteamx, *c = nk_cap_graniteamx_k;
144
+ return;
145
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_f16_graniteamx, *c = nk_cap_graniteamx_k; return;
146
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_f16_graniteamx, *c = nk_cap_graniteamx_k; return;
147
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_f16_graniteamx, *c = nk_cap_graniteamx_k; return;
148
+ case nk_kernel_angulars_packed_k:
149
+ *m = (m_t)&nk_angulars_packed_f16_graniteamx, *c = nk_cap_graniteamx_k;
150
+ return;
151
+ case nk_kernel_angulars_symmetric_k:
152
+ *m = (m_t)&nk_angulars_symmetric_f16_graniteamx, *c = nk_cap_graniteamx_k;
153
+ return;
154
+ case nk_kernel_euclideans_packed_k:
155
+ *m = (m_t)&nk_euclideans_packed_f16_graniteamx, *c = nk_cap_graniteamx_k;
156
+ return;
157
+ case nk_kernel_euclideans_symmetric_k:
158
+ *m = (m_t)&nk_euclideans_symmetric_f16_graniteamx, *c = nk_cap_graniteamx_k;
159
+ return;
160
+ default: break;
161
+ }
162
+ #endif
140
163
  #if NK_TARGET_SAPPHIREAMX
141
164
  if (v & nk_cap_sapphireamx_k) switch (k) {
142
165
  case nk_kernel_maxsim_packed_size_k:
package/c/numkong.c CHANGED
@@ -935,19 +935,6 @@ BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved) {
935
935
  #endif
936
936
  #endif
937
937
 
938
- // SME ABI runtime stubs — provide the lazy-ZA-save helpers that compiler-rt
939
- // may not ship (e.g., Apple's toolchain). Called by compiler-generated code
940
- // in __arm_new("za") prologues/epilogues (used by dots streaming functions).
941
- //
942
- // In NumKong, TPIDR2_EL0 is always null at entry because no NK_PUBLIC function
943
- // carries ZA state. So __arm_tpidr2_save is always a no-op and
944
- // __arm_tpidr2_restore has nothing to restore.
945
- // Weak linkage lets a real compiler-rt override these if available.
946
- #if NK_TARGET_ARM64_ && NK_TARGET_SME
947
- __attribute__((weak, visibility("default"))) void __arm_tpidr2_save(void) {}
948
- __attribute__((weak, visibility("default"))) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
949
- #endif
950
-
951
938
  #ifdef __cplusplus
952
939
  }
953
940
  #endif
@@ -249,10 +249,9 @@ NK_PUBLIC nk_size_t nk_attention_packed_kv_size_f16_sme(nk_size_t num_kv_heads,
249
249
  return nk_attention_packed_kv_size_bf16_sme(num_kv_heads, head_dim, max_seq_len);
250
250
  }
251
251
 
252
- __arm_locally_streaming static void nk_attention_pack_kv_bf16_sme_streaming_(nk_bf16_t const *k, nk_bf16_t const *v,
253
- nk_size_t num_kv_heads, nk_size_t head_dim,
254
- nk_size_t seq_len, nk_size_t k_stride,
255
- nk_size_t v_stride, void *kv_packed) {
252
+ static void nk_attention_pack_kv_bf16_sme_ssve_( //
253
+ nk_bf16_t const *k, nk_bf16_t const *v, nk_size_t num_kv_heads, nk_size_t head_dim, nk_size_t seq_len,
254
+ nk_size_t k_stride, nk_size_t v_stride, void *kv_packed) NK_STREAMING_ {
256
255
 
257
256
  nk_attention_sme_packed_header_t *header = (nk_attention_sme_packed_header_t *)kv_packed;
258
257
  nk_size_t head_dim_padded = (head_dim + 31) / 32 * 32;
@@ -315,16 +314,17 @@ __arm_locally_streaming static void nk_attention_pack_kv_bf16_sme_streaming_(nk_
315
314
  }
316
315
  }
317
316
 
318
- NK_PUBLIC void nk_attention_pack_kv_bf16_sme(nk_bf16_t const *k, nk_bf16_t const *v, nk_size_t num_kv_heads,
319
- nk_size_t head_dim, nk_size_t seq_len, nk_size_t k_stride,
320
- nk_size_t v_stride, void *kv_packed) {
321
- nk_attention_pack_kv_bf16_sme_streaming_(k, v, num_kv_heads, head_dim, seq_len, k_stride, v_stride, kv_packed);
317
+ NK_PUBLIC void nk_attention_pack_kv_bf16_sme( //
318
+ nk_bf16_t const *k, nk_bf16_t const *v, nk_size_t num_kv_heads, nk_size_t head_dim, nk_size_t seq_len,
319
+ nk_size_t k_stride, nk_size_t v_stride, void *kv_packed) {
320
+ nk_sme_start_streaming_();
321
+ nk_attention_pack_kv_bf16_sme_ssve_(k, v, num_kv_heads, head_dim, seq_len, k_stride, v_stride, kv_packed);
322
+ nk_sme_stop_streaming_();
322
323
  }
323
324
 
324
- __arm_locally_streaming static void nk_attention_pack_kv_f16_sme_streaming_(nk_f16_t const *k, nk_f16_t const *v,
325
- nk_size_t num_kv_heads, nk_size_t head_dim,
326
- nk_size_t seq_len, nk_size_t k_stride,
327
- nk_size_t v_stride, void *kv_packed) {
325
+ static void nk_attention_pack_kv_f16_sme_ssve_( //
326
+ nk_f16_t const *k, nk_f16_t const *v, nk_size_t num_kv_heads, nk_size_t head_dim, nk_size_t seq_len,
327
+ nk_size_t k_stride, nk_size_t v_stride, void *kv_packed) NK_STREAMING_ {
328
328
 
329
329
  nk_attention_sme_packed_header_t *header = (nk_attention_sme_packed_header_t *)kv_packed;
330
330
  nk_size_t head_dim_padded = (head_dim + 31) / 32 * 32;
@@ -385,10 +385,12 @@ __arm_locally_streaming static void nk_attention_pack_kv_f16_sme_streaming_(nk_f
385
385
  }
386
386
  }
387
387
 
388
- NK_PUBLIC void nk_attention_pack_kv_f16_sme(nk_f16_t const *k, nk_f16_t const *v, nk_size_t num_kv_heads,
389
- nk_size_t head_dim, nk_size_t seq_len, nk_size_t k_stride,
390
- nk_size_t v_stride, void *kv_packed) {
391
- nk_attention_pack_kv_f16_sme_streaming_(k, v, num_kv_heads, head_dim, seq_len, k_stride, v_stride, kv_packed);
388
+ NK_PUBLIC void nk_attention_pack_kv_f16_sme( //
389
+ nk_f16_t const *k, nk_f16_t const *v, nk_size_t num_kv_heads, nk_size_t head_dim, nk_size_t seq_len,
390
+ nk_size_t k_stride, nk_size_t v_stride, void *kv_packed) {
391
+ nk_sme_start_streaming_();
392
+ nk_attention_pack_kv_f16_sme_ssve_(k, v, num_kv_heads, head_dim, seq_len, k_stride, v_stride, kv_packed);
393
+ nk_sme_stop_streaming_();
392
394
  }
393
395
 
394
396
  /**
@@ -402,13 +404,13 @@ NK_PUBLIC void nk_attention_pack_kv_f16_sme(nk_f16_t const *k, nk_f16_t const *v
402
404
  * - Correction skip when running max is unchanged
403
405
  * - Decode path (valid_query_count==1) remains element-wise SVE (BFMOPA overhead too high)
404
406
  */
405
- __arm_locally_streaming __arm_new("za") static void nk_attention_bf16_sme_streaming_(
407
+ __arm_new("za") static void nk_attention_bf16_sme_streaming_(
406
408
  nk_bf16_t const *q, // [query_len, head_dim]
407
409
  nk_bf16_t const *k, // [kv_len, head_dim_padded] BFMOPA-interleaved
408
410
  nk_bf16_t const *v_packed, // BFMOPA-interleaved V for this KV head
409
411
  nk_bf16_t *output, // [query_len, head_dim]
410
412
  nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim, nk_size_t head_dim_padded, nk_size_t dim_tile_count,
411
- nk_f32_t scale) {
413
+ nk_f32_t scale) NK_STREAMING_ {
412
414
 
413
415
  svbool_t const predicate_all_b32x = svptrue_b32();
414
416
  svbool_t const predicate_all_b16x = svptrue_b16();
@@ -1184,9 +1186,9 @@ __arm_locally_streaming __arm_new("za") static void nk_attention_bf16_sme_stream
1184
1186
  }
1185
1187
  }
1186
1188
 
1187
- NK_PUBLIC void nk_attention_bf16_sme(nk_bf16_t const *q, void const *kv_packed, nk_bf16_t *output, nk_size_t num_heads,
1188
- nk_size_t num_kv_heads, nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim,
1189
- nk_f32_t scale) {
1189
+ NK_PUBLIC void nk_attention_bf16_sme( //
1190
+ nk_bf16_t const *q, void const *kv_packed, nk_bf16_t *output, nk_size_t num_heads, nk_size_t num_kv_heads,
1191
+ nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim, nk_f32_t scale) {
1190
1192
 
1191
1193
  nk_attention_sme_packed_header_t const *header = (nk_attention_sme_packed_header_t const *)kv_packed;
1192
1194
  nk_size_t head_dim_padded = header->head_dim_padded;
@@ -1199,6 +1201,7 @@ NK_PUBLIC void nk_attention_bf16_sme(nk_bf16_t const *q, void const *kv_packed,
1199
1201
 
1200
1202
  nk_size_t group_size = (num_kv_heads > 0) ? num_heads / num_kv_heads : 1;
1201
1203
 
1204
+ nk_sme_start_streaming_();
1202
1205
  for (nk_size_t q_head = 0; q_head < num_heads; q_head++) {
1203
1206
  nk_size_t kv_head = q_head / group_size;
1204
1207
 
@@ -1214,15 +1217,13 @@ NK_PUBLIC void nk_attention_bf16_sme(nk_bf16_t const *q, void const *kv_packed,
1214
1217
  q_block_len, kv_len, head_dim, head_dim_padded, dim_tile_count, scale);
1215
1218
  }
1216
1219
  }
1220
+ nk_sme_stop_streaming_();
1217
1221
  }
1218
1222
 
1219
- __arm_locally_streaming __arm_new("za") static void nk_attention_f16_sme_streaming_(
1220
- nk_f16_t const *q, // [query_len, head_dim]
1221
- nk_f16_t const *k, // [kv_len, head_dim_padded] FMOPA-interleaved
1222
- nk_f16_t const *v_packed, // FMOPA-interleaved V for this KV head
1223
- nk_f16_t *output, // [query_len, head_dim]
1224
- nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim, nk_size_t head_dim_padded, nk_size_t dim_tile_count,
1225
- nk_f32_t scale) {
1223
+ __arm_new("za") static void nk_attention_f16_sme_streaming_( //
1224
+ nk_f16_t const *q, nk_f16_t const *k, nk_f16_t const *v_packed, nk_f16_t *output, nk_size_t query_len,
1225
+ nk_size_t kv_len, nk_size_t head_dim, nk_size_t head_dim_padded, nk_size_t dim_tile_count,
1226
+ nk_f32_t scale) NK_STREAMING_ {
1226
1227
 
1227
1228
  svbool_t const predicate_all_b32x = svptrue_b32();
1228
1229
  svbool_t const predicate_all_b16x = svptrue_b16();
@@ -2008,9 +2009,9 @@ __arm_locally_streaming __arm_new("za") static void nk_attention_f16_sme_streami
2008
2009
  }
2009
2010
  }
2010
2011
 
2011
- NK_PUBLIC void nk_attention_f16_sme(nk_f16_t const *q, void const *kv_packed, nk_f16_t *output, nk_size_t num_heads,
2012
- nk_size_t num_kv_heads, nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim,
2013
- nk_f32_t scale) {
2012
+ NK_PUBLIC void nk_attention_f16_sme( //
2013
+ nk_f16_t const *q, void const *kv_packed, nk_f16_t *output, nk_size_t num_heads, nk_size_t num_kv_heads,
2014
+ nk_size_t query_len, nk_size_t kv_len, nk_size_t head_dim, nk_f32_t scale) {
2014
2015
 
2015
2016
  nk_attention_sme_packed_header_t const *header = (nk_attention_sme_packed_header_t const *)kv_packed;
2016
2017
  nk_size_t head_dim_padded = header->head_dim_padded;
@@ -2024,6 +2025,7 @@ NK_PUBLIC void nk_attention_f16_sme(nk_f16_t const *q, void const *kv_packed, nk
2024
2025
 
2025
2026
  nk_size_t group_size = (num_kv_heads > 0) ? num_heads / num_kv_heads : 1;
2026
2027
 
2028
+ nk_sme_start_streaming_();
2027
2029
  for (nk_size_t q_head = 0; q_head < num_heads; q_head++) {
2028
2030
  nk_size_t kv_head = q_head / group_size;
2029
2031
 
@@ -2039,6 +2041,7 @@ NK_PUBLIC void nk_attention_f16_sme(nk_f16_t const *q, void const *kv_packed, nk
2039
2041
  q_block_len, kv_len, head_dim, head_dim_padded, dim_tile_count, scale);
2040
2042
  }
2041
2043
  }
2044
+ nk_sme_stop_streaming_();
2042
2045
  }
2043
2046
 
2044
2047
  NK_PUBLIC void nk_attention_causal_bf16_sme(nk_bf16_t const *q, void const *kv_packed, nk_bf16_t *output,
@@ -95,8 +95,8 @@
95
95
  #include "numkong/types.h" // `nk_u64_t`, `NK_DEFINED_LINUX_`
96
96
 
97
97
  #define NK_VERSION_MAJOR 7
98
- #define NK_VERSION_MINOR 4
99
- #define NK_VERSION_PATCH 4
98
+ #define NK_VERSION_MINOR 5
99
+ #define NK_VERSION_PATCH 0
100
100
 
101
101
  /**
102
102
  * @brief Removes compile-time dispatching, and replaces it with runtime dispatching.
@@ -500,13 +500,6 @@ NK_PUBLIC nk_capability_t nk_capabilities_x8664_(void) {
500
500
 
501
501
  #if NK_TARGET_ARM64_
502
502
 
503
- #if defined(__clang__)
504
- #pragma clang attribute push(__attribute__((target("arch=armv8.5-a+sve"))), apply_to = function)
505
- #elif defined(__GNUC__)
506
- #pragma GCC push_options
507
- #pragma GCC target("arch=armv8.5-a+sve")
508
- #endif
509
-
510
503
  #if NK_HAS_POSIX_EXTENSIONS_
511
504
  static sigjmp_buf nk_mrs_arm64_jump_buffer_;
512
505
  static void nk_mrs_arm64_sigill_handler_(int sig) {
@@ -716,12 +709,6 @@ NK_PUBLIC nk_capability_t nk_capabilities_arm64_(void) {
716
709
  #endif
717
710
  }
718
711
 
719
- #if defined(__clang__)
720
- #pragma clang attribute pop
721
- #elif defined(__GNUC__)
722
- #pragma GCC pop_options
723
- #endif
724
-
725
712
  #endif // NK_TARGET_ARM64_
726
713
 
727
714
  #if NK_TARGET_RISCV64_
@@ -104,6 +104,21 @@ NK_INTERNAL void nk_store_b256_neon_(nk_b256_vec_t const *src, void *dst) {
104
104
  /** @brief Type-agnostic 64-bit full load (NEON). */
105
105
  NK_INTERNAL void nk_load_b64_neon_(void const *src, nk_b64_vec_t *dst) { dst->u8x8 = vld1_u8((nk_u8_t const *)src); }
106
106
 
107
+ /**
108
+ * @brief 8-lane `uint16x8_t` splat that hides the source from the optimizer.
109
+ *
110
+ * GCC 13 lowers `vdupq_n_u16(X)` to `fmov v.8h, #imm` (a FEAT_FP16 encoding) whenever X matches a
111
+ * representable FP16 immediate, including bf16 bit patterns like 1.0 (`0x3F80`). That fails to
112
+ * assemble under a `+bf16`-only pragma. The empty `__asm__` constraint forces `mov w; dup v.8h, w`
113
+ * instead, valid on plain `armv8-a+simd`. No-op on Clang; skipped on MSVC (neither is affected).
114
+ */
115
+ NK_INTERNAL uint16x8_t nk_u16x8_splat_(nk_u16_t bits) {
116
+ #if defined(__GNUC__) || defined(__clang__)
117
+ __asm__("" : "+r"(bits));
118
+ #endif
119
+ return vdupq_n_u16(bits);
120
+ }
121
+
107
122
  #pragma endregion Type Punned Loads and Stores
108
123
 
109
124
  #pragma region Vectorized Conversions