numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294)
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,389 @@
1
+ /**
2
+ * @brief Dispatch Initialization for BF16 Data Types.
3
+ * @file c/dispatch_bf16.c
4
+ * @author Ash Vardanian
5
+ * @date February 3, 2026
6
+ */
7
+ #include "dispatch.h"
8
+
9
+ void nk_dispatch_bf16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) {
10
+ typedef nk_kernel_punned_t m_t;
11
+
12
+ #if NK_TARGET_V128RELAXED
13
+ if (v & nk_cap_v128relaxed_k) switch (k) {
14
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
15
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
16
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
17
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
18
+ case nk_kernel_reduce_moments_k:
19
+ *m = (m_t)&nk_reduce_moments_bf16_v128relaxed, *c = nk_cap_v128relaxed_k;
20
+ return;
21
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
22
+ case nk_kernel_dots_packed_size_k:
23
+ *m = (m_t)&nk_dots_packed_size_bf16_v128relaxed, *c = nk_cap_v128relaxed_k;
24
+ return;
25
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
26
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
27
+ case nk_kernel_dots_symmetric_k:
28
+ *m = (m_t)&nk_dots_symmetric_bf16_v128relaxed, *c = nk_cap_v128relaxed_k;
29
+ return;
30
+ case nk_kernel_angulars_packed_k:
31
+ *m = (m_t)&nk_angulars_packed_bf16_v128relaxed, *c = nk_cap_v128relaxed_k;
32
+ return;
33
+ case nk_kernel_angulars_symmetric_k:
34
+ *m = (m_t)&nk_angulars_symmetric_bf16_v128relaxed, *c = nk_cap_v128relaxed_k;
35
+ return;
36
+ case nk_kernel_euclideans_packed_k:
37
+ *m = (m_t)&nk_euclideans_packed_bf16_v128relaxed, *c = nk_cap_v128relaxed_k;
38
+ return;
39
+ case nk_kernel_euclideans_symmetric_k:
40
+ *m = (m_t)&nk_euclideans_symmetric_bf16_v128relaxed, *c = nk_cap_v128relaxed_k;
41
+ return;
42
+ case nk_kernel_maxsim_packed_size_k:
43
+ *m = (m_t)&nk_maxsim_packed_size_bf16_v128relaxed, *c = nk_cap_v128relaxed_k;
44
+ return;
45
+ case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
46
+ case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
47
+ default: break;
48
+ }
49
+ #endif
50
+ #if NK_TARGET_SME
51
+ if (v & nk_cap_sme_k) switch (k) {
52
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_bf16_sme, *c = nk_cap_sme_k; return;
53
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_sme, *c = nk_cap_sme_k; return;
54
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_sme, *c = nk_cap_sme_k; return;
55
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_sme, *c = nk_cap_sme_k; return;
56
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_sme, *c = nk_cap_sme_k; return;
57
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_bf16_sme, *c = nk_cap_sme_k; return;
58
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_bf16_sme, *c = nk_cap_sme_k; return;
59
+ case nk_kernel_euclideans_symmetric_k: *m = (m_t)&nk_euclideans_symmetric_bf16_sme, *c = nk_cap_sme_k; return;
60
+ case nk_kernel_maxsim_packed_size_k: *m = (m_t)&nk_maxsim_packed_size_bf16_sme, *c = nk_cap_sme_k; return;
61
+ case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_sme, *c = nk_cap_sme_k; return;
62
+ case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_sme, *c = nk_cap_sme_k; return;
63
+ default: break;
64
+ }
65
+ #endif
66
+ #if NK_TARGET_SVE2 && NK_TARGET_SVEBFDOT
67
+ if (v & nk_cap_sve2_k) switch (k) {
68
+ case nk_kernel_sparse_dot_k: *m = (m_t)&nk_sparse_dot_u16bf16_sve2, *c = nk_cap_sve2_k; return;
69
+ default: break;
70
+ }
71
+ #endif
72
+ #if NK_TARGET_SVEBFDOT
73
+ if (v & nk_cap_svebfdot_k) switch (k) {
74
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_svebfdot, *c = nk_cap_svebfdot_k; return;
75
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_svebfdot, *c = nk_cap_svebfdot_k; return;
76
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_svebfdot, *c = nk_cap_svebfdot_k; return;
77
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_svebfdot, *c = nk_cap_svebfdot_k; return;
78
+ default: break;
79
+ }
80
+ #endif
81
+ #if NK_TARGET_NEONBFDOT
82
+ if (v & nk_cap_neonbfdot_k) switch (k) {
83
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
84
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
85
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
86
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
87
+ case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
88
+ case nk_kernel_mahalanobis_k: *m = (m_t)&nk_mahalanobis_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
89
+ case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
90
+ case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
91
+ case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
92
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
93
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
94
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
95
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
96
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
97
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
98
+ case nk_kernel_dots_packed_size_k:
99
+ *m = (m_t)&nk_dots_packed_size_bf16_neonbfdot, *c = nk_cap_neonbfdot_k;
100
+ return;
101
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
102
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
103
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
104
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_neonbfdot, *c = nk_cap_neonbfdot_k; return;
105
+ case nk_kernel_angulars_symmetric_k:
106
+ *m = (m_t)&nk_angulars_symmetric_bf16_neonbfdot, *c = nk_cap_neonbfdot_k;
107
+ return;
108
+ case nk_kernel_euclideans_packed_k:
109
+ *m = (m_t)&nk_euclideans_packed_bf16_neonbfdot, *c = nk_cap_neonbfdot_k;
110
+ return;
111
+ case nk_kernel_euclideans_symmetric_k:
112
+ *m = (m_t)&nk_euclideans_symmetric_bf16_neonbfdot, *c = nk_cap_neonbfdot_k;
113
+ return;
114
+ default: break;
115
+ }
116
+ #endif
117
+ #if NK_TARGET_NEON
118
+ if (v & nk_cap_neon_k) switch (k) {
119
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_neon, *c = nk_cap_neon_k; return;
120
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_neon, *c = nk_cap_neon_k; return;
121
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_neon, *c = nk_cap_neon_k; return;
122
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_neon, *c = nk_cap_neon_k; return;
123
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_bf16_neon, *c = nk_cap_neon_k; return;
124
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_neon, *c = nk_cap_neon_k; return;
125
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_neon, *c = nk_cap_neon_k; return;
126
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_neon, *c = nk_cap_neon_k; return;
127
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_neon, *c = nk_cap_neon_k; return;
128
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_bf16_neon, *c = nk_cap_neon_k; return;
129
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_bf16_neon, *c = nk_cap_neon_k; return;
130
+ case nk_kernel_euclideans_symmetric_k: *m = (m_t)&nk_euclideans_symmetric_bf16_neon, *c = nk_cap_neon_k; return;
131
+ default: break;
132
+ }
133
+ #endif
134
+ #if NK_TARGET_SAPPHIREAMX
135
+ if (v & nk_cap_sapphireamx_k) switch (k) {
136
+ case nk_kernel_dots_packed_size_k:
137
+ *m = (m_t)&nk_dots_packed_size_bf16_sapphireamx, *c = nk_cap_sapphireamx_k;
138
+ return;
139
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_sapphireamx, *c = nk_cap_sapphireamx_k; return;
140
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_sapphireamx, *c = nk_cap_sapphireamx_k; return;
141
+ case nk_kernel_dots_symmetric_k:
142
+ *m = (m_t)&nk_dots_symmetric_bf16_sapphireamx, *c = nk_cap_sapphireamx_k;
143
+ return;
144
+ case nk_kernel_angulars_packed_k:
145
+ *m = (m_t)&nk_angulars_packed_bf16_sapphireamx, *c = nk_cap_sapphireamx_k;
146
+ return;
147
+ case nk_kernel_angulars_symmetric_k:
148
+ *m = (m_t)&nk_angulars_symmetric_bf16_sapphireamx, *c = nk_cap_sapphireamx_k;
149
+ return;
150
+ case nk_kernel_euclideans_packed_k:
151
+ *m = (m_t)&nk_euclideans_packed_bf16_sapphireamx, *c = nk_cap_sapphireamx_k;
152
+ return;
153
+ case nk_kernel_euclideans_symmetric_k:
154
+ *m = (m_t)&nk_euclideans_symmetric_bf16_sapphireamx, *c = nk_cap_sapphireamx_k;
155
+ return;
156
+ case nk_kernel_maxsim_packed_size_k:
157
+ *m = (m_t)&nk_maxsim_packed_size_bf16_sapphireamx, *c = nk_cap_sapphireamx_k;
158
+ return;
159
+ case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_sapphireamx, *c = nk_cap_sapphireamx_k; return;
160
+ case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_sapphireamx, *c = nk_cap_sapphireamx_k; return;
161
+ default: break;
162
+ }
163
+ #endif
164
+ #if NK_TARGET_TURIN
165
+ if (v & nk_cap_turin_k) switch (k) {
166
+ case nk_kernel_sparse_dot_k: *m = (m_t)&nk_sparse_dot_u16bf16_turin, *c = nk_cap_turin_k; return;
167
+ default: break;
168
+ }
169
+ #endif
170
+ #if NK_TARGET_GENOA
171
+ if (v & nk_cap_genoa_k) switch (k) {
172
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_genoa, *c = nk_cap_genoa_k; return;
173
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_genoa, *c = nk_cap_genoa_k; return;
174
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_genoa, *c = nk_cap_genoa_k; return;
175
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_genoa, *c = nk_cap_genoa_k; return;
176
+ case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_bf16_genoa, *c = nk_cap_genoa_k; return;
177
+ case nk_kernel_mahalanobis_k: *m = (m_t)&nk_mahalanobis_bf16_genoa, *c = nk_cap_genoa_k; return;
178
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_bf16_genoa, *c = nk_cap_genoa_k; return;
179
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_genoa, *c = nk_cap_genoa_k; return;
180
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_genoa, *c = nk_cap_genoa_k; return;
181
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_genoa, *c = nk_cap_genoa_k; return;
182
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_genoa, *c = nk_cap_genoa_k; return;
183
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_bf16_genoa, *c = nk_cap_genoa_k; return;
184
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_bf16_genoa, *c = nk_cap_genoa_k; return;
185
+ case nk_kernel_euclideans_symmetric_k:
186
+ *m = (m_t)&nk_euclideans_symmetric_bf16_genoa, *c = nk_cap_genoa_k;
187
+ return;
188
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_bf16_genoa, *c = nk_cap_genoa_k; return;
189
+ case nk_kernel_maxsim_packed_size_k: *m = (m_t)&nk_maxsim_packed_size_bf16_genoa, *c = nk_cap_genoa_k; return;
190
+ case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_genoa, *c = nk_cap_genoa_k; return;
191
+ case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_genoa, *c = nk_cap_genoa_k; return;
192
+ default: break;
193
+ }
194
+ #endif
195
+ #if NK_TARGET_SKYLAKE
196
+ if (v & nk_cap_skylake_k) switch (k) {
197
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_skylake, *c = nk_cap_skylake_k; return;
198
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_bf16_skylake, *c = nk_cap_skylake_k; return;
199
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_bf16_skylake, *c = nk_cap_skylake_k; return;
200
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_bf16_skylake, *c = nk_cap_skylake_k; return;
201
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_bf16_skylake, *c = nk_cap_skylake_k; return;
202
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_bf16_skylake, *c = nk_cap_skylake_k; return;
203
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_skylake, *c = nk_cap_skylake_k; return;
204
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_skylake, *c = nk_cap_skylake_k; return;
205
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_skylake, *c = nk_cap_skylake_k; return;
206
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_skylake, *c = nk_cap_skylake_k; return;
207
+ case nk_kernel_angulars_symmetric_k:
208
+ *m = (m_t)&nk_angulars_symmetric_bf16_skylake, *c = nk_cap_skylake_k;
209
+ return;
210
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_bf16_skylake, *c = nk_cap_skylake_k; return;
211
+ case nk_kernel_euclideans_symmetric_k:
212
+ *m = (m_t)&nk_euclideans_symmetric_bf16_skylake, *c = nk_cap_skylake_k;
213
+ return;
214
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_bf16_skylake, *c = nk_cap_skylake_k; return;
215
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_bf16_skylake, *c = nk_cap_skylake_k; return;
216
+ default: break;
217
+ }
218
+ #endif
219
+ #if NK_TARGET_ALDER
220
+ if (v & nk_cap_alder_k) switch (k) {
221
+ case nk_kernel_maxsim_packed_size_k: *m = (m_t)&nk_maxsim_packed_size_bf16_alder, *c = nk_cap_alder_k; return;
222
+ case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_alder, *c = nk_cap_alder_k; return;
223
+ case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_alder, *c = nk_cap_alder_k; return;
224
+ default: break;
225
+ }
226
+ #endif
227
+ #if NK_TARGET_HASWELL
228
+ if (v & nk_cap_haswell_k) switch (k) {
229
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_haswell, *c = nk_cap_haswell_k; return;
230
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_haswell, *c = nk_cap_haswell_k; return;
231
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_haswell, *c = nk_cap_haswell_k; return;
232
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_haswell, *c = nk_cap_haswell_k; return;
233
+ case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_bf16_haswell, *c = nk_cap_haswell_k; return;
234
+ case nk_kernel_mahalanobis_k: *m = (m_t)&nk_mahalanobis_bf16_haswell, *c = nk_cap_haswell_k; return;
235
+ case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_bf16_haswell, *c = nk_cap_haswell_k; return;
236
+ case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_bf16_haswell, *c = nk_cap_haswell_k; return;
237
+ case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_bf16_haswell, *c = nk_cap_haswell_k; return;
238
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_bf16_haswell, *c = nk_cap_haswell_k; return;
239
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_bf16_haswell, *c = nk_cap_haswell_k; return;
240
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_bf16_haswell, *c = nk_cap_haswell_k; return;
241
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_bf16_haswell, *c = nk_cap_haswell_k; return;
242
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_bf16_haswell, *c = nk_cap_haswell_k; return;
243
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_haswell, *c = nk_cap_haswell_k; return;
244
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_haswell, *c = nk_cap_haswell_k; return;
245
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_haswell, *c = nk_cap_haswell_k; return;
246
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_haswell, *c = nk_cap_haswell_k; return;
247
+ case nk_kernel_angulars_symmetric_k:
248
+ *m = (m_t)&nk_angulars_symmetric_bf16_haswell, *c = nk_cap_haswell_k;
249
+ return;
250
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_bf16_haswell, *c = nk_cap_haswell_k; return;
251
+ case nk_kernel_euclideans_symmetric_k:
252
+ *m = (m_t)&nk_euclideans_symmetric_bf16_haswell, *c = nk_cap_haswell_k;
253
+ return;
254
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_bf16_haswell, *c = nk_cap_haswell_k; return;
255
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_bf16_haswell, *c = nk_cap_haswell_k; return;
256
+ case nk_kernel_maxsim_packed_size_k:
257
+ *m = (m_t)&nk_maxsim_packed_size_bf16_haswell, *c = nk_cap_haswell_k;
258
+ return;
259
+ case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_haswell, *c = nk_cap_haswell_k; return;
260
+ case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_haswell, *c = nk_cap_haswell_k; return;
261
+ default: break;
262
+ }
263
+ #endif
264
+ #if NK_TARGET_NEONSDOT
265
+ if (v & nk_cap_neonsdot_k) switch (k) {
266
+ case nk_kernel_maxsim_packed_size_k:
267
+ *m = (m_t)&nk_maxsim_packed_size_bf16_neonsdot, *c = nk_cap_neonsdot_k;
268
+ return;
269
+ case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_neonsdot, *c = nk_cap_neonsdot_k; return;
270
+ case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_neonsdot, *c = nk_cap_neonsdot_k; return;
271
+ default: break;
272
+ }
273
+ #endif
274
+ #if NK_TARGET_RVVBF16
275
+ if (v & nk_cap_rvvbf16_k) switch (k) {
276
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_rvvbf16, *c = nk_cap_rvvbf16_k; return;
277
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_rvvbf16, *c = nk_cap_rvvbf16_k; return;
278
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_rvvbf16, *c = nk_cap_rvvbf16_k; return;
279
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_rvvbf16, *c = nk_cap_rvvbf16_k; return;
280
+ default: break;
281
+ }
282
+ #endif
283
+ #if NK_TARGET_RVV
284
+ if (v & nk_cap_rvv_k) switch (k) {
285
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_rvv, *c = nk_cap_rvv_k; return;
286
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_rvv, *c = nk_cap_rvv_k; return;
287
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_rvv, *c = nk_cap_rvv_k; return;
288
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_rvv, *c = nk_cap_rvv_k; return;
289
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_bf16_rvv, *c = nk_cap_rvv_k; return;
290
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_bf16_rvv, *c = nk_cap_rvv_k; return;
291
+ case nk_kernel_kld_k: *m = (m_t)&nk_kld_bf16_rvv, *c = nk_cap_rvv_k; return;
292
+ case nk_kernel_jsd_k: *m = (m_t)&nk_jsd_bf16_rvv, *c = nk_cap_rvv_k; return;
293
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_bf16_rvv, *c = nk_cap_rvv_k; return;
294
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_rvv, *c = nk_cap_rvv_k; return;
295
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_rvv, *c = nk_cap_rvv_k; return;
296
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_rvv, *c = nk_cap_rvv_k; return;
297
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_rvv, *c = nk_cap_rvv_k; return;
298
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_bf16_rvv, *c = nk_cap_rvv_k; return;
299
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_bf16_rvv, *c = nk_cap_rvv_k; return;
300
+ case nk_kernel_euclideans_symmetric_k: *m = (m_t)&nk_euclideans_symmetric_bf16_rvv, *c = nk_cap_rvv_k; return;
301
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_bf16_rvv, *c = nk_cap_rvv_k; return;
302
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_bf16_rvv, *c = nk_cap_rvv_k; return;
303
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_bf16_rvv, *c = nk_cap_rvv_k; return;
304
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_bf16_rvv, *c = nk_cap_rvv_k; return;
305
+ case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_bf16_rvv, *c = nk_cap_rvv_k; return;
306
+ case nk_kernel_mahalanobis_k: *m = (m_t)&nk_mahalanobis_bf16_rvv, *c = nk_cap_rvv_k; return;
307
+ case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_bf16_rvv, *c = nk_cap_rvv_k; return;
308
+ case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_bf16_rvv, *c = nk_cap_rvv_k; return;
309
+ case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_bf16_rvv, *c = nk_cap_rvv_k; return;
310
+ default: break;
311
+ }
312
+ #endif
313
+ if (v & nk_cap_serial_k) switch (k) {
314
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16_serial, *c = nk_cap_serial_k; return;
315
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_bf16_serial, *c = nk_cap_serial_k; return;
316
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_bf16_serial, *c = nk_cap_serial_k; return;
317
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_bf16_serial, *c = nk_cap_serial_k; return;
318
+ case nk_kernel_jsd_k: *m = (m_t)&nk_jsd_bf16_serial, *c = nk_cap_serial_k; return;
319
+ case nk_kernel_kld_k: *m = (m_t)&nk_kld_bf16_serial, *c = nk_cap_serial_k; return;
320
+ case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_bf16_serial, *c = nk_cap_serial_k; return;
321
+ case nk_kernel_mahalanobis_k: *m = (m_t)&nk_mahalanobis_bf16_serial, *c = nk_cap_serial_k; return;
322
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_bf16_serial, *c = nk_cap_serial_k; return;
323
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_bf16_serial, *c = nk_cap_serial_k; return;
324
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_bf16_serial, *c = nk_cap_serial_k; return;
325
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_bf16_serial, *c = nk_cap_serial_k; return;
326
+ case nk_kernel_rmsd_k: *m = (m_t)&nk_rmsd_bf16_serial, *c = nk_cap_serial_k; return;
327
+ case nk_kernel_kabsch_k: *m = (m_t)&nk_kabsch_bf16_serial, *c = nk_cap_serial_k; return;
328
+ case nk_kernel_umeyama_k: *m = (m_t)&nk_umeyama_bf16_serial, *c = nk_cap_serial_k; return;
329
+ case nk_kernel_sparse_dot_k: *m = (m_t)&nk_sparse_dot_u16bf16_serial, *c = nk_cap_serial_k; return;
330
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_bf16_serial, *c = nk_cap_serial_k; return;
331
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_bf16_serial, *c = nk_cap_serial_k; return;
332
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_bf16_serial, *c = nk_cap_serial_k; return;
333
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_bf16_serial, *c = nk_cap_serial_k; return;
334
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_bf16_serial, *c = nk_cap_serial_k; return;
335
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_bf16_serial, *c = nk_cap_serial_k; return;
336
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_bf16_serial, *c = nk_cap_serial_k; return;
337
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_bf16_serial, *c = nk_cap_serial_k; return;
338
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_bf16_serial, *c = nk_cap_serial_k; return;
339
+ case nk_kernel_euclideans_symmetric_k:
340
+ *m = (m_t)&nk_euclideans_symmetric_bf16_serial, *c = nk_cap_serial_k;
341
+ return;
342
+ case nk_kernel_maxsim_packed_size_k: *m = (m_t)&nk_maxsim_packed_size_bf16_serial, *c = nk_cap_serial_k; return;
343
+ case nk_kernel_maxsim_pack_k: *m = (m_t)&nk_maxsim_pack_bf16_serial, *c = nk_cap_serial_k; return;
344
+ case nk_kernel_maxsim_packed_k: *m = (m_t)&nk_maxsim_packed_bf16_serial, *c = nk_cap_serial_k; return;
345
+ default: break;
346
+ }
347
+
348
+ // Error fallback - zero capability signals lookup failure
349
+ *m = (m_t)nk_error_dense_, *c = 0;
350
+ }
351
+
352
+ void nk_dispatch_bf16_init_(nk_capability_t caps) {
353
+ nk_implementations_t *t = &nk_dispatch_table;
354
+ nk_capability_t used;
355
+
356
+ nk_dispatch_bf16_find_(caps, nk_kernel_dot_k, (nk_kernel_punned_t *)&t->dot_bf16, &used);
357
+ nk_dispatch_bf16_find_(caps, nk_kernel_angular_k, (nk_kernel_punned_t *)&t->angular_bf16, &used);
358
+ nk_dispatch_bf16_find_(caps, nk_kernel_euclidean_k, (nk_kernel_punned_t *)&t->euclidean_bf16, &used);
359
+ nk_dispatch_bf16_find_(caps, nk_kernel_sqeuclidean_k, (nk_kernel_punned_t *)&t->sqeuclidean_bf16, &used);
360
+ nk_dispatch_bf16_find_(caps, nk_kernel_bilinear_k, (nk_kernel_punned_t *)&t->bilinear_bf16, &used);
361
+ nk_dispatch_bf16_find_(caps, nk_kernel_mahalanobis_k, (nk_kernel_punned_t *)&t->mahalanobis_bf16, &used);
362
+ nk_dispatch_bf16_find_(caps, nk_kernel_kld_k, (nk_kernel_punned_t *)&t->kld_bf16, &used);
363
+ nk_dispatch_bf16_find_(caps, nk_kernel_jsd_k, (nk_kernel_punned_t *)&t->jsd_bf16, &used);
364
+ nk_dispatch_bf16_find_(caps, nk_kernel_rmsd_k, (nk_kernel_punned_t *)&t->rmsd_bf16, &used);
365
+ nk_dispatch_bf16_find_(caps, nk_kernel_kabsch_k, (nk_kernel_punned_t *)&t->kabsch_bf16, &used);
366
+ nk_dispatch_bf16_find_(caps, nk_kernel_umeyama_k, (nk_kernel_punned_t *)&t->umeyama_bf16, &used);
367
+ nk_dispatch_bf16_find_(caps, nk_kernel_sparse_dot_k, (nk_kernel_punned_t *)&t->sparse_dot_u16bf16, &used);
368
+ nk_dispatch_bf16_find_(caps, nk_kernel_each_scale_k, (nk_kernel_punned_t *)&t->each_scale_bf16, &used);
369
+ nk_dispatch_bf16_find_(caps, nk_kernel_each_sum_k, (nk_kernel_punned_t *)&t->each_sum_bf16, &used);
370
+ nk_dispatch_bf16_find_(caps, nk_kernel_each_blend_k, (nk_kernel_punned_t *)&t->each_blend_bf16, &used);
371
+ nk_dispatch_bf16_find_(caps, nk_kernel_each_fma_k, (nk_kernel_punned_t *)&t->each_fma_bf16, &used);
372
+ nk_dispatch_bf16_find_(caps, nk_kernel_reduce_moments_k, (nk_kernel_punned_t *)&t->reduce_moments_bf16, &used);
373
+ nk_dispatch_bf16_find_(caps, nk_kernel_reduce_minmax_k, (nk_kernel_punned_t *)&t->reduce_minmax_bf16, &used);
374
+ nk_dispatch_bf16_find_(caps, nk_kernel_dots_packed_size_k, (nk_kernel_punned_t *)&t->dots_packed_size_bf16, &used);
375
+ nk_dispatch_bf16_find_(caps, nk_kernel_dots_pack_k, (nk_kernel_punned_t *)&t->dots_pack_bf16, &used);
376
+ nk_dispatch_bf16_find_(caps, nk_kernel_dots_packed_k, (nk_kernel_punned_t *)&t->dots_packed_bf16, &used);
377
+ nk_dispatch_bf16_find_(caps, nk_kernel_dots_symmetric_k, (nk_kernel_punned_t *)&t->dots_symmetric_bf16, &used);
378
+ nk_dispatch_bf16_find_(caps, nk_kernel_angulars_packed_k, (nk_kernel_punned_t *)&t->angulars_packed_bf16, &used);
379
+ nk_dispatch_bf16_find_(caps, nk_kernel_angulars_symmetric_k, (nk_kernel_punned_t *)&t->angulars_symmetric_bf16,
380
+ &used);
381
+ nk_dispatch_bf16_find_(caps, nk_kernel_euclideans_packed_k, (nk_kernel_punned_t *)&t->euclideans_packed_bf16,
382
+ &used);
383
+ nk_dispatch_bf16_find_(caps, nk_kernel_euclideans_symmetric_k, (nk_kernel_punned_t *)&t->euclideans_symmetric_bf16,
384
+ &used);
385
+ nk_dispatch_bf16_find_(caps, nk_kernel_maxsim_packed_size_k, (nk_kernel_punned_t *)&t->maxsim_packed_size_bf16,
386
+ &used);
387
+ nk_dispatch_bf16_find_(caps, nk_kernel_maxsim_pack_k, (nk_kernel_punned_t *)&t->maxsim_pack_bf16, &used);
388
+ nk_dispatch_bf16_find_(caps, nk_kernel_maxsim_packed_k, (nk_kernel_punned_t *)&t->maxsim_packed_bf16, &used);
389
+ }
@@ -0,0 +1,52 @@
1
+ /**
2
+ * @brief Dispatch Initialization for BF16C Data Types.
3
+ * @file c/dispatch_bf16c.c
4
+ * @author Ash Vardanian
5
+ * @date February 3, 2026
6
+ */
7
+ #include "dispatch.h"
8
+
9
+ void nk_dispatch_bf16c_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) {
10
+ typedef nk_kernel_punned_t m_t;
11
+ #if NK_TARGET_NEONBFDOT
12
+ if (v & nk_cap_neonbfdot_k) switch (k) {
13
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16c_neonbfdot, *c = nk_cap_neonbfdot_k; return;
14
+ case nk_kernel_vdot_k: *m = (m_t)&nk_vdot_bf16c_neonbfdot, *c = nk_cap_neonbfdot_k; return;
15
+ case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_bf16c_neonbfdot, *c = nk_cap_neonbfdot_k; return;
16
+ default: break;
17
+ }
18
+ #endif
19
+ #if NK_TARGET_GENOA
20
+ if (v & nk_cap_genoa_k) switch (k) {
21
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16c_genoa, *c = nk_cap_genoa_k; return;
22
+ case nk_kernel_vdot_k: *m = (m_t)&nk_vdot_bf16c_genoa, *c = nk_cap_genoa_k; return;
23
+ case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_bf16c_genoa, *c = nk_cap_genoa_k; return;
24
+ default: break;
25
+ }
26
+ #endif
27
+ #if NK_TARGET_HASWELL
28
+ if (v & nk_cap_haswell_k) switch (k) {
29
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16c_haswell, *c = nk_cap_haswell_k; return;
30
+ case nk_kernel_vdot_k: *m = (m_t)&nk_vdot_bf16c_haswell, *c = nk_cap_haswell_k; return;
31
+ default: break;
32
+ }
33
+ #endif
34
+ if (v & nk_cap_serial_k) switch (k) {
35
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_bf16c_serial, *c = nk_cap_serial_k; return;
36
+ case nk_kernel_vdot_k: *m = (m_t)&nk_vdot_bf16c_serial, *c = nk_cap_serial_k; return;
37
+ case nk_kernel_bilinear_k: *m = (m_t)&nk_bilinear_bf16c_serial, *c = nk_cap_serial_k; return;
38
+ default: break;
39
+ }
40
+
41
+ // Error fallback - zero capability signals lookup failure
42
+ *m = (m_t)nk_error_dense_, *c = 0;
43
+ }
44
+
45
+ void nk_dispatch_bf16c_init_(nk_capability_t caps) {
46
+ nk_implementations_t *t = &nk_dispatch_table;
47
+ nk_capability_t used;
48
+
49
+ nk_dispatch_bf16c_find_(caps, nk_kernel_dot_k, (nk_kernel_punned_t *)&t->dot_bf16c, &used);
50
+ nk_dispatch_bf16c_find_(caps, nk_kernel_vdot_k, (nk_kernel_punned_t *)&t->vdot_bf16c, &used);
51
+ nk_dispatch_bf16c_find_(caps, nk_kernel_bilinear_k, (nk_kernel_punned_t *)&t->bilinear_bf16c, &used);
52
+ }