numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,276 @@
1
+ /**
2
+ * @brief Dispatch Initialization for E4M3 Data Types.
3
+ * @file c/dispatch_e4m3.c
4
+ * @author Ash Vardanian
5
+ * @date February 3, 2026
6
+ */
7
+ #include "dispatch.h"
8
+
9
+ void nk_dispatch_e4m3_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) { // Looks up the E4M3 kernel for kind k among the backends enabled in mask v; writes the function pointer to *m and the chosen backend's capability bit to *c.
10
+ typedef nk_kernel_punned_t m_t; // short alias used for the casts below
11
+ #if NK_TARGET_V128RELAXED // backend blocks are probed in source order; the first one whose bit is set in v and that has a case for k returns
12
+ if (v & nk_cap_v128relaxed_k) switch (k) {
13
+ case nk_kernel_reduce_moments_k:
14
+ *m = (m_t)&nk_reduce_moments_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k;
15
+ return;
16
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
17
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
18
+ case nk_kernel_dots_packed_size_k:
19
+ *m = (m_t)&nk_dots_packed_size_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k;
20
+ return;
21
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
22
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k; return;
23
+ case nk_kernel_dots_symmetric_k:
24
+ *m = (m_t)&nk_dots_symmetric_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k;
25
+ return;
26
+ case nk_kernel_angulars_packed_k:
27
+ *m = (m_t)&nk_angulars_packed_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k;
28
+ return;
29
+ case nk_kernel_angulars_symmetric_k:
30
+ *m = (m_t)&nk_angulars_symmetric_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k;
31
+ return;
32
+ case nk_kernel_euclideans_packed_k:
33
+ *m = (m_t)&nk_euclideans_packed_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k;
34
+ return;
35
+ case nk_kernel_euclideans_symmetric_k:
36
+ *m = (m_t)&nk_euclideans_symmetric_e4m3_v128relaxed, *c = nk_cap_v128relaxed_k;
37
+ return;
38
+ default: break; // kind not listed for this backend: leave the switch and keep probing the blocks below
39
+ }
40
+ #endif
41
+ #if NK_TARGET_SME // only the dots_*, angulars_* and euclideans_* kinds are listed for this backend
42
+ if (v & nk_cap_sme_k) switch (k) {
43
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_sme, *c = nk_cap_sme_k; return;
44
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_sme, *c = nk_cap_sme_k; return;
45
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_sme, *c = nk_cap_sme_k; return;
46
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_sme, *c = nk_cap_sme_k; return;
47
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_sme, *c = nk_cap_sme_k; return;
48
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_e4m3_sme, *c = nk_cap_sme_k; return;
49
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_sme, *c = nk_cap_sme_k; return;
50
+ case nk_kernel_euclideans_symmetric_k: *m = (m_t)&nk_euclideans_symmetric_e4m3_sme, *c = nk_cap_sme_k; return;
51
+ default: break;
52
+ }
53
+ #endif
54
+ #if NK_TARGET_NEONFHM
55
+ if (v & nk_cap_neonfhm_k) switch (k) {
56
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
57
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
58
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
59
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
60
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
61
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
62
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
63
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
64
+ case nk_kernel_angulars_symmetric_k:
65
+ *m = (m_t)&nk_angulars_symmetric_e4m3_neonfhm, *c = nk_cap_neonfhm_k;
66
+ return;
67
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_neonfhm, *c = nk_cap_neonfhm_k; return;
68
+ case nk_kernel_euclideans_symmetric_k:
69
+ *m = (m_t)&nk_euclideans_symmetric_e4m3_neonfhm, *c = nk_cap_neonfhm_k;
70
+ return;
71
+ default: break;
72
+ }
73
+ #endif
74
+ #if NK_TARGET_NEONBFDOT // only nk_kernel_dot_k is listed for this backend
75
+ if (v & nk_cap_neonbfdot_k) switch (k) {
76
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_neonbfdot, *c = nk_cap_neonbfdot_k; return;
77
+ default: break;
78
+ }
79
+ #endif
80
+ #if NK_TARGET_NEON
81
+ if (v & nk_cap_neon_k) switch (k) {
82
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_neon, *c = nk_cap_neon_k; return;
83
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_neon, *c = nk_cap_neon_k; return;
84
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_neon, *c = nk_cap_neon_k; return;
85
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_neon, *c = nk_cap_neon_k; return;
86
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_e4m3_neon, *c = nk_cap_neon_k; return;
87
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_e4m3_neon, *c = nk_cap_neon_k; return;
88
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_e4m3_neon, *c = nk_cap_neon_k; return;
89
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_e4m3_neon, *c = nk_cap_neon_k; return;
90
+ default: break;
91
+ }
92
+ #endif
93
+ #if NK_TARGET_SAPPHIREAMX
94
+ if (v & nk_cap_sapphireamx_k) switch (k) {
95
+ case nk_kernel_dots_packed_size_k:
96
+ *m = (m_t)&nk_dots_packed_size_e4m3_sapphireamx, *c = nk_cap_sapphireamx_k;
97
+ return;
98
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_sapphireamx, *c = nk_cap_sapphireamx_k; return;
99
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_sapphireamx, *c = nk_cap_sapphireamx_k; return;
100
+ case nk_kernel_angulars_packed_k:
101
+ *m = (m_t)&nk_angulars_packed_e4m3_sapphireamx, *c = nk_cap_sapphireamx_k;
102
+ return;
103
+ case nk_kernel_euclideans_packed_k:
104
+ *m = (m_t)&nk_euclideans_packed_e4m3_sapphireamx, *c = nk_cap_sapphireamx_k;
105
+ return;
106
+ case nk_kernel_dots_symmetric_k:
107
+ *m = (m_t)&nk_dots_symmetric_e4m3_sapphireamx, *c = nk_cap_sapphireamx_k;
108
+ return;
109
+ case nk_kernel_angulars_symmetric_k:
110
+ *m = (m_t)&nk_angulars_symmetric_e4m3_sapphireamx, *c = nk_cap_sapphireamx_k;
111
+ return;
112
+ case nk_kernel_euclideans_symmetric_k:
113
+ *m = (m_t)&nk_euclideans_symmetric_e4m3_sapphireamx, *c = nk_cap_sapphireamx_k;
114
+ return;
115
+ default: break;
116
+ }
117
+ #endif
118
+ #if NK_TARGET_SAPPHIRE // only euclidean and sqeuclidean are listed for this backend
119
+ if (v & nk_cap_sapphire_k) switch (k) {
120
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_sapphire, *c = nk_cap_sapphire_k; return;
121
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_sapphire, *c = nk_cap_sapphire_k; return;
122
+ default: break;
123
+ }
124
+ #endif
125
+ #if NK_TARGET_GENOA // no reduce_minmax case here, so that kind is resolved by a later block
126
+ if (v & nk_cap_genoa_k) switch (k) {
127
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_genoa, *c = nk_cap_genoa_k; return;
128
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_genoa, *c = nk_cap_genoa_k; return;
129
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_genoa, *c = nk_cap_genoa_k; return;
130
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_genoa, *c = nk_cap_genoa_k; return;
131
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_genoa, *c = nk_cap_genoa_k; return;
132
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_genoa, *c = nk_cap_genoa_k; return;
133
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_genoa, *c = nk_cap_genoa_k; return;
134
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_genoa, *c = nk_cap_genoa_k; return;
135
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_genoa, *c = nk_cap_genoa_k; return;
136
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_e4m3_genoa, *c = nk_cap_genoa_k; return;
137
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_genoa, *c = nk_cap_genoa_k; return;
138
+ case nk_kernel_euclideans_symmetric_k:
139
+ *m = (m_t)&nk_euclideans_symmetric_e4m3_genoa, *c = nk_cap_genoa_k;
140
+ return;
141
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e4m3_genoa, *c = nk_cap_genoa_k; return;
142
+ default: break;
143
+ }
144
+ #endif
145
+ #if NK_TARGET_SKYLAKE
146
+ if (v & nk_cap_skylake_k) switch (k) {
147
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_skylake, *c = nk_cap_skylake_k; return;
148
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_skylake, *c = nk_cap_skylake_k; return;
149
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_skylake, *c = nk_cap_skylake_k; return;
150
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_skylake, *c = nk_cap_skylake_k; return;
151
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_skylake, *c = nk_cap_skylake_k; return;
152
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_skylake, *c = nk_cap_skylake_k; return;
153
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_skylake, *c = nk_cap_skylake_k; return;
154
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_skylake, *c = nk_cap_skylake_k; return;
155
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_skylake, *c = nk_cap_skylake_k; return;
156
+ case nk_kernel_angulars_symmetric_k:
157
+ *m = (m_t)&nk_angulars_symmetric_e4m3_skylake, *c = nk_cap_skylake_k;
158
+ return;
159
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_skylake, *c = nk_cap_skylake_k; return;
160
+ case nk_kernel_euclideans_symmetric_k:
161
+ *m = (m_t)&nk_euclideans_symmetric_e4m3_skylake, *c = nk_cap_skylake_k;
162
+ return;
163
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e4m3_skylake, *c = nk_cap_skylake_k; return;
164
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e4m3_skylake, *c = nk_cap_skylake_k; return;
165
+ default: break;
166
+ }
167
+ #endif
168
+ #if NK_TARGET_HASWELL
169
+ if (v & nk_cap_haswell_k) switch (k) {
170
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_haswell, *c = nk_cap_haswell_k; return;
171
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e4m3_haswell, *c = nk_cap_haswell_k; return;
172
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e4m3_haswell, *c = nk_cap_haswell_k; return;
173
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_haswell, *c = nk_cap_haswell_k; return;
174
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_haswell, *c = nk_cap_haswell_k; return;
175
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_haswell, *c = nk_cap_haswell_k; return;
176
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_haswell, *c = nk_cap_haswell_k; return;
177
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_haswell, *c = nk_cap_haswell_k; return;
178
+ case nk_kernel_angulars_symmetric_k:
179
+ *m = (m_t)&nk_angulars_symmetric_e4m3_haswell, *c = nk_cap_haswell_k;
180
+ return;
181
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_haswell, *c = nk_cap_haswell_k; return;
182
+ case nk_kernel_euclideans_symmetric_k:
183
+ *m = (m_t)&nk_euclideans_symmetric_e4m3_haswell, *c = nk_cap_haswell_k;
184
+ return;
185
+ default: break;
186
+ }
187
+ #endif
188
+ #if NK_TARGET_RVVHALF // only nk_kernel_dot_k is listed for this backend
189
+ if (v & nk_cap_rvvhalf_k) switch (k) {
190
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_rvvhalf, *c = nk_cap_rvvhalf_k; return;
191
+ default: break;
192
+ }
193
+ #endif
194
+ #if NK_TARGET_RVVBF16 // only nk_kernel_dot_k is listed for this backend
195
+ if (v & nk_cap_rvvbf16_k) switch (k) {
196
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_rvvbf16, *c = nk_cap_rvvbf16_k; return;
197
+ default: break;
198
+ }
199
+ #endif
200
+ #if NK_TARGET_RVV
201
+ if (v & nk_cap_rvv_k) switch (k) {
202
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_rvv, *c = nk_cap_rvv_k; return;
203
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_rvv, *c = nk_cap_rvv_k; return;
204
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_rvv, *c = nk_cap_rvv_k; return;
205
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_rvv, *c = nk_cap_rvv_k; return;
206
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e4m3_rvv, *c = nk_cap_rvv_k; return;
207
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e4m3_rvv, *c = nk_cap_rvv_k; return;
208
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_rvv, *c = nk_cap_rvv_k; return;
209
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_rvv, *c = nk_cap_rvv_k; return;
210
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_rvv, *c = nk_cap_rvv_k; return;
211
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_rvv, *c = nk_cap_rvv_k; return;
212
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_rvv, *c = nk_cap_rvv_k; return;
213
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_e4m3_rvv, *c = nk_cap_rvv_k; return;
214
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_rvv, *c = nk_cap_rvv_k; return;
215
+ case nk_kernel_euclideans_symmetric_k: *m = (m_t)&nk_euclideans_symmetric_e4m3_rvv, *c = nk_cap_rvv_k; return;
216
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_e4m3_rvv, *c = nk_cap_rvv_k; return;
217
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_e4m3_rvv, *c = nk_cap_rvv_k; return;
218
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_e4m3_rvv, *c = nk_cap_rvv_k; return;
219
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_e4m3_rvv, *c = nk_cap_rvv_k; return;
220
+ default: break;
221
+ }
222
+ #endif
223
+ if (v & nk_cap_serial_k) switch (k) { // not wrapped in an #if, but still requires the serial bit to be set in v
224
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e4m3_serial, *c = nk_cap_serial_k; return;
225
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e4m3_serial, *c = nk_cap_serial_k; return;
226
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e4m3_serial, *c = nk_cap_serial_k; return;
227
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e4m3_serial, *c = nk_cap_serial_k; return;
228
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e4m3_serial, *c = nk_cap_serial_k; return;
229
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e4m3_serial, *c = nk_cap_serial_k; return;
230
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_e4m3_serial, *c = nk_cap_serial_k; return;
231
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_e4m3_serial, *c = nk_cap_serial_k; return;
232
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_e4m3_serial, *c = nk_cap_serial_k; return;
233
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_e4m3_serial, *c = nk_cap_serial_k; return;
234
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e4m3_serial, *c = nk_cap_serial_k; return;
235
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e4m3_serial, *c = nk_cap_serial_k; return;
236
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e4m3_serial, *c = nk_cap_serial_k; return;
237
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e4m3_serial, *c = nk_cap_serial_k; return;
238
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e4m3_serial, *c = nk_cap_serial_k; return;
239
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_e4m3_serial, *c = nk_cap_serial_k; return;
240
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e4m3_serial, *c = nk_cap_serial_k; return;
241
+ case nk_kernel_euclideans_symmetric_k:
242
+ *m = (m_t)&nk_euclideans_symmetric_e4m3_serial, *c = nk_cap_serial_k;
243
+ return;
244
+ default: break;
245
+ }
246
+
247
+ // Error fallback - zero capability signals lookup failure
248
+ *m = (m_t)nk_error_dense_, *c = 0; // reached only when no block above returned for this (v, k) pair
249
+ }
250
+
251
+ void nk_dispatch_e4m3_init_(nk_capability_t caps) { // Fills the E4M3 slots of nk_dispatch_table by running one lookup per kernel kind against the capability mask caps.
252
+ nk_implementations_t *t = &nk_dispatch_table; // table whose e4m3 members are written below
253
+ nk_capability_t used; // out-parameter for each lookup; overwritten by every call and never read in this function
254
+
255
+ nk_dispatch_e4m3_find_(caps, nk_kernel_dot_k, (nk_kernel_punned_t *)&t->dot_e4m3, &used); // the slot address is cast so the lookup can store a punned pointer into it
256
+ nk_dispatch_e4m3_find_(caps, nk_kernel_angular_k, (nk_kernel_punned_t *)&t->angular_e4m3, &used);
257
+ nk_dispatch_e4m3_find_(caps, nk_kernel_euclidean_k, (nk_kernel_punned_t *)&t->euclidean_e4m3, &used);
258
+ nk_dispatch_e4m3_find_(caps, nk_kernel_sqeuclidean_k, (nk_kernel_punned_t *)&t->sqeuclidean_e4m3, &used);
259
+ nk_dispatch_e4m3_find_(caps, nk_kernel_each_scale_k, (nk_kernel_punned_t *)&t->each_scale_e4m3, &used);
260
+ nk_dispatch_e4m3_find_(caps, nk_kernel_each_sum_k, (nk_kernel_punned_t *)&t->each_sum_e4m3, &used);
261
+ nk_dispatch_e4m3_find_(caps, nk_kernel_each_blend_k, (nk_kernel_punned_t *)&t->each_blend_e4m3, &used);
262
+ nk_dispatch_e4m3_find_(caps, nk_kernel_each_fma_k, (nk_kernel_punned_t *)&t->each_fma_e4m3, &used);
263
+ nk_dispatch_e4m3_find_(caps, nk_kernel_reduce_moments_k, (nk_kernel_punned_t *)&t->reduce_moments_e4m3, &used);
264
+ nk_dispatch_e4m3_find_(caps, nk_kernel_reduce_minmax_k, (nk_kernel_punned_t *)&t->reduce_minmax_e4m3, &used);
265
+ nk_dispatch_e4m3_find_(caps, nk_kernel_dots_packed_size_k, (nk_kernel_punned_t *)&t->dots_packed_size_e4m3, &used);
266
+ nk_dispatch_e4m3_find_(caps, nk_kernel_dots_pack_k, (nk_kernel_punned_t *)&t->dots_pack_e4m3, &used);
267
+ nk_dispatch_e4m3_find_(caps, nk_kernel_dots_packed_k, (nk_kernel_punned_t *)&t->dots_packed_e4m3, &used);
268
+ nk_dispatch_e4m3_find_(caps, nk_kernel_dots_symmetric_k, (nk_kernel_punned_t *)&t->dots_symmetric_e4m3, &used);
269
+ nk_dispatch_e4m3_find_(caps, nk_kernel_angulars_packed_k, (nk_kernel_punned_t *)&t->angulars_packed_e4m3, &used);
270
+ nk_dispatch_e4m3_find_(caps, nk_kernel_angulars_symmetric_k, (nk_kernel_punned_t *)&t->angulars_symmetric_e4m3,
271
+ &used);
272
+ nk_dispatch_e4m3_find_(caps, nk_kernel_euclideans_packed_k, (nk_kernel_punned_t *)&t->euclideans_packed_e4m3,
273
+ &used);
274
+ nk_dispatch_e4m3_find_(caps, nk_kernel_euclideans_symmetric_k, (nk_kernel_punned_t *)&t->euclideans_symmetric_e4m3,
275
+ &used);
276
+ }
@@ -0,0 +1,272 @@
1
+ /**
2
+ * @brief Dispatch Initialization for E5M2 Data Types.
3
+ * @file c/dispatch_e5m2.c
4
+ * @author Ash Vardanian
5
+ * @date February 3, 2026
6
+ */
7
+ #include "dispatch.h"
8
+
9
+ void nk_dispatch_e5m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) {
10
+ typedef nk_kernel_punned_t m_t;
11
+ #if NK_TARGET_V128RELAXED
12
+ if (v & nk_cap_v128relaxed_k) switch (k) {
13
+ case nk_kernel_reduce_moments_k:
14
+ *m = (m_t)&nk_reduce_moments_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k;
15
+ return;
16
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
17
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
18
+ case nk_kernel_dots_packed_size_k:
19
+ *m = (m_t)&nk_dots_packed_size_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k;
20
+ return;
21
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
22
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k; return;
23
+ case nk_kernel_dots_symmetric_k:
24
+ *m = (m_t)&nk_dots_symmetric_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k;
25
+ return;
26
+ case nk_kernel_angulars_packed_k:
27
+ *m = (m_t)&nk_angulars_packed_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k;
28
+ return;
29
+ case nk_kernel_angulars_symmetric_k:
30
+ *m = (m_t)&nk_angulars_symmetric_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k;
31
+ return;
32
+ case nk_kernel_euclideans_packed_k:
33
+ *m = (m_t)&nk_euclideans_packed_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k;
34
+ return;
35
+ case nk_kernel_euclideans_symmetric_k:
36
+ *m = (m_t)&nk_euclideans_symmetric_e5m2_v128relaxed, *c = nk_cap_v128relaxed_k;
37
+ return;
38
+ default: break;
39
+ }
40
+ #endif
41
+ #if NK_TARGET_SME
42
+ if (v & nk_cap_sme_k) switch (k) {
43
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_sme, *c = nk_cap_sme_k; return;
44
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_sme, *c = nk_cap_sme_k; return;
45
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_sme, *c = nk_cap_sme_k; return;
46
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_sme, *c = nk_cap_sme_k; return;
47
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_sme, *c = nk_cap_sme_k; return;
48
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_e5m2_sme, *c = nk_cap_sme_k; return;
49
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_sme, *c = nk_cap_sme_k; return;
50
+ case nk_kernel_euclideans_symmetric_k: *m = (m_t)&nk_euclideans_symmetric_e5m2_sme, *c = nk_cap_sme_k; return;
51
+ default: break;
52
+ }
53
+ #endif
54
+ #if NK_TARGET_NEONFHM
55
+ if (v & nk_cap_neonfhm_k) switch (k) {
56
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
57
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
58
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
59
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
60
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
61
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
62
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
63
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
64
+ case nk_kernel_angulars_symmetric_k:
65
+ *m = (m_t)&nk_angulars_symmetric_e5m2_neonfhm, *c = nk_cap_neonfhm_k;
66
+ return;
67
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_neonfhm, *c = nk_cap_neonfhm_k; return;
68
+ case nk_kernel_euclideans_symmetric_k:
69
+ *m = (m_t)&nk_euclideans_symmetric_e5m2_neonfhm, *c = nk_cap_neonfhm_k;
70
+ return;
71
+ default: break;
72
+ }
73
+ #endif
74
+ #if NK_TARGET_NEONBFDOT
75
+ if (v & nk_cap_neonbfdot_k) switch (k) {
76
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_neonbfdot, *c = nk_cap_neonbfdot_k; return;
77
+ default: break;
78
+ }
79
+ #endif
80
+ #if NK_TARGET_NEON
81
+ if (v & nk_cap_neon_k) switch (k) {
82
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_neon, *c = nk_cap_neon_k; return;
83
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_neon, *c = nk_cap_neon_k; return;
84
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_neon, *c = nk_cap_neon_k; return;
85
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_neon, *c = nk_cap_neon_k; return;
86
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_e5m2_neon, *c = nk_cap_neon_k; return;
87
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_e5m2_neon, *c = nk_cap_neon_k; return;
88
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_e5m2_neon, *c = nk_cap_neon_k; return;
89
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_e5m2_neon, *c = nk_cap_neon_k; return;
90
+ default: break;
91
+ }
92
+ #endif
93
+ #if NK_TARGET_SAPPHIREAMX
94
+ if (v & nk_cap_sapphireamx_k) switch (k) {
95
+ case nk_kernel_dots_packed_size_k:
96
+ *m = (m_t)&nk_dots_packed_size_e5m2_sapphireamx, *c = nk_cap_sapphireamx_k;
97
+ return;
98
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_sapphireamx, *c = nk_cap_sapphireamx_k; return;
99
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_sapphireamx, *c = nk_cap_sapphireamx_k; return;
100
+ case nk_kernel_angulars_packed_k:
101
+ *m = (m_t)&nk_angulars_packed_e5m2_sapphireamx, *c = nk_cap_sapphireamx_k;
102
+ return;
103
+ case nk_kernel_euclideans_packed_k:
104
+ *m = (m_t)&nk_euclideans_packed_e5m2_sapphireamx, *c = nk_cap_sapphireamx_k;
105
+ return;
106
+ case nk_kernel_dots_symmetric_k:
107
+ *m = (m_t)&nk_dots_symmetric_e5m2_sapphireamx, *c = nk_cap_sapphireamx_k;
108
+ return;
109
+ case nk_kernel_angulars_symmetric_k:
110
+ *m = (m_t)&nk_angulars_symmetric_e5m2_sapphireamx, *c = nk_cap_sapphireamx_k;
111
+ return;
112
+ case nk_kernel_euclideans_symmetric_k:
113
+ *m = (m_t)&nk_euclideans_symmetric_e5m2_sapphireamx, *c = nk_cap_sapphireamx_k;
114
+ return;
115
+ default: break;
116
+ }
117
+ #endif
118
+ #if NK_TARGET_GENOA
119
+ if (v & nk_cap_genoa_k) switch (k) {
120
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_genoa, *c = nk_cap_genoa_k; return;
121
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_genoa, *c = nk_cap_genoa_k; return;
122
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_genoa, *c = nk_cap_genoa_k; return;
123
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_genoa, *c = nk_cap_genoa_k; return;
124
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_genoa, *c = nk_cap_genoa_k; return;
125
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_genoa, *c = nk_cap_genoa_k; return;
126
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_genoa, *c = nk_cap_genoa_k; return;
127
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_genoa, *c = nk_cap_genoa_k; return;
128
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_genoa, *c = nk_cap_genoa_k; return;
129
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_e5m2_genoa, *c = nk_cap_genoa_k; return;
130
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_genoa, *c = nk_cap_genoa_k; return;
131
+ case nk_kernel_euclideans_symmetric_k:
132
+ *m = (m_t)&nk_euclideans_symmetric_e5m2_genoa, *c = nk_cap_genoa_k;
133
+ return;
134
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e5m2_genoa, *c = nk_cap_genoa_k; return;
135
+ default: break;
136
+ }
137
+ #endif
138
+ #if NK_TARGET_SKYLAKE
139
+ if (v & nk_cap_skylake_k) switch (k) {
140
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_skylake, *c = nk_cap_skylake_k; return;
141
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_skylake, *c = nk_cap_skylake_k; return;
142
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_skylake, *c = nk_cap_skylake_k; return;
143
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_skylake, *c = nk_cap_skylake_k; return;
144
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_skylake, *c = nk_cap_skylake_k; return;
145
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_skylake, *c = nk_cap_skylake_k; return;
146
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_skylake, *c = nk_cap_skylake_k; return;
147
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_skylake, *c = nk_cap_skylake_k; return;
148
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_skylake, *c = nk_cap_skylake_k; return;
149
+ case nk_kernel_angulars_symmetric_k:
150
+ *m = (m_t)&nk_angulars_symmetric_e5m2_skylake, *c = nk_cap_skylake_k;
151
+ return;
152
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_skylake, *c = nk_cap_skylake_k; return;
153
+ case nk_kernel_euclideans_symmetric_k:
154
+ *m = (m_t)&nk_euclideans_symmetric_e5m2_skylake, *c = nk_cap_skylake_k;
155
+ return;
156
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e5m2_skylake, *c = nk_cap_skylake_k; return;
157
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e5m2_skylake, *c = nk_cap_skylake_k; return;
158
+ default: break;
159
+ }
160
+ #endif
161
+ #if NK_TARGET_HASWELL
162
+ if (v & nk_cap_haswell_k) switch (k) {
163
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_haswell, *c = nk_cap_haswell_k; return;
164
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_haswell, *c = nk_cap_haswell_k; return;
165
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_haswell, *c = nk_cap_haswell_k; return;
166
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_haswell, *c = nk_cap_haswell_k; return;
167
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e5m2_haswell, *c = nk_cap_haswell_k; return;
168
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e5m2_haswell, *c = nk_cap_haswell_k; return;
169
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_haswell, *c = nk_cap_haswell_k; return;
170
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_haswell, *c = nk_cap_haswell_k; return;
171
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_haswell, *c = nk_cap_haswell_k; return;
172
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_haswell, *c = nk_cap_haswell_k; return;
173
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_haswell, *c = nk_cap_haswell_k; return;
174
+ case nk_kernel_angulars_symmetric_k:
175
+ *m = (m_t)&nk_angulars_symmetric_e5m2_haswell, *c = nk_cap_haswell_k;
176
+ return;
177
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_haswell, *c = nk_cap_haswell_k; return;
178
+ case nk_kernel_euclideans_symmetric_k:
179
+ *m = (m_t)&nk_euclideans_symmetric_e5m2_haswell, *c = nk_cap_haswell_k;
180
+ return;
181
+ default: break;
182
+ }
183
+ #endif
184
+ #if NK_TARGET_RVVHALF
185
+ if (v & nk_cap_rvvhalf_k) switch (k) {
186
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_rvvhalf, *c = nk_cap_rvvhalf_k; return;
187
+ default: break;
188
+ }
189
+ #endif
190
+ #if NK_TARGET_RVVBF16
191
+ if (v & nk_cap_rvvbf16_k) switch (k) {
192
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_rvvbf16, *c = nk_cap_rvvbf16_k; return;
193
+ default: break;
194
+ }
195
+ #endif
196
+ #if NK_TARGET_RVV
197
+ if (v & nk_cap_rvv_k) switch (k) {
198
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_rvv, *c = nk_cap_rvv_k; return;
199
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_rvv, *c = nk_cap_rvv_k; return;
200
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_rvv, *c = nk_cap_rvv_k; return;
201
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_rvv, *c = nk_cap_rvv_k; return;
202
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e5m2_rvv, *c = nk_cap_rvv_k; return;
203
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e5m2_rvv, *c = nk_cap_rvv_k; return;
204
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_rvv, *c = nk_cap_rvv_k; return;
205
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_rvv, *c = nk_cap_rvv_k; return;
206
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_rvv, *c = nk_cap_rvv_k; return;
207
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_rvv, *c = nk_cap_rvv_k; return;
208
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_rvv, *c = nk_cap_rvv_k; return;
209
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_e5m2_rvv, *c = nk_cap_rvv_k; return;
210
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_rvv, *c = nk_cap_rvv_k; return;
211
+ case nk_kernel_euclideans_symmetric_k: *m = (m_t)&nk_euclideans_symmetric_e5m2_rvv, *c = nk_cap_rvv_k; return;
212
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_e5m2_rvv, *c = nk_cap_rvv_k; return;
213
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_e5m2_rvv, *c = nk_cap_rvv_k; return;
214
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_e5m2_rvv, *c = nk_cap_rvv_k; return;
215
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_e5m2_rvv, *c = nk_cap_rvv_k; return;
216
+ default: break;
217
+ }
218
+ #endif
219
+ if (v & nk_cap_serial_k) switch (k) {
220
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_e5m2_serial, *c = nk_cap_serial_k; return;
221
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_e5m2_serial, *c = nk_cap_serial_k; return;
222
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_e5m2_serial, *c = nk_cap_serial_k; return;
223
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_e5m2_serial, *c = nk_cap_serial_k; return;
224
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_e5m2_serial, *c = nk_cap_serial_k; return;
225
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_e5m2_serial, *c = nk_cap_serial_k; return;
226
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_e5m2_serial, *c = nk_cap_serial_k; return;
227
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_e5m2_serial, *c = nk_cap_serial_k; return;
228
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_e5m2_serial, *c = nk_cap_serial_k; return;
229
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_e5m2_serial, *c = nk_cap_serial_k; return;
230
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_e5m2_serial, *c = nk_cap_serial_k; return;
231
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_serial, *c = nk_cap_serial_k; return;
232
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_serial, *c = nk_cap_serial_k; return;
233
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_serial, *c = nk_cap_serial_k; return;
234
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_e5m2_serial, *c = nk_cap_serial_k; return;
235
+ case nk_kernel_angulars_symmetric_k: *m = (m_t)&nk_angulars_symmetric_e5m2_serial, *c = nk_cap_serial_k; return;
236
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_e5m2_serial, *c = nk_cap_serial_k; return;
237
+ case nk_kernel_euclideans_symmetric_k:
238
+ *m = (m_t)&nk_euclideans_symmetric_e5m2_serial, *c = nk_cap_serial_k;
239
+ return;
240
+ default: break;
241
+ }
242
+
243
+ // Error fallback - zero capability signals lookup failure
244
+ *m = (m_t)nk_error_dense_, *c = 0;
245
+ }
246
+
247
+ void nk_dispatch_e5m2_init_(nk_capability_t caps) {
248
+ nk_implementations_t *t = &nk_dispatch_table;
249
+ nk_capability_t used;
250
+
251
+ nk_dispatch_e5m2_find_(caps, nk_kernel_dot_k, (nk_kernel_punned_t *)&t->dot_e5m2, &used);
252
+ nk_dispatch_e5m2_find_(caps, nk_kernel_angular_k, (nk_kernel_punned_t *)&t->angular_e5m2, &used);
253
+ nk_dispatch_e5m2_find_(caps, nk_kernel_euclidean_k, (nk_kernel_punned_t *)&t->euclidean_e5m2, &used);
254
+ nk_dispatch_e5m2_find_(caps, nk_kernel_sqeuclidean_k, (nk_kernel_punned_t *)&t->sqeuclidean_e5m2, &used);
255
+ nk_dispatch_e5m2_find_(caps, nk_kernel_each_scale_k, (nk_kernel_punned_t *)&t->each_scale_e5m2, &used);
256
+ nk_dispatch_e5m2_find_(caps, nk_kernel_each_sum_k, (nk_kernel_punned_t *)&t->each_sum_e5m2, &used);
257
+ nk_dispatch_e5m2_find_(caps, nk_kernel_each_blend_k, (nk_kernel_punned_t *)&t->each_blend_e5m2, &used);
258
+ nk_dispatch_e5m2_find_(caps, nk_kernel_each_fma_k, (nk_kernel_punned_t *)&t->each_fma_e5m2, &used);
259
+ nk_dispatch_e5m2_find_(caps, nk_kernel_reduce_moments_k, (nk_kernel_punned_t *)&t->reduce_moments_e5m2, &used);
260
+ nk_dispatch_e5m2_find_(caps, nk_kernel_reduce_minmax_k, (nk_kernel_punned_t *)&t->reduce_minmax_e5m2, &used);
261
+ nk_dispatch_e5m2_find_(caps, nk_kernel_dots_packed_size_k, (nk_kernel_punned_t *)&t->dots_packed_size_e5m2, &used);
262
+ nk_dispatch_e5m2_find_(caps, nk_kernel_dots_pack_k, (nk_kernel_punned_t *)&t->dots_pack_e5m2, &used);
263
+ nk_dispatch_e5m2_find_(caps, nk_kernel_dots_packed_k, (nk_kernel_punned_t *)&t->dots_packed_e5m2, &used);
264
+ nk_dispatch_e5m2_find_(caps, nk_kernel_dots_symmetric_k, (nk_kernel_punned_t *)&t->dots_symmetric_e5m2, &used);
265
+ nk_dispatch_e5m2_find_(caps, nk_kernel_angulars_packed_k, (nk_kernel_punned_t *)&t->angulars_packed_e5m2, &used);
266
+ nk_dispatch_e5m2_find_(caps, nk_kernel_angulars_symmetric_k, (nk_kernel_punned_t *)&t->angulars_symmetric_e5m2,
267
+ &used);
268
+ nk_dispatch_e5m2_find_(caps, nk_kernel_euclideans_packed_k, (nk_kernel_punned_t *)&t->euclideans_packed_e5m2,
269
+ &used);
270
+ nk_dispatch_e5m2_find_(caps, nk_kernel_euclideans_symmetric_k, (nk_kernel_punned_t *)&t->euclideans_symmetric_e5m2,
271
+ &used);
272
+ }