numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,330 @@
1
+ /**
2
+ * @brief Dispatch Initialization for Type Conversions and Scalar Math.
3
+ * @file c/dispatch_other.c
4
+ * @author Ash Vardanian
5
+ * @date February 3, 2026
6
+ */
7
+ #include "dispatch.h"
8
+
9
+ void nk_dispatch_cast_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) { // Looks up kernel kind `k` among the backends enabled in mask `v`; writes the kernel pointer to `*m` and the matching capability bit to `*c`
10
+ typedef nk_kernel_punned_t m_t; // Local shorthand used for the casts below
11
+ #if NK_TARGET_NEON
12
+ if (v & nk_cap_neon_k) switch (k) { // Backends are probed in source order; the first one that is compiled in, set in `v`, and handles `k` returns immediately
13
+ case nk_kernel_cast_k: *m = (m_t)&nk_cast_neon, *c = nk_cap_neon_k; return;
14
+ default: break; // Kind not handled by this backend: keep probing the following ones
15
+ }
16
+ #endif
17
+ #if NK_TARGET_SAPPHIRE
18
+ if (v & nk_cap_sapphire_k) switch (k) {
19
+ case nk_kernel_cast_k: *m = (m_t)&nk_cast_sapphire, *c = nk_cap_sapphire_k; return;
20
+ default: break;
21
+ }
22
+ #endif
23
+ #if NK_TARGET_ICELAKE
24
+ if (v & nk_cap_icelake_k) switch (k) {
25
+ case nk_kernel_cast_k: *m = (m_t)&nk_cast_icelake, *c = nk_cap_icelake_k; return;
26
+ default: break;
27
+ }
28
+ #endif
29
+ #if NK_TARGET_SKYLAKE
30
+ if (v & nk_cap_skylake_k) switch (k) {
31
+ case nk_kernel_cast_k: *m = (m_t)&nk_cast_skylake, *c = nk_cap_skylake_k; return;
32
+ default: break;
33
+ }
34
+ #endif
35
+ #if NK_TARGET_HASWELL
36
+ if (v & nk_cap_haswell_k) switch (k) {
37
+ case nk_kernel_cast_k: *m = (m_t)&nk_cast_haswell, *c = nk_cap_haswell_k; return;
38
+ default: break;
39
+ }
40
+ #endif
41
+ #if NK_TARGET_RVV
42
+ if (v & nk_cap_rvv_k) switch (k) {
43
+ case nk_kernel_cast_k: *m = (m_t)&nk_cast_rvv, *c = nk_cap_rvv_k; return;
44
+ default: break;
45
+ }
46
+ #endif
47
+ if (v & nk_cap_serial_k) switch (k) { // The serial backend is not guarded by any NK_TARGET_* macro, so it is always compiled in
48
+ case nk_kernel_cast_k: *m = (m_t)&nk_cast_serial, *c = nk_cap_serial_k; return;
49
+ default: break;
50
+ }
51
+
52
+ // Error fallback: reached only when no probed backend returned above.
+ // Stores `nk_error_dense_` in `*m`; the zero written to `*c` signals lookup failure to the caller.
53
+ *m = (m_t)nk_error_dense_, *c = 0;
54
+ }
55
+
56
+ void nk_dispatch_cast_init_(nk_capability_t caps) { // Fills the cast and scalar-conversion slots of `nk_dispatch_table` according to the capability mask `caps`
57
+ nk_implementations_t *t = &nk_dispatch_table;
58
+ nk_capability_t used; // Receives the capability chosen by the lookup below; it is not read again in this function
59
+
60
+ // Type casting (buffer-to-buffer): resolved through nk_dispatch_cast_find_ and stored in `t->cast`
61
+ nk_dispatch_cast_find_(caps, nk_kernel_cast_k, (nk_kernel_punned_t *)&t->cast, &used);
62
+
63
+ // Scalar conversions: bf16 ↔ f32 (only the serial implementations are assigned here)
64
+ t->bf16_to_f32 = &nk_bf16_to_f32_serial;
65
+ t->f32_to_bf16 = &nk_f32_to_bf16_serial;
66
+
67
+ // Scalar conversions: f16 ↔ f32 (serial defaults; the guarded blocks below may overwrite them)
68
+ t->f16_to_f32 = &nk_f16_to_f32_serial;
69
+ t->f32_to_f16 = &nk_f32_to_f16_serial;
70
+
71
+ #if NK_TARGET_HASWELL
72
+ if (caps & nk_cap_haswell_k) { // Overrides are applied in source order, so a later matching block replaces an earlier one
73
+ t->f16_to_f32 = &nk_f16_to_f32_haswell;
74
+ t->f32_to_f16 = &nk_f32_to_f16_haswell;
75
+ }
76
+ #endif
77
+
78
+ #if NK_TARGET_SAPPHIRE
79
+ if (caps & nk_cap_sapphire_k) { // Runs after the Haswell block, so Sapphire wins when both bits are set in `caps`
80
+ t->f16_to_f32 = &nk_f16_to_f32_sapphire;
81
+ t->f32_to_f16 = &nk_f32_to_f16_sapphire;
82
+ }
83
+ #endif
84
+
85
+ #if NK_TARGET_NEON
86
+ if (caps & nk_cap_neon_k) {
87
+ t->f16_to_f32 = &nk_f16_to_f32_neon;
88
+ t->f32_to_f16 = &nk_f32_to_f16_neon;
89
+ }
90
+ #endif
91
+
92
+ // Scalar conversions: e5m2, e4m3, e3m2, e2m3 (serial only, no capability-specific overrides)
93
+ t->e5m2_to_f32 = &nk_e5m2_to_f32_serial;
94
+ t->f32_to_e5m2 = &nk_f32_to_e5m2_serial;
95
+ t->e4m3_to_f32 = &nk_e4m3_to_f32_serial;
96
+ t->f32_to_e4m3 = &nk_f32_to_e4m3_serial;
97
+ t->e3m2_to_f32 = &nk_e3m2_to_f32_serial;
98
+ t->f32_to_e3m2 = &nk_f32_to_e3m2_serial;
99
+ t->e2m3_to_f32 = &nk_e2m3_to_f32_serial;
100
+ t->f32_to_e2m3 = &nk_f32_to_e2m3_serial;
101
+ }
102
+
103
+ void nk_dispatch_math_init_(nk_capability_t caps) { // Fills the scalar-math, saturating-arithmetic and ordering slots of `nk_dispatch_table` according to the capability mask `caps`
104
+ nk_implementations_t *t = &nk_dispatch_table;
105
+
106
+ // Scalar math: f64 (serial defaults; guarded blocks below overwrite them in source order)
107
+ t->f64_sqrt = &nk_f64_sqrt_serial;
108
+ t->f64_rsqrt = &nk_f64_rsqrt_serial;
109
+ t->f64_fma = &nk_f64_fma_serial;
110
+
111
+ // Scalar math: f32 (serial defaults)
112
+ t->f32_sqrt = &nk_f32_sqrt_serial;
113
+ t->f32_rsqrt = &nk_f32_rsqrt_serial;
114
+ t->f32_fma = &nk_f32_fma_serial;
115
+
116
+ #if NK_TARGET_V128RELAXED
117
+ if (caps & nk_cap_v128relaxed_k) { // Replaces rsqrt and fma only; the sqrt slots keep their previous assignment
118
+ t->f64_rsqrt = &nk_f64_rsqrt_v128relaxed;
119
+ t->f64_fma = &nk_f64_fma_v128relaxed;
120
+ t->f32_rsqrt = &nk_f32_rsqrt_v128relaxed;
121
+ t->f32_fma = &nk_f32_fma_v128relaxed;
122
+ }
123
+ #endif
124
+
125
+ #if NK_TARGET_HASWELL
126
+ if (caps & nk_cap_haswell_k) { // Replaces all six f64/f32 slots
127
+ t->f64_sqrt = &nk_f64_sqrt_haswell;
128
+ t->f64_rsqrt = &nk_f64_rsqrt_haswell;
129
+ t->f64_fma = &nk_f64_fma_haswell;
130
+ t->f32_sqrt = &nk_f32_sqrt_haswell;
131
+ t->f32_rsqrt = &nk_f32_rsqrt_haswell;
132
+ t->f32_fma = &nk_f32_fma_haswell;
133
+ }
134
+ #endif
135
+
136
+ #if NK_TARGET_NEON
137
+ if (caps & nk_cap_neon_k) { // Replaces all six f64/f32 slots
138
+ t->f64_sqrt = &nk_f64_sqrt_neon;
139
+ t->f64_rsqrt = &nk_f64_rsqrt_neon;
140
+ t->f64_fma = &nk_f64_fma_neon;
141
+ t->f32_sqrt = &nk_f32_sqrt_neon;
142
+ t->f32_rsqrt = &nk_f32_rsqrt_neon;
143
+ t->f32_fma = &nk_f32_fma_neon;
144
+ }
145
+ #endif
146
+
147
+ // Scalar math: f16 (serial defaults; Haswell, NEON-half and Sapphire blocks below may overwrite them)
148
+ t->f16_sqrt = &nk_f16_sqrt_serial;
149
+ t->f16_rsqrt = &nk_f16_rsqrt_serial;
150
+ t->f16_fma = &nk_f16_fma_serial;
151
+
152
+ #if NK_TARGET_HASWELL
153
+ if (caps & nk_cap_haswell_k) {
154
+ t->f16_sqrt = &nk_f16_sqrt_haswell;
155
+ t->f16_rsqrt = &nk_f16_rsqrt_haswell;
156
+ t->f16_fma = &nk_f16_fma_haswell;
157
+ }
158
+ #endif
159
+
160
+ #if NK_TARGET_NEONHALF
161
+ if (caps & nk_cap_neonhalf_k) {
162
+ t->f16_sqrt = &nk_f16_sqrt_neonhalf;
163
+ t->f16_rsqrt = &nk_f16_rsqrt_neonhalf;
164
+ t->f16_fma = &nk_f16_fma_neonhalf;
165
+ }
166
+ #endif
167
+
168
+ #if NK_TARGET_SAPPHIRE
169
+ if (caps & nk_cap_sapphire_k) { // Runs after the Haswell f16 block, so Sapphire wins when both bits are set in `caps`
170
+ t->f16_sqrt = &nk_f16_sqrt_sapphire;
171
+ t->f16_rsqrt = &nk_f16_rsqrt_sapphire;
172
+ t->f16_fma = &nk_f16_fma_sapphire;
173
+ }
174
+ #endif
175
+
176
+ #if NK_TARGET_RVV
177
+ if (caps & nk_cap_rvv_k) { // Replaces the f64/f32 fma slots only; sqrt and rsqrt keep their previous assignment
178
+ t->f64_fma = &nk_f64_fma_rvv;
179
+ t->f32_fma = &nk_f32_fma_rvv;
180
+ }
181
+ #endif
182
+
183
+ // Saturating arithmetic (serial defaults for every signed/unsigned width, including the packed i4x2/u4x2 types)
184
+ t->i64_saturating_add = &nk_i64_saturating_add_serial;
185
+ t->i64_saturating_mul = &nk_i64_saturating_mul_serial;
186
+ t->i32_saturating_add = &nk_i32_saturating_add_serial;
187
+ t->i32_saturating_mul = &nk_i32_saturating_mul_serial;
188
+ t->i16_saturating_add = &nk_i16_saturating_add_serial;
189
+ t->i16_saturating_mul = &nk_i16_saturating_mul_serial;
190
+ t->i8_saturating_add = &nk_i8_saturating_add_serial;
191
+ t->i8_saturating_mul = &nk_i8_saturating_mul_serial;
192
+ t->i4x2_saturating_add = &nk_i4x2_saturating_add_serial;
193
+ t->i4x2_saturating_mul = &nk_i4x2_saturating_mul_serial;
194
+ t->u64_saturating_add = &nk_u64_saturating_add_serial;
195
+ t->u64_saturating_mul = &nk_u64_saturating_mul_serial;
196
+ t->u32_saturating_add = &nk_u32_saturating_add_serial;
197
+ t->u32_saturating_mul = &nk_u32_saturating_mul_serial;
198
+ t->u16_saturating_add = &nk_u16_saturating_add_serial;
199
+ t->u16_saturating_mul = &nk_u16_saturating_mul_serial;
200
+ t->u8_saturating_add = &nk_u8_saturating_add_serial;
201
+ t->u8_saturating_mul = &nk_u8_saturating_mul_serial;
202
+ t->u4x2_saturating_add = &nk_u4x2_saturating_add_serial;
203
+ t->u4x2_saturating_mul = &nk_u4x2_saturating_mul_serial;
204
+
205
+ #if NK_TARGET_RVV
206
+ if (caps & nk_cap_rvv_k) { // Replaces add and mul for the 8/16/32/64-bit types; the i4x2/u4x2 slots stay serial
207
+ t->i64_saturating_add = &nk_i64_saturating_add_rvv;
208
+ t->i64_saturating_mul = &nk_i64_saturating_mul_rvv;
209
+ t->i32_saturating_add = &nk_i32_saturating_add_rvv;
210
+ t->i32_saturating_mul = &nk_i32_saturating_mul_rvv;
211
+ t->i16_saturating_add = &nk_i16_saturating_add_rvv;
212
+ t->i16_saturating_mul = &nk_i16_saturating_mul_rvv;
213
+ t->i8_saturating_add = &nk_i8_saturating_add_rvv;
214
+ t->i8_saturating_mul = &nk_i8_saturating_mul_rvv;
215
+ t->u64_saturating_add = &nk_u64_saturating_add_rvv;
216
+ t->u64_saturating_mul = &nk_u64_saturating_mul_rvv;
217
+ t->u32_saturating_add = &nk_u32_saturating_add_rvv;
218
+ t->u32_saturating_mul = &nk_u32_saturating_mul_rvv;
219
+ t->u16_saturating_add = &nk_u16_saturating_add_rvv;
220
+ t->u16_saturating_mul = &nk_u16_saturating_mul_rvv;
221
+ t->u8_saturating_add = &nk_u8_saturating_add_rvv;
222
+ t->u8_saturating_mul = &nk_u8_saturating_mul_rvv;
223
+ }
224
+ #endif
225
+
226
+ #if NK_TARGET_NEON
227
+ if (caps & nk_cap_neon_k) { // Replaces every 8/16/32/64-bit add plus the 64-bit muls; the remaining mul slots keep their previous assignment
228
+ t->i64_saturating_add = &nk_i64_saturating_add_neon;
229
+ t->i64_saturating_mul = &nk_i64_saturating_mul_neon;
230
+ t->i32_saturating_add = &nk_i32_saturating_add_neon;
231
+ t->i16_saturating_add = &nk_i16_saturating_add_neon;
232
+ t->i8_saturating_add = &nk_i8_saturating_add_neon;
233
+ t->u64_saturating_add = &nk_u64_saturating_add_neon;
234
+ t->u64_saturating_mul = &nk_u64_saturating_mul_neon;
235
+ t->u32_saturating_add = &nk_u32_saturating_add_neon;
236
+ t->u16_saturating_add = &nk_u16_saturating_add_neon;
237
+ t->u8_saturating_add = &nk_u8_saturating_add_neon;
238
+ }
239
+ #endif
240
+
241
+ #if NK_TARGET_HASWELL
242
+ if (caps & nk_cap_haswell_k) { // Replaces only the 64-bit muls and the 8/16-bit adds; every other slot keeps its previous assignment
243
+ t->i64_saturating_mul = &nk_i64_saturating_mul_haswell;
244
+ t->i16_saturating_add = &nk_i16_saturating_add_haswell;
245
+ t->i8_saturating_add = &nk_i8_saturating_add_haswell;
246
+ t->u64_saturating_mul = &nk_u64_saturating_mul_haswell;
247
+ t->u16_saturating_add = &nk_u16_saturating_add_haswell;
248
+ t->u8_saturating_add = &nk_u8_saturating_add_haswell;
249
+ }
250
+ #endif
251
+
252
+ // Conversion-free ordering for mini-floats (serial defaults; only f16_order has an override below)
253
+ t->bf16_order = &nk_bf16_order_serial;
254
+ t->f16_order = &nk_f16_order_serial;
255
+ t->e5m2_order = &nk_e5m2_order_serial;
256
+ t->e4m3_order = &nk_e4m3_order_serial;
257
+ t->e3m2_order = &nk_e3m2_order_serial;
258
+ t->e2m3_order = &nk_e2m3_order_serial;
259
+
260
+ #if NK_TARGET_SAPPHIRE
261
+ if (caps & nk_cap_sapphire_k) { t->f16_order = &nk_f16_order_sapphire; }
262
+ #endif
263
+ }
264
+
265
+ // Scalar conversion dispatch functions: each wrapper forwards `src` and `dest` unchanged to the same-named pointer in `nk_dispatch_table`.
+ // NOTE(review): these slots are assigned in nk_dispatch_cast_init_; presumably that must have run before any wrapper is called - confirm.
266
+
267
+ NK_DYNAMIC void nk_bf16_to_f32(nk_bf16_t const *src, nk_f32_t *dest) { nk_dispatch_table.bf16_to_f32(src, dest); }
268
+ NK_DYNAMIC void nk_f32_to_bf16(nk_f32_t const *src, nk_bf16_t *dest) { nk_dispatch_table.f32_to_bf16(src, dest); }
269
+ NK_DYNAMIC void nk_f16_to_f32(nk_f16_t const *src, nk_f32_t *dest) { nk_dispatch_table.f16_to_f32(src, dest); }
270
+ NK_DYNAMIC void nk_f32_to_f16(nk_f32_t const *src, nk_f16_t *dest) { nk_dispatch_table.f32_to_f16(src, dest); }
271
+ NK_DYNAMIC void nk_e5m2_to_f32(nk_e5m2_t const *src, nk_f32_t *dest) { nk_dispatch_table.e5m2_to_f32(src, dest); }
272
+ NK_DYNAMIC void nk_f32_to_e5m2(nk_f32_t const *src, nk_e5m2_t *dest) { nk_dispatch_table.f32_to_e5m2(src, dest); }
273
+ NK_DYNAMIC void nk_e4m3_to_f32(nk_e4m3_t const *src, nk_f32_t *dest) { nk_dispatch_table.e4m3_to_f32(src, dest); }
274
+ NK_DYNAMIC void nk_f32_to_e4m3(nk_f32_t const *src, nk_e4m3_t *dest) { nk_dispatch_table.f32_to_e4m3(src, dest); }
275
+ NK_DYNAMIC void nk_e3m2_to_f32(nk_e3m2_t const *src, nk_f32_t *dest) { nk_dispatch_table.e3m2_to_f32(src, dest); }
276
+ NK_DYNAMIC void nk_f32_to_e3m2(nk_f32_t const *src, nk_e3m2_t *dest) { nk_dispatch_table.f32_to_e3m2(src, dest); }
277
+ NK_DYNAMIC void nk_e2m3_to_f32(nk_e2m3_t const *src, nk_f32_t *dest) { nk_dispatch_table.e2m3_to_f32(src, dest); }
278
+ NK_DYNAMIC void nk_f32_to_e2m3(nk_f32_t const *src, nk_e2m3_t *dest) { nk_dispatch_table.f32_to_e2m3(src, dest); }
279
+
280
+ // Scalar math dispatch functions: each wrapper returns the result of the same-named pointer in `nk_dispatch_table`, called with the arguments unchanged.
+ // NOTE(review): these slots are assigned in nk_dispatch_math_init_; presumably that must have run before any wrapper is called - confirm.
281
+
282
+ NK_DYNAMIC nk_f64_t nk_f64_sqrt(nk_f64_t x) { return nk_dispatch_table.f64_sqrt(x); }
283
+ NK_DYNAMIC nk_f64_t nk_f64_rsqrt(nk_f64_t x) { return nk_dispatch_table.f64_rsqrt(x); }
284
+ NK_DYNAMIC nk_f64_t nk_f64_fma(nk_f64_t a, nk_f64_t b, nk_f64_t c) { return nk_dispatch_table.f64_fma(a, b, c); }
285
+ NK_DYNAMIC nk_f32_t nk_f32_sqrt(nk_f32_t x) { return nk_dispatch_table.f32_sqrt(x); }
286
+ NK_DYNAMIC nk_f32_t nk_f32_rsqrt(nk_f32_t x) { return nk_dispatch_table.f32_rsqrt(x); }
287
+ NK_DYNAMIC nk_f32_t nk_f32_fma(nk_f32_t a, nk_f32_t b, nk_f32_t c) { return nk_dispatch_table.f32_fma(a, b, c); }
288
+ NK_DYNAMIC nk_f16_t nk_f16_sqrt(nk_f16_t x) { return nk_dispatch_table.f16_sqrt(x); }
289
+ NK_DYNAMIC nk_f16_t nk_f16_rsqrt(nk_f16_t x) { return nk_dispatch_table.f16_rsqrt(x); }
290
+ NK_DYNAMIC nk_f16_t nk_f16_fma(nk_f16_t a, nk_f16_t b, nk_f16_t c) { return nk_dispatch_table.f16_fma(a, b, c); }
291
+
292
+ // Saturating arithmetic dispatch functions: each wrapper returns the result of the same-named pointer in `nk_dispatch_table`, called with `a` and `b` unchanged.
+ // NOTE(review): these slots are assigned in nk_dispatch_math_init_; presumably that must have run before any wrapper is called - confirm.
293
+
294
+ NK_DYNAMIC nk_i64_t nk_i64_saturating_add(nk_i64_t a, nk_i64_t b) { return nk_dispatch_table.i64_saturating_add(a, b); }
295
+ NK_DYNAMIC nk_i64_t nk_i64_saturating_mul(nk_i64_t a, nk_i64_t b) { return nk_dispatch_table.i64_saturating_mul(a, b); }
296
+ NK_DYNAMIC nk_i32_t nk_i32_saturating_add(nk_i32_t a, nk_i32_t b) { return nk_dispatch_table.i32_saturating_add(a, b); }
297
+ NK_DYNAMIC nk_i32_t nk_i32_saturating_mul(nk_i32_t a, nk_i32_t b) { return nk_dispatch_table.i32_saturating_mul(a, b); }
298
+ NK_DYNAMIC nk_i16_t nk_i16_saturating_add(nk_i16_t a, nk_i16_t b) { return nk_dispatch_table.i16_saturating_add(a, b); }
299
+ NK_DYNAMIC nk_i16_t nk_i16_saturating_mul(nk_i16_t a, nk_i16_t b) { return nk_dispatch_table.i16_saturating_mul(a, b); }
300
+ NK_DYNAMIC nk_i8_t nk_i8_saturating_add(nk_i8_t a, nk_i8_t b) { return nk_dispatch_table.i8_saturating_add(a, b); }
301
+ NK_DYNAMIC nk_i8_t nk_i8_saturating_mul(nk_i8_t a, nk_i8_t b) { return nk_dispatch_table.i8_saturating_mul(a, b); }
302
+ NK_DYNAMIC nk_i4x2_t nk_i4x2_saturating_add(nk_i4x2_t a, nk_i4x2_t b) {
303
+ return nk_dispatch_table.i4x2_saturating_add(a, b);
304
+ }
305
+ NK_DYNAMIC nk_i4x2_t nk_i4x2_saturating_mul(nk_i4x2_t a, nk_i4x2_t b) {
306
+ return nk_dispatch_table.i4x2_saturating_mul(a, b);
307
+ }
308
+ NK_DYNAMIC nk_u64_t nk_u64_saturating_add(nk_u64_t a, nk_u64_t b) { return nk_dispatch_table.u64_saturating_add(a, b); }
309
+ NK_DYNAMIC nk_u64_t nk_u64_saturating_mul(nk_u64_t a, nk_u64_t b) { return nk_dispatch_table.u64_saturating_mul(a, b); }
310
+ NK_DYNAMIC nk_u32_t nk_u32_saturating_add(nk_u32_t a, nk_u32_t b) { return nk_dispatch_table.u32_saturating_add(a, b); }
311
+ NK_DYNAMIC nk_u32_t nk_u32_saturating_mul(nk_u32_t a, nk_u32_t b) { return nk_dispatch_table.u32_saturating_mul(a, b); }
312
+ NK_DYNAMIC nk_u16_t nk_u16_saturating_add(nk_u16_t a, nk_u16_t b) { return nk_dispatch_table.u16_saturating_add(a, b); }
313
+ NK_DYNAMIC nk_u16_t nk_u16_saturating_mul(nk_u16_t a, nk_u16_t b) { return nk_dispatch_table.u16_saturating_mul(a, b); }
314
+ NK_DYNAMIC nk_u8_t nk_u8_saturating_add(nk_u8_t a, nk_u8_t b) { return nk_dispatch_table.u8_saturating_add(a, b); }
315
+ NK_DYNAMIC nk_u8_t nk_u8_saturating_mul(nk_u8_t a, nk_u8_t b) { return nk_dispatch_table.u8_saturating_mul(a, b); }
316
+ NK_DYNAMIC nk_u4x2_t nk_u4x2_saturating_add(nk_u4x2_t a, nk_u4x2_t b) {
317
+ return nk_dispatch_table.u4x2_saturating_add(a, b);
318
+ }
319
+ NK_DYNAMIC nk_u4x2_t nk_u4x2_saturating_mul(nk_u4x2_t a, nk_u4x2_t b) {
320
+ return nk_dispatch_table.u4x2_saturating_mul(a, b);
321
+ }
322
+
323
+ // Ordering dispatch functions: each wrapper returns the `int` produced by the same-named pointer in `nk_dispatch_table`, called with `a` and `b` unchanged.
+ // NOTE(review): the meaning of the returned int (presumably a three-way comparison result) is defined by the backend implementations - confirm there.
324
+
325
+ NK_DYNAMIC int nk_bf16_order(nk_bf16_t a, nk_bf16_t b) { return nk_dispatch_table.bf16_order(a, b); }
326
+ NK_DYNAMIC int nk_f16_order(nk_f16_t a, nk_f16_t b) { return nk_dispatch_table.f16_order(a, b); }
327
+ NK_DYNAMIC int nk_e5m2_order(nk_e5m2_t a, nk_e5m2_t b) { return nk_dispatch_table.e5m2_order(a, b); }
328
+ NK_DYNAMIC int nk_e4m3_order(nk_e4m3_t a, nk_e4m3_t b) { return nk_dispatch_table.e4m3_order(a, b); }
329
+ NK_DYNAMIC int nk_e3m2_order(nk_e3m2_t a, nk_e3m2_t b) { return nk_dispatch_table.e3m2_order(a, b); }
330
+ NK_DYNAMIC int nk_e2m3_order(nk_e2m3_t a, nk_e2m3_t b) { return nk_dispatch_table.e2m3_order(a, b); }
@@ -0,0 +1,148 @@
1
+ /**
2
+ * @brief Dispatch Initialization for U1 Data Types.
3
+ * @file c/dispatch_u1.c
4
+ * @author Ash Vardanian
5
+ * @date February 3, 2026
6
+ */
7
+ #include "dispatch.h"
8
+
9
+ void nk_dispatch_u1_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) {
10
+ typedef nk_kernel_punned_t m_t;
11
+ #if NK_TARGET_V128RELAXED
12
+ if (v & nk_cap_v128relaxed_k) switch (k) {
13
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_v128relaxed, *c = nk_cap_v128relaxed_k; return;
14
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_v128relaxed, *c = nk_cap_v128relaxed_k; return;
15
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_v128relaxed, *c = nk_cap_v128relaxed_k; return;
16
+ default: break;
17
+ }
18
+ #endif
19
+ #if NK_TARGET_SMEBI32
20
+ if (v & nk_cap_sme_k) switch (k) {
21
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_smebi32, *c = nk_cap_sme_k; return;
22
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_smebi32, *c = nk_cap_sme_k; return;
23
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_smebi32, *c = nk_cap_sme_k; return;
24
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_smebi32, *c = nk_cap_sme_k; return;
25
+ case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_smebi32, *c = nk_cap_sme_k; return;
26
+ case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_smebi32, *c = nk_cap_sme_k; return;
27
+ case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_smebi32, *c = nk_cap_sme_k; return;
28
+ case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_smebi32, *c = nk_cap_sme_k; return;
29
+ default: break;
30
+ }
31
+ #endif
32
+ #if NK_TARGET_SVE
33
+ if (v & nk_cap_sve_k) switch (k) {
34
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_sve, *c = nk_cap_sve_k; return;
35
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_sve, *c = nk_cap_sve_k; return;
36
+ default: break;
37
+ }
38
+ #endif
39
+ #if NK_TARGET_NEON
40
+ if (v & nk_cap_neon_k) switch (k) {
41
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_neon, *c = nk_cap_neon_k; return;
42
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_neon, *c = nk_cap_neon_k; return;
43
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_neon, *c = nk_cap_neon_k; return;
44
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_neon, *c = nk_cap_neon_k; return;
45
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_neon, *c = nk_cap_neon_k; return;
46
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_neon, *c = nk_cap_neon_k; return;
47
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_neon, *c = nk_cap_neon_k; return;
48
+ case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_neon, *c = nk_cap_neon_k; return;
49
+ case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_neon, *c = nk_cap_neon_k; return;
50
+ case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_neon, *c = nk_cap_neon_k; return;
51
+ case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_neon, *c = nk_cap_neon_k; return;
52
+ default: break;
53
+ }
54
+ #endif
55
+ #if NK_TARGET_ICELAKE
56
+ if (v & nk_cap_icelake_k) switch (k) {
57
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_icelake, *c = nk_cap_icelake_k; return;
58
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_icelake, *c = nk_cap_icelake_k; return;
59
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_icelake, *c = nk_cap_icelake_k; return;
60
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_icelake, *c = nk_cap_icelake_k; return;
61
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_icelake, *c = nk_cap_icelake_k; return;
62
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_icelake, *c = nk_cap_icelake_k; return;
63
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_icelake, *c = nk_cap_icelake_k; return;
64
+ case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_icelake, *c = nk_cap_icelake_k; return;
65
+ case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_icelake, *c = nk_cap_icelake_k; return;
66
+ case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_icelake, *c = nk_cap_icelake_k; return;
67
+ case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_icelake, *c = nk_cap_icelake_k; return;
68
+ default: break;
69
+ }
70
+ #endif
71
+ #if NK_TARGET_SKYLAKE
72
+ if (v & nk_cap_skylake_k) switch (k) {
73
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u1_skylake, *c = nk_cap_skylake_k; return;
74
+ default: break;
75
+ }
76
+ #endif
77
+ #if NK_TARGET_HASWELL
78
+ if (v & nk_cap_haswell_k) switch (k) {
79
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_haswell, *c = nk_cap_haswell_k; return;
80
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_haswell, *c = nk_cap_haswell_k; return;
81
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_haswell, *c = nk_cap_haswell_k; return;
82
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_haswell, *c = nk_cap_haswell_k; return;
83
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_haswell, *c = nk_cap_haswell_k; return;
84
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_haswell, *c = nk_cap_haswell_k; return;
85
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_haswell, *c = nk_cap_haswell_k; return;
86
+ case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_haswell, *c = nk_cap_haswell_k; return;
87
+ case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_haswell, *c = nk_cap_haswell_k; return;
88
+ case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_haswell, *c = nk_cap_haswell_k; return;
89
+ case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_haswell, *c = nk_cap_haswell_k; return;
90
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u1_haswell, *c = nk_cap_haswell_k; return;
91
+ default: break;
92
+ }
93
+ #endif
94
+ #if NK_TARGET_RVVBB
95
+ if (v & nk_cap_rvvbb_k) switch (k) {
96
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_rvvbb, *c = nk_cap_rvvbb_k; return;
97
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_rvvbb, *c = nk_cap_rvvbb_k; return;
98
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_rvvbb, *c = nk_cap_rvvbb_k; return;
99
+ default: break;
100
+ }
101
+ #endif
102
+ #if NK_TARGET_RVV
103
+ if (v & nk_cap_rvv_k) switch (k) {
104
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_rvv, *c = nk_cap_rvv_k; return;
105
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_rvv, *c = nk_cap_rvv_k; return;
106
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_rvv, *c = nk_cap_rvv_k; return;
107
+ default: break;
108
+ }
109
+ #endif
110
+ if (v & nk_cap_serial_k) switch (k) {
111
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_serial, *c = nk_cap_serial_k; return;
112
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_serial, *c = nk_cap_serial_k; return;
113
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_serial, *c = nk_cap_serial_k; return;
114
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_serial, *c = nk_cap_serial_k; return;
115
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_serial, *c = nk_cap_serial_k; return;
116
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_serial, *c = nk_cap_serial_k; return;
117
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_serial, *c = nk_cap_serial_k; return;
118
+ case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_serial, *c = nk_cap_serial_k; return;
119
+ case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_serial, *c = nk_cap_serial_k; return;
120
+ case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_serial, *c = nk_cap_serial_k; return;
121
+ case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_serial, *c = nk_cap_serial_k; return;
122
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u1_serial, *c = nk_cap_serial_k; return;
123
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u1_serial, *c = nk_cap_serial_k; return;
124
+ default: break;
125
+ }
126
+
127
+ // Error fallback - zero capability signals lookup failure
128
+ *m = (m_t)nk_error_dense_, *c = 0;
129
+ }
130
+
131
+ void nk_dispatch_u1_init_(nk_capability_t caps) {
132
+ nk_implementations_t *t = &nk_dispatch_table;
133
+ nk_capability_t used;
134
+
135
+ nk_dispatch_u1_find_(caps, nk_kernel_dot_k, (nk_kernel_punned_t *)&t->dot_u1, &used);
136
+ nk_dispatch_u1_find_(caps, nk_kernel_hamming_k, (nk_kernel_punned_t *)&t->hamming_u1, &used);
137
+ nk_dispatch_u1_find_(caps, nk_kernel_jaccard_k, (nk_kernel_punned_t *)&t->jaccard_u1, &used);
138
+ nk_dispatch_u1_find_(caps, nk_kernel_reduce_moments_k, (nk_kernel_punned_t *)&t->reduce_moments_u1, &used);
139
+ nk_dispatch_u1_find_(caps, nk_kernel_reduce_minmax_k, (nk_kernel_punned_t *)&t->reduce_minmax_u1, &used);
140
+ nk_dispatch_u1_find_(caps, nk_kernel_dots_packed_size_k, (nk_kernel_punned_t *)&t->dots_packed_size_u1, &used);
141
+ nk_dispatch_u1_find_(caps, nk_kernel_dots_pack_k, (nk_kernel_punned_t *)&t->dots_pack_u1, &used);
142
+ nk_dispatch_u1_find_(caps, nk_kernel_dots_packed_k, (nk_kernel_punned_t *)&t->dots_packed_u1, &used);
143
+ nk_dispatch_u1_find_(caps, nk_kernel_dots_symmetric_k, (nk_kernel_punned_t *)&t->dots_symmetric_u1, &used);
144
+ nk_dispatch_u1_find_(caps, nk_kernel_hammings_packed_k, (nk_kernel_punned_t *)&t->hammings_packed_u1, &used);
145
+ nk_dispatch_u1_find_(caps, nk_kernel_hammings_symmetric_k, (nk_kernel_punned_t *)&t->hammings_symmetric_u1, &used);
146
+ nk_dispatch_u1_find_(caps, nk_kernel_jaccards_packed_k, (nk_kernel_punned_t *)&t->jaccards_packed_u1, &used);
147
+ nk_dispatch_u1_find_(caps, nk_kernel_jaccards_symmetric_k, (nk_kernel_punned_t *)&t->jaccards_symmetric_u1, &used);
148
+ }
@@ -0,0 +1,124 @@
1
+ /**
2
+ * @brief Dispatch Initialization for U16 Data Types.
3
+ * @file c/dispatch_u16.c
4
+ * @author Ash Vardanian
5
+ * @date February 3, 2026
6
+ */
7
+ #include "dispatch.h"
8
+
9
+ void nk_dispatch_u16_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punned_t *m, nk_capability_t *c) {
10
+ typedef nk_kernel_punned_t m_t;
11
+ #if NK_TARGET_V128RELAXED
12
+ if (v & nk_cap_v128relaxed_k) switch (k) {
13
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
14
+ case nk_kernel_reduce_moments_k:
15
+ *m = (m_t)&nk_reduce_moments_u16_v128relaxed, *c = nk_cap_v128relaxed_k;
16
+ return;
17
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_v128relaxed, *c = nk_cap_v128relaxed_k; return;
18
+ default: break;
19
+ }
20
+ #endif
21
+ #if NK_TARGET_SVE2
22
+ if (v & nk_cap_sve2_k) switch (k) {
23
+ case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_sve2, *c = nk_cap_sve2_k; return;
24
+ default: break;
25
+ }
26
+ #endif
27
+ #if NK_TARGET_SVE
28
+ if (v & nk_cap_sve_k) switch (k) {
29
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_sve, *c = nk_cap_sve_k; return;
30
+ default: break;
31
+ }
32
+ #endif
33
+ #if NK_TARGET_NEON
34
+ if (v & nk_cap_neon_k) switch (k) {
35
+ case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_neon, *c = nk_cap_neon_k; return;
36
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_neon, *c = nk_cap_neon_k; return;
37
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_neon, *c = nk_cap_neon_k; return;
38
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_neon, *c = nk_cap_neon_k; return;
39
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_neon, *c = nk_cap_neon_k; return;
40
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_neon, *c = nk_cap_neon_k; return;
41
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_neon, *c = nk_cap_neon_k; return;
42
+ default: break;
43
+ }
44
+ #endif
45
+ #if NK_TARGET_TURIN
46
+ if (v & nk_cap_turin_k) switch (k) {
47
+ case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_turin, *c = nk_cap_turin_k; return;
48
+ default: break;
49
+ }
50
+ #endif
51
+ #if NK_TARGET_ICELAKE
52
+ if (v & nk_cap_icelake_k) switch (k) {
53
+ case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_icelake, *c = nk_cap_skylake_k; return;
54
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_icelake, *c = nk_cap_icelake_k; return;
55
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_icelake, *c = nk_cap_icelake_k; return;
56
+ default: break;
57
+ }
58
+ #endif
59
+ #if NK_TARGET_SKYLAKE
60
+ if (v & nk_cap_skylake_k) switch (k) {
61
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_skylake, *c = nk_cap_skylake_k; return;
62
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_skylake, *c = nk_cap_skylake_k; return;
63
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_skylake, *c = nk_cap_skylake_k; return;
64
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_skylake, *c = nk_cap_skylake_k; return;
65
+ default: break;
66
+ }
67
+ #endif
68
+ #if NK_TARGET_ALDER
69
+ if (v & nk_cap_alder_k) switch (k) {
70
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_alder, *c = nk_cap_alder_k; return;
71
+ default: break;
72
+ }
73
+ #endif
74
+ #if NK_TARGET_HASWELL
75
+ if (v & nk_cap_haswell_k) switch (k) {
76
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_haswell, *c = nk_cap_haswell_k; return;
77
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_haswell, *c = nk_cap_haswell_k; return;
78
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_haswell, *c = nk_cap_haswell_k; return;
79
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_haswell, *c = nk_cap_haswell_k; return;
80
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_haswell, *c = nk_cap_haswell_k; return;
81
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_haswell, *c = nk_cap_haswell_k; return;
82
+ default: break;
83
+ }
84
+ #endif
85
+ #if NK_TARGET_RVV
86
+ if (v & nk_cap_rvv_k) switch (k) {
87
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_rvv, *c = nk_cap_rvv_k; return;
88
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_rvv, *c = nk_cap_rvv_k; return;
89
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_rvv, *c = nk_cap_rvv_k; return;
90
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_rvv, *c = nk_cap_rvv_k; return;
91
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_rvv, *c = nk_cap_rvv_k; return;
92
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_rvv, *c = nk_cap_rvv_k; return;
93
+ default: break;
94
+ }
95
+ #endif
96
+ if (v & nk_cap_serial_k) switch (k) {
97
+ case nk_kernel_sparse_intersect_k: *m = (m_t)&nk_sparse_intersect_u16_serial, *c = nk_cap_serial_k; return;
98
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u16_serial, *c = nk_cap_serial_k; return;
99
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u16_serial, *c = nk_cap_serial_k; return;
100
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u16_serial, *c = nk_cap_serial_k; return;
101
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u16_serial, *c = nk_cap_serial_k; return;
102
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_u16_serial, *c = nk_cap_serial_k; return;
103
+ case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u16_serial, *c = nk_cap_serial_k; return;
104
+ case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u16_serial, *c = nk_cap_serial_k; return;
105
+ default: break;
106
+ }
107
+
108
+ // Error fallback - zero capability signals lookup failure
109
+ *m = (m_t)nk_error_dense_, *c = 0;
110
+ }
111
+
112
+ void nk_dispatch_u16_init_(nk_capability_t caps) {
113
+ nk_implementations_t *t = &nk_dispatch_table;
114
+ nk_capability_t used;
115
+
116
+ nk_dispatch_u16_find_(caps, nk_kernel_jaccard_k, (nk_kernel_punned_t *)&t->jaccard_u16, &used);
117
+ nk_dispatch_u16_find_(caps, nk_kernel_sparse_intersect_k, (nk_kernel_punned_t *)&t->sparse_intersect_u16, &used);
118
+ nk_dispatch_u16_find_(caps, nk_kernel_each_scale_k, (nk_kernel_punned_t *)&t->each_scale_u16, &used);
119
+ nk_dispatch_u16_find_(caps, nk_kernel_each_sum_k, (nk_kernel_punned_t *)&t->each_sum_u16, &used);
120
+ nk_dispatch_u16_find_(caps, nk_kernel_each_blend_k, (nk_kernel_punned_t *)&t->each_blend_u16, &used);
121
+ nk_dispatch_u16_find_(caps, nk_kernel_each_fma_k, (nk_kernel_punned_t *)&t->each_fma_u16, &used);
122
+ nk_dispatch_u16_find_(caps, nk_kernel_reduce_moments_k, (nk_kernel_punned_t *)&t->reduce_moments_u16, &used);
123
+ nk_dispatch_u16_find_(caps, nk_kernel_reduce_minmax_k, (nk_kernel_punned_t *)&t->reduce_minmax_u16, &used);
124
+ }