numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
package/c/dispatch_i8.c CHANGED
@@ -34,6 +34,10 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
34
34
  case nk_kernel_euclideans_symmetric_k:
35
35
  *m = (m_t)&nk_euclideans_symmetric_i8_v128relaxed, *c = nk_cap_v128relaxed_k;
36
36
  return;
37
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_i8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
38
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_i8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
39
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_i8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
40
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_i8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
37
41
  default: break;
38
42
  }
39
43
  #endif
@@ -50,6 +54,15 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
50
54
  default: break;
51
55
  }
52
56
  #endif
57
+ #if NK_TARGET_SVESDOT
58
+ if (v & nk_cap_svesdot_k) switch (k) {
59
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_svesdot, *c = nk_cap_svesdot_k; return;
60
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_i8_svesdot, *c = nk_cap_svesdot_k; return;
61
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_i8_svesdot, *c = nk_cap_svesdot_k; return;
62
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_i8_svesdot, *c = nk_cap_svesdot_k; return;
63
+ default: break;
64
+ }
65
+ #endif
53
66
  #if NK_TARGET_NEONSDOT
54
67
  if (v & nk_cap_neonsdot_k) switch (k) {
55
68
  case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_neonsdot, *c = nk_cap_neonsdot_k; return;
@@ -74,15 +87,14 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
74
87
  #endif
75
88
  #if NK_TARGET_NEONHALF
76
89
  if (v & nk_cap_neonhalf_k) switch (k) {
77
- case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_i8_neonhalf, *c = nk_cap_neonhalf_k; return;
78
90
  case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_i8_neonhalf, *c = nk_cap_neonhalf_k; return;
79
91
  case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_i8_neonhalf, *c = nk_cap_neonhalf_k; return;
80
- case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_i8_neonhalf, *c = nk_cap_neonhalf_k; return;
81
92
  default: break;
82
93
  }
83
94
  #endif
84
95
  #if NK_TARGET_NEON
85
96
  if (v & nk_cap_neon_k) switch (k) {
97
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_i8_neon, *c = nk_cap_neon_k; return;
86
98
  case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_i8_neon, *c = nk_cap_neon_k; return;
87
99
  case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_i8_neon, *c = nk_cap_neon_k; return;
88
100
  default: break;
@@ -113,7 +125,6 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
113
125
  #endif
114
126
  #if NK_TARGET_SAPPHIRE
115
127
  if (v & nk_cap_sapphire_k) switch (k) {
116
- case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_i8_sapphire, *c = nk_cap_sapphire_k; return;
117
128
  case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_i8_sapphire, *c = nk_cap_sapphire_k; return;
118
129
  case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_i8_sapphire, *c = nk_cap_sapphire_k; return;
119
130
  default: break;
@@ -212,6 +223,54 @@ void nk_dispatch_i8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
212
223
  default: break;
213
224
  }
214
225
  #endif
226
+ #if NK_TARGET_POWERVSX
227
+ if (v & nk_cap_powervsx_k) switch (k) {
228
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_powervsx, *c = nk_cap_powervsx_k; return;
229
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_i8_powervsx, *c = nk_cap_powervsx_k; return;
230
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_i8_powervsx, *c = nk_cap_powervsx_k; return;
231
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_i8_powervsx, *c = nk_cap_powervsx_k; return;
232
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_i8_powervsx, *c = nk_cap_powervsx_k; return;
233
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_i8_powervsx, *c = nk_cap_powervsx_k; return;
234
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_i8_powervsx, *c = nk_cap_powervsx_k; return;
235
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_i8_powervsx, *c = nk_cap_powervsx_k; return;
236
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_i8_powervsx, *c = nk_cap_powervsx_k; return;
237
+ case nk_kernel_angulars_symmetric_k:
238
+ *m = (m_t)&nk_angulars_symmetric_i8_powervsx, *c = nk_cap_powervsx_k;
239
+ return;
240
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_i8_powervsx, *c = nk_cap_powervsx_k; return;
241
+ case nk_kernel_euclideans_symmetric_k:
242
+ *m = (m_t)&nk_euclideans_symmetric_i8_powervsx, *c = nk_cap_powervsx_k;
243
+ return;
244
+ default: break;
245
+ }
246
+ #endif
247
+ #if NK_TARGET_LOONGSONASX
248
+ if (v & nk_cap_loongsonasx_k) switch (k) {
249
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
250
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
251
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
252
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
253
+ case nk_kernel_dots_packed_size_k:
254
+ *m = (m_t)&nk_dots_packed_size_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
255
+ return;
256
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
257
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
258
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_i8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
259
+ case nk_kernel_angulars_packed_k:
260
+ *m = (m_t)&nk_angulars_packed_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
261
+ return;
262
+ case nk_kernel_angulars_symmetric_k:
263
+ *m = (m_t)&nk_angulars_symmetric_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
264
+ return;
265
+ case nk_kernel_euclideans_packed_k:
266
+ *m = (m_t)&nk_euclideans_packed_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
267
+ return;
268
+ case nk_kernel_euclideans_symmetric_k:
269
+ *m = (m_t)&nk_euclideans_symmetric_i8_loongsonasx, *c = nk_cap_loongsonasx_k;
270
+ return;
271
+ default: break;
272
+ }
273
+ #endif
215
274
  #if NK_TARGET_RVV
216
275
  if (v & nk_cap_rvv_k) switch (k) {
217
276
  case nk_kernel_dot_k: *m = (m_t)&nk_dot_i8_rvv, *c = nk_cap_rvv_k; return;
package/c/dispatch_other.c CHANGED
@@ -89,6 +89,13 @@ void nk_dispatch_cast_init_(nk_capability_t caps) {
89
89
  }
90
90
  #endif
91
91
 
92
+ #if NK_TARGET_POWERVSX
93
+ if (caps & nk_cap_powervsx_k) {
94
+ t->f16_to_f32 = &nk_f16_to_f32_powervsx;
95
+ t->f32_to_f16 = &nk_f32_to_f16_powervsx;
96
+ }
97
+ #endif
98
+
92
99
  // Scalar conversions: e5m2, e4m3, e3m2, e2m3 (serial only)
93
100
  t->e5m2_to_f32 = &nk_e5m2_to_f32_serial;
94
101
  t->f32_to_e5m2 = &nk_f32_to_e5m2_serial;
@@ -144,6 +151,17 @@ void nk_dispatch_math_init_(nk_capability_t caps) {
144
151
  }
145
152
  #endif
146
153
 
154
+ #if NK_TARGET_POWERVSX
155
+ if (caps & nk_cap_powervsx_k) {
156
+ t->f64_sqrt = &nk_f64_sqrt_powervsx;
157
+ t->f64_rsqrt = &nk_f64_rsqrt_powervsx;
158
+ t->f64_fma = &nk_f64_fma_powervsx;
159
+ t->f32_sqrt = &nk_f32_sqrt_powervsx;
160
+ t->f32_rsqrt = &nk_f32_rsqrt_powervsx;
161
+ t->f32_fma = &nk_f32_fma_powervsx;
162
+ }
163
+ #endif
164
+
147
165
  // Scalar math: f16
148
166
  t->f16_sqrt = &nk_f16_sqrt_serial;
149
167
  t->f16_rsqrt = &nk_f16_rsqrt_serial;
package/c/dispatch_u1.c CHANGED
@@ -17,15 +17,15 @@ void nk_dispatch_u1_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
17
17
  }
18
18
  #endif
19
19
  #if NK_TARGET_SMEBI32
20
- if (v & nk_cap_sme_k) switch (k) {
21
- case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_smebi32, *c = nk_cap_sme_k; return;
22
- case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_smebi32, *c = nk_cap_sme_k; return;
23
- case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_smebi32, *c = nk_cap_sme_k; return;
24
- case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_smebi32, *c = nk_cap_sme_k; return;
25
- case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_smebi32, *c = nk_cap_sme_k; return;
26
- case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_smebi32, *c = nk_cap_sme_k; return;
27
- case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_smebi32, *c = nk_cap_sme_k; return;
28
- case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_smebi32, *c = nk_cap_sme_k; return;
20
+ if (v & nk_cap_smebi32_k) switch (k) {
21
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_smebi32, *c = nk_cap_smebi32_k; return;
22
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_smebi32, *c = nk_cap_smebi32_k; return;
23
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_smebi32, *c = nk_cap_smebi32_k; return;
24
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_smebi32, *c = nk_cap_smebi32_k; return;
25
+ case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_smebi32, *c = nk_cap_smebi32_k; return;
26
+ case nk_kernel_hammings_symmetric_k: *m = (m_t)&nk_hammings_symmetric_u1_smebi32, *c = nk_cap_smebi32_k; return;
27
+ case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_smebi32, *c = nk_cap_smebi32_k; return;
28
+ case nk_kernel_jaccards_symmetric_k: *m = (m_t)&nk_jaccards_symmetric_u1_smebi32, *c = nk_cap_smebi32_k; return;
29
29
  default: break;
30
30
  }
31
31
  #endif
@@ -91,6 +91,51 @@ void nk_dispatch_u1_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
91
91
  default: break;
92
92
  }
93
93
  #endif
94
+ #if NK_TARGET_POWERVSX
95
+ if (v & nk_cap_powervsx_k) switch (k) {
96
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_powervsx, *c = nk_cap_powervsx_k; return;
97
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_powervsx, *c = nk_cap_powervsx_k; return;
98
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_powervsx, *c = nk_cap_powervsx_k; return;
99
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u1_powervsx, *c = nk_cap_powervsx_k; return;
100
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_powervsx, *c = nk_cap_powervsx_k; return;
101
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_powervsx, *c = nk_cap_powervsx_k; return;
102
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_powervsx, *c = nk_cap_powervsx_k; return;
103
+ case nk_kernel_hammings_packed_k: *m = (m_t)&nk_hammings_packed_u1_powervsx, *c = nk_cap_powervsx_k; return;
104
+ case nk_kernel_hammings_symmetric_k:
105
+ *m = (m_t)&nk_hammings_symmetric_u1_powervsx, *c = nk_cap_powervsx_k;
106
+ return;
107
+ case nk_kernel_jaccards_packed_k: *m = (m_t)&nk_jaccards_packed_u1_powervsx, *c = nk_cap_powervsx_k; return;
108
+ case nk_kernel_jaccards_symmetric_k:
109
+ *m = (m_t)&nk_jaccards_symmetric_u1_powervsx, *c = nk_cap_powervsx_k;
110
+ return;
111
+ default: break;
112
+ }
113
+ #endif
114
+ #if NK_TARGET_LOONGSONASX
115
+ if (v & nk_cap_loongsonasx_k) switch (k) {
116
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
117
+ case nk_kernel_jaccard_k: *m = (m_t)&nk_jaccard_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
118
+ case nk_kernel_dots_packed_size_k:
119
+ *m = (m_t)&nk_dots_packed_size_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
120
+ return;
121
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
122
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
123
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u1_loongsonasx, *c = nk_cap_loongsonasx_k; return;
124
+ case nk_kernel_hammings_packed_k:
125
+ *m = (m_t)&nk_hammings_packed_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
126
+ return;
127
+ case nk_kernel_hammings_symmetric_k:
128
+ *m = (m_t)&nk_hammings_symmetric_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
129
+ return;
130
+ case nk_kernel_jaccards_packed_k:
131
+ *m = (m_t)&nk_jaccards_packed_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
132
+ return;
133
+ case nk_kernel_jaccards_symmetric_k:
134
+ *m = (m_t)&nk_jaccards_symmetric_u1_loongsonasx, *c = nk_cap_loongsonasx_k;
135
+ return;
136
+ default: break;
137
+ }
138
+ #endif
94
139
  #if NK_TARGET_RVVBB
95
140
  if (v & nk_cap_rvvbb_k) switch (k) {
96
141
  case nk_kernel_dot_k: *m = (m_t)&nk_dot_u1_rvvbb, *c = nk_cap_rvvbb_k; return;
package/c/dispatch_u4.c CHANGED
@@ -52,6 +52,9 @@ void nk_dispatch_u4_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
52
52
  case nk_kernel_euclideans_symmetric_k:
53
53
  *m = (m_t)&nk_euclideans_symmetric_u4_neonsdot, *c = nk_cap_neonsdot_k;
54
54
  return;
55
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_u4_neonsdot, *c = nk_cap_neonsdot_k; return;
56
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_u4_neonsdot, *c = nk_cap_neonsdot_k; return;
57
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_u4_neonsdot, *c = nk_cap_neonsdot_k; return;
55
58
  default: break;
56
59
  }
57
60
  #endif
package/c/dispatch_u8.c CHANGED
@@ -35,6 +35,10 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
35
35
  case nk_kernel_euclideans_symmetric_k:
36
36
  *m = (m_t)&nk_euclideans_symmetric_u8_v128relaxed, *c = nk_cap_v128relaxed_k;
37
37
  return;
38
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
39
+ case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
40
+ case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_u8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
41
+ case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u8_v128relaxed, *c = nk_cap_v128relaxed_k; return;
38
42
  default: break;
39
43
  }
40
44
  #endif
@@ -57,6 +61,15 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
57
61
  default: break;
58
62
  }
59
63
  #endif
64
+ #if NK_TARGET_SVESDOT
65
+ if (v & nk_cap_svesdot_k) switch (k) {
66
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_svesdot, *c = nk_cap_svesdot_k; return;
67
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_u8_svesdot, *c = nk_cap_svesdot_k; return;
68
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_u8_svesdot, *c = nk_cap_svesdot_k; return;
69
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_u8_svesdot, *c = nk_cap_svesdot_k; return;
70
+ default: break;
71
+ }
72
+ #endif
60
73
  #if NK_TARGET_NEONSDOT
61
74
  if (v & nk_cap_neonsdot_k) switch (k) {
62
75
  case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_neonsdot, *c = nk_cap_neonsdot_k; return;
@@ -81,16 +94,15 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
81
94
  #endif
82
95
  #if NK_TARGET_NEONHALF
83
96
  if (v & nk_cap_neonhalf_k) switch (k) {
84
- case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u8_neonhalf, *c = nk_cap_neonhalf_k; return;
85
97
  case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_u8_neonhalf, *c = nk_cap_neonhalf_k; return;
86
98
  case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u8_neonhalf, *c = nk_cap_neonhalf_k; return;
87
- case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u8_neonhalf, *c = nk_cap_neonhalf_k; return;
88
99
  default: break;
89
100
  }
90
101
  #endif
91
102
  #if NK_TARGET_NEON
92
103
  if (v & nk_cap_neon_k) switch (k) {
93
104
  case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u8_neon, *c = nk_cap_neon_k; return;
105
+ case nk_kernel_each_sum_k: *m = (m_t)&nk_each_sum_u8_neon, *c = nk_cap_neon_k; return;
94
106
  case nk_kernel_reduce_moments_k: *m = (m_t)&nk_reduce_moments_u8_neon, *c = nk_cap_neon_k; return;
95
107
  case nk_kernel_reduce_minmax_k: *m = (m_t)&nk_reduce_minmax_u8_neon, *c = nk_cap_neon_k; return;
96
108
  default: break;
@@ -121,7 +133,6 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
121
133
  #endif
122
134
  #if NK_TARGET_SAPPHIRE
123
135
  if (v & nk_cap_sapphire_k) switch (k) {
124
- case nk_kernel_each_fma_k: *m = (m_t)&nk_each_fma_u8_sapphire, *c = nk_cap_sapphire_k; return;
125
136
  case nk_kernel_each_blend_k: *m = (m_t)&nk_each_blend_u8_sapphire, *c = nk_cap_sapphire_k; return;
126
137
  case nk_kernel_each_scale_k: *m = (m_t)&nk_each_scale_u8_sapphire, *c = nk_cap_sapphire_k; return;
127
138
  default: break;
@@ -223,6 +234,56 @@ void nk_dispatch_u8_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_punne
223
234
  default: break;
224
235
  }
225
236
  #endif
237
+ #if NK_TARGET_POWERVSX
238
+ if (v & nk_cap_powervsx_k) switch (k) {
239
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_powervsx, *c = nk_cap_powervsx_k; return;
240
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u8_powervsx, *c = nk_cap_powervsx_k; return;
241
+ case nk_kernel_dots_packed_size_k: *m = (m_t)&nk_dots_packed_size_u8_powervsx, *c = nk_cap_powervsx_k; return;
242
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u8_powervsx, *c = nk_cap_powervsx_k; return;
243
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u8_powervsx, *c = nk_cap_powervsx_k; return;
244
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u8_powervsx, *c = nk_cap_powervsx_k; return;
245
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_u8_powervsx, *c = nk_cap_powervsx_k; return;
246
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_u8_powervsx, *c = nk_cap_powervsx_k; return;
247
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_u8_powervsx, *c = nk_cap_powervsx_k; return;
248
+ case nk_kernel_angulars_packed_k: *m = (m_t)&nk_angulars_packed_u8_powervsx, *c = nk_cap_powervsx_k; return;
249
+ case nk_kernel_angulars_symmetric_k:
250
+ *m = (m_t)&nk_angulars_symmetric_u8_powervsx, *c = nk_cap_powervsx_k;
251
+ return;
252
+ case nk_kernel_euclideans_packed_k: *m = (m_t)&nk_euclideans_packed_u8_powervsx, *c = nk_cap_powervsx_k; return;
253
+ case nk_kernel_euclideans_symmetric_k:
254
+ *m = (m_t)&nk_euclideans_symmetric_u8_powervsx, *c = nk_cap_powervsx_k;
255
+ return;
256
+ default: break;
257
+ }
258
+ #endif
259
+ #if NK_TARGET_LOONGSONASX
260
+ if (v & nk_cap_loongsonasx_k) switch (k) {
261
+ case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
262
+ case nk_kernel_angular_k: *m = (m_t)&nk_angular_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
263
+ case nk_kernel_sqeuclidean_k: *m = (m_t)&nk_sqeuclidean_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
264
+ case nk_kernel_euclidean_k: *m = (m_t)&nk_euclidean_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
265
+ case nk_kernel_hamming_k: *m = (m_t)&nk_hamming_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
266
+ case nk_kernel_dots_packed_size_k:
267
+ *m = (m_t)&nk_dots_packed_size_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
268
+ return;
269
+ case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
270
+ case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
271
+ case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_u8_loongsonasx, *c = nk_cap_loongsonasx_k; return;
272
+ case nk_kernel_angulars_packed_k:
273
+ *m = (m_t)&nk_angulars_packed_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
274
+ return;
275
+ case nk_kernel_angulars_symmetric_k:
276
+ *m = (m_t)&nk_angulars_symmetric_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
277
+ return;
278
+ case nk_kernel_euclideans_packed_k:
279
+ *m = (m_t)&nk_euclideans_packed_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
280
+ return;
281
+ case nk_kernel_euclideans_symmetric_k:
282
+ *m = (m_t)&nk_euclideans_symmetric_u8_loongsonasx, *c = nk_cap_loongsonasx_k;
283
+ return;
284
+ default: break;
285
+ }
286
+ #endif
226
287
  #if NK_TARGET_RVV
227
288
  if (v & nk_cap_rvv_k) switch (k) {
228
289
  case nk_kernel_dot_k: *m = (m_t)&nk_dot_u8_rvv, *c = nk_cap_rvv_k; return;
package/c/numkong.c CHANGED
@@ -842,6 +842,9 @@ NK_DYNAMIC nk_capability_t nk_capabilities(void) {
842
842
  return static_capabilities;
843
843
  }
844
844
 
845
+ NK_DYNAMIC nk_capability_t nk_capabilities_available(void) { return nk_capabilities() & nk_capabilities_compiled_(); }
846
+ NK_DYNAMIC nk_capability_t nk_capabilities_compiled(void) { return nk_capabilities_compiled_(); }
847
+
845
848
  NK_DYNAMIC void nk_find_kernel_punned( //
846
849
  nk_kernel_kind_t kind, //
847
850
  nk_dtype_t dtype, //
package/include/README.md CHANGED
@@ -25,7 +25,7 @@ int main(void) {
25
25
 
26
26
  ## Highlights
27
27
 
28
- This is the most complete SDK in the project.
28
+ This is the primary SDK in the project.
29
29
  It is the right layer if you want exact control over dtypes, allocators, packed buffers, dispatch, and host-side partitioning.
30
30
 
31
31
  __Full kernel surface.__
@@ -167,6 +167,20 @@ For example, `f32_t::dot_result_t` is wider than `f32_t`.
167
167
  The higher-level templates use `result_type_ = typename in_type_::dot_result_t` and similar defaults.
168
168
  The fast typed overloads are constrained so that overriding the result type away from the native policy can disable the specialized path and fall back to the more generic one.
169
169
 
170
+ When the standard library provides C++23-level `<format>` support (`__cpp_lib_format >= 202110L`), all NumKong scalar types provide `std::formatter` specializations whose format specs mirror those of the built-in `float`.
171
+ For the half-precision `nk::f16_t` type, the output for `nk::f16_t::from_f32(3.14f)` will look like:
172
+
173
+ | Format spec | Output example | Description |
174
+ | ----------- | -------------------- | -------------------------------------- |
175
+ | `{}` | `3.140625` | Clean float value |
176
+ | `{:#}` | `3.140625 [0x4248]` | Annotated with hex bits |
177
+ | `{:.2f}` | `3.14` | Precision forwarded to float formatter |
178
+ | `{:x}` | `4248` | Raw hex bits |
179
+ | `{:#x}` | `0x4248` | Hex with prefix |
180
+ | `{:X}` | `4248` | Uppercase hex |
181
+ | `{:b}` | `0100001001001000` | Binary bits |
182
+ | `{:#b}` | `0b0100001001001000` | Binary with prefix |
183
+
170
184
  ## Dot Products
171
185
 
172
186
  Dot products are one of the broadest parts of the native SDK.
@@ -243,7 +257,7 @@ nk_jsd_f32(q, p, 3, &js_reverse);
243
257
  assert(js_forward == js_reverse && "JSD is symmetric");
244
258
  ```
245
259
 
246
- These paths are especially valuable once you move below `f64`.
260
+ These paths are useful once you move below `f64`.
247
261
  Naive implementations are usually dominated by repeated scalar transcendental calls and weak accumulation policy.
248
262
 
249
263
  ## Geospatial Metrics
@@ -365,9 +379,44 @@ nk::f64_t dot {};
365
379
  nk::dot(view.row(0), view.row(1), md.extent(1), &dot);
366
380
  ```
367
381
 
382
+ ## Iterators and Enumeration
383
+
384
+ NumKong containers expose random-access iterators for element and row traversal.
385
+
386
+ - __`dim_iterator`__ — random-access iterator over element values, used by `vector`, `vector_view`, and `vector_span`.
387
+ Supports all standard iterator operations plus `index()` to retrieve the current position.
388
+ - __`axis_iterator`__ — random-access iterator over sub-views (rows), used by `tensor_view` and `tensor_span`.
389
+ Also exposes `index()`.
390
+ - __`enumerate()`__ — free function returning a lightweight view that yields `{index, value}` pairs from any container with `begin()`/`end()`/`size()`.
391
+
392
+ ```cpp
393
+ #include <numkong/numkong.hpp>
394
+
395
+ namespace nk = ashvardanian::numkong;
396
+
397
+ nk::vector<nk::f16_t> v(128);
398
+ for (auto [i, val] : nk::enumerate(v))
399
+ std::printf("[%zu] = %f\n", i, val.to_f32());
400
+
401
+ // index() on raw iterators
402
+ for (auto it = v.begin(); it != v.end(); ++it)
403
+ std::printf("[%zu] = %f\n", it.index(), (*it).to_f32());
404
+ ```
405
+
406
+ Since `tensor.hpp` includes `vector.hpp`, `enumerate()` works on tensor row views too.
407
+
408
+ Tensors also support range-for over all logical scalar elements, yielding `(position, value)` pairs.
409
+ For sub-byte types each dimension is a logical scalar. Use `.dims()` to iterate values without positions.
410
+
411
+ ```cpp
412
+ for (auto [pos, val] : matrix) { /* pos is std::array<size_t, R> */ }
413
+ for (auto [pos, ref] : matrix.span()) { ref = nk::f32_t{1}; }
414
+ for (auto val : matrix.dims()) { /* scalar only, no position */ }
415
+ ```
416
+
368
417
  ## Packed Matrix Kernels for GEMM-Like Workloads
369
418
 
370
- This is the most distinctive native subsystem outside the raw vector kernels.
419
+ This is a separate native subsystem from the raw vector kernels.
371
420
  It is the right tool when the right-hand side is reused many times.
372
421
 
373
422
  ```cpp
@@ -414,7 +463,7 @@ This is SYRK-like in the sense that the output is square and symmetric.
414
463
  The important difference from packed GEMM-style work is the partitioning model.
415
464
  You typically split by output row windows, not by distinct left batches against a shared packed right-hand side.
416
465
 
417
- The arithmetic advantage is direct and honest.
466
+ The arithmetic advantage is straightforward.
418
467
  The symmetric kernels avoid recomputing both `(i, j)` and `(j, i)` pairs.
419
468
  That cuts the pair count almost in half before any micro-kernel details matter.
420
469
 
@@ -479,8 +528,8 @@ Its footprint is exposed through `size_bytes()`.
479
528
  ## Runtime Dispatch and Capabilities
480
529
 
481
530
  Dynamic dispatch is the default recommendation for shipping one binary across many CPU generations.
482
- `nk_configure_thread` configures rounding behavior and enables CPU-specific acceleration features such as Intel AMX.
483
- It must be called once per thread before any kernel invocation and returns 1 on success, 0 on failure.
531
+ `nk_configure_thread` enables CPU-specific acceleration features such as Intel AMX.
532
+ It must be called once per thread before using AMX operations and returns 1 on success, 0 on failure.
484
533
 
485
534
  ```c
486
535
  nk_capability_t caps = nk_capabilities();
@@ -491,7 +540,7 @@ if (caps & nk_cap_sapphireamx_k) { /* AMX available */ }
491
540
  For exact register-level details, see `capabilities.h`.
492
541
  The C++ wrappers can also call directly into named backends if you want to pin a path for testing or benchmarking.
493
542
 
494
- ## Parallelism and Fork Union
543
+ ## Parallelism and ForkUnion
495
544
 
496
545
  NumKong does not manage its own threads.
497
546
  That is deliberate.
@@ -521,7 +570,7 @@ fork_union.parallel_for(0, worker_count, [&](std::size_t t) {
521
570
  });
522
571
  ```
523
572
 
524
- We recommend [Fork Union](https://github.com/ashvardanian/ForkUnion) for that host-side orchestration.
573
+ We recommend [ForkUnion](https://github.com/ashvardanian/ForkUnion) for that host-side orchestration.
525
574
  OpenMP is still a reasonable fit if the rest of your application already uses it.
526
575
  Manual thread pools and task systems also work well because the kernels have explicit row-range interfaces.
527
576
 
@@ -570,4 +619,25 @@ cmake -B build -D CMAKE_TOOLCHAIN_FILE=cmake/toolchain-aarch64-gnu.cmake
570
619
 
571
620
  NumKong does not use OpenMP and does not create a hidden thread pool.
572
621
  Standard pthreads are linked via CMake's `Threads` package.
573
- Parallelism is host-controlled: partition work across row ranges and dispatch through Fork Union, `std::thread`, or any external scheduler.
622
+ Parallelism is host-controlled: partition work across row ranges and dispatch through ForkUnion, `std::thread`, or any external scheduler.
623
+
624
+ ## Addressing External Memory
625
+
626
+ Every kernel takes plain pointers, so any CPU-accessible memory works: mmap, pinned buffers, CUDA unified memory, custom arenas.
627
+ C++ views wrap any pointer without ownership.
628
+ Owning containers accept any C++ Allocator.
629
+
630
+ ```cpp
631
+ template <typename T>
632
+ struct cuda_allocator {
633
+ using value_type = T;
634
+ T *allocate(std::size_t n) { T *p;
635
+ cudaMallocManaged(&p, n * sizeof(T), cudaMemAttachGlobal);
636
+ return p; }
637
+ void deallocate(T *p, std::size_t) noexcept { cudaFree(p); }
638
+ };
639
+
640
+ nk_dot_f32(cuda_managed_ptr, cuda_managed_ptr, 1024, &dot); // C ABI, any pointer
641
+ auto view = nk::tensor_view<nk::f32_t>(mmap_ptr, rows, cols); // non-owning view
642
+ auto v = nk::vector<float, cuda_allocator<float>>::try_zeros(1024); // allocator-aware owning
643
+ ```