numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -140,18 +140,18 @@
140
140
  * Low-precision matmul relies on VPMADD* (AVX2), VNNI dot-products, and BF16 dot-products
141
141
  * on AVX-512. Zen4 improves throughput by dual-issuing many integer ops on FP ports.
142
142
  *
143
- * Intrinsic Instruction Haswell Genoa
144
- * _mm256_maddubs_epi16 VPMADDUBSW (YMM, YMM, YMM) 5c @ p0 3c @ p01
145
- * _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5c @ p0 3c @ p01
146
- * _mm256_dpbusd_epi32 VPDPBUSD (YMM, K, YMM, YMM) n/a 4c @ p01
147
- * _mm256_dpwssds_epi32 VPDPWSSDS (YMM, K, YMM, YMM) n/a 4c @ p01
148
- * _mm256_dpbf16_ps VDPBF16PS (YMM, YMM, YMM) n/a 6c @ p01
143
+ * Intrinsic Instruction Haswell Genoa
144
+ * _mm256_maddubs_epi16 VPMADDUBSW (YMM, YMM, YMM) 5cy @ p0 3cy @ p01
145
+ * _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5cy @ p0 3cy @ p01
146
+ * _mm256_dpbusd_epi32 VPDPBUSD (YMM, K, YMM, YMM) n/a 4cy @ p01
147
+ * _mm256_dpwssds_epi32 VPDPWSSDS (YMM, K, YMM, YMM) n/a 4cy @ p01
148
+ * _mm256_dpbf16_ps VDPBF16PS (YMM, YMM, YMM) n/a 6cy @ p01
149
149
  *
150
150
  * AMX tile ops (TDPBF16PS/TDPBUSD/TDPBSSD) are not covered by the uops.info 2022 dataset.
151
151
  *
152
152
  * @section references References
153
153
  *
154
- * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
154
+ * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
155
155
  * - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
156
156
  * - uops.info: https://uops.info/
157
157
  * - Matrix Multiplication in 40 lines: https://en.algorithmica.org/hpc/algorithms/matmul/
@@ -298,64 +298,64 @@ NK_DYNAMIC void nk_dots_packed_u1(nk_u1x8_t const *a, void const *b_packed, nk_u
298
298
  /**
299
299
  * @brief Computes C = A × Aᵀ symmetric Gram matrix.
300
300
  * @param[in] vectors Input matrix of row vectors in row-major order.
301
- * @param[in] n_vectors Number of vectors (rows) in the input matrix.
301
+ * @param[in] vectors_count Number of vectors (rows) in the input matrix.
302
302
  * @param[in] depth Dimension of each vector (columns).
303
303
  * @param[in] stride Row stride in bytes for the input matrix.
304
- * @param[out] result Output symmetric matrix (n_vectors × n_vectors).
304
+ * @param[out] result Output symmetric matrix (vectors_count × vectors_count).
305
305
  * @param[in] result_stride Row stride in bytes for the result matrix.
306
306
  * @param[in] row_start Starting row offset of results to compute (needed for parallelism).
307
307
  * @param[in] row_count Number of rows of results to compute (needed for parallelism).
308
308
  */
309
- NK_DYNAMIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
310
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
309
+ NK_DYNAMIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
310
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
311
311
  nk_size_t row_count);
312
312
  /** @copydoc nk_dots_symmetric_bf16 */
313
- NK_DYNAMIC void nk_dots_symmetric_f16(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
314
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
313
+ NK_DYNAMIC void nk_dots_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
314
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
315
315
  nk_size_t row_count);
316
316
  /** @copydoc nk_dots_symmetric_bf16 */
317
- NK_DYNAMIC void nk_dots_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
318
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
317
+ NK_DYNAMIC void nk_dots_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
318
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
319
319
  nk_size_t row_count);
320
320
  /** @copydoc nk_dots_symmetric_bf16 */
321
- NK_DYNAMIC void nk_dots_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
322
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
321
+ NK_DYNAMIC void nk_dots_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
322
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
323
323
  nk_size_t row_count);
324
324
  /** @copydoc nk_dots_symmetric_bf16 */
325
- NK_DYNAMIC void nk_dots_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
326
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
325
+ NK_DYNAMIC void nk_dots_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
326
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
327
327
  nk_size_t row_count);
328
328
  /** @copydoc nk_dots_symmetric_bf16 */
329
- NK_DYNAMIC void nk_dots_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
330
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
329
+ NK_DYNAMIC void nk_dots_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
330
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
331
331
  nk_size_t row_count);
332
332
  /** @copydoc nk_dots_symmetric_bf16 */
333
- NK_DYNAMIC void nk_dots_symmetric_f32(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
334
- nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
333
+ NK_DYNAMIC void nk_dots_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
334
+ nk_size_t stride, nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
335
335
  nk_size_t row_count);
336
336
  /** @copydoc nk_dots_symmetric_bf16 */
337
- NK_DYNAMIC void nk_dots_symmetric_f64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
338
- nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
337
+ NK_DYNAMIC void nk_dots_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
338
+ nk_size_t stride, nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
339
339
  nk_size_t row_count);
340
340
  /** @copydoc nk_dots_symmetric_bf16 */
341
- NK_DYNAMIC void nk_dots_symmetric_i8(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
341
+ NK_DYNAMIC void nk_dots_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride,
342
342
  nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
343
343
  nk_size_t row_count);
344
344
  /** @copydoc nk_dots_symmetric_bf16 */
345
- NK_DYNAMIC void nk_dots_symmetric_u8(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
345
+ NK_DYNAMIC void nk_dots_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride,
346
346
  nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
347
347
  nk_size_t row_count);
348
348
  /** @copydoc nk_dots_symmetric_bf16 */
349
- NK_DYNAMIC void nk_dots_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
350
- nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
349
+ NK_DYNAMIC void nk_dots_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
350
+ nk_size_t stride, nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
351
351
  nk_size_t row_count);
352
352
  /** @copydoc nk_dots_symmetric_bf16 */
353
- NK_DYNAMIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
354
- nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
353
+ NK_DYNAMIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
354
+ nk_size_t stride, nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
355
355
  nk_size_t row_count);
356
356
  /** @copydoc nk_dots_symmetric_bf16 */
357
- NK_DYNAMIC void nk_dots_symmetric_u1(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
358
- nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
357
+ NK_DYNAMIC void nk_dots_symmetric_u1(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
358
+ nk_size_t stride, nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
359
359
  nk_size_t row_count);
360
360
 
361
361
  /** @copydoc nk_dots_packed_size_f32 */
@@ -367,7 +367,7 @@ NK_PUBLIC void nk_dots_pack_f32_serial(nk_f32_t const *b, nk_size_t width, nk_si
367
367
  NK_PUBLIC void nk_dots_packed_f32_serial(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
368
368
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
369
369
  /** @copydoc nk_dots_symmetric_f32 */
370
- NK_PUBLIC void nk_dots_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
370
+ NK_PUBLIC void nk_dots_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
371
371
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
372
372
  nk_size_t row_start, nk_size_t row_count);
373
373
 
@@ -380,7 +380,7 @@ NK_PUBLIC void nk_dots_pack_f64_serial(nk_f64_t const *b, nk_size_t width, nk_si
380
380
  NK_PUBLIC void nk_dots_packed_f64_serial(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
381
381
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
382
382
  /** @copydoc nk_dots_symmetric_f64 */
383
- NK_PUBLIC void nk_dots_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
383
+ NK_PUBLIC void nk_dots_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
384
384
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
385
385
  nk_size_t row_start, nk_size_t row_count);
386
386
 
@@ -393,7 +393,7 @@ NK_PUBLIC void nk_dots_pack_f16_serial(nk_f16_t const *b, nk_size_t width, nk_si
393
393
  NK_PUBLIC void nk_dots_packed_f16_serial(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
394
394
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
395
395
  /** @copydoc nk_dots_symmetric_f16 */
396
- NK_PUBLIC void nk_dots_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
396
+ NK_PUBLIC void nk_dots_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
397
397
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
398
398
  nk_size_t row_start, nk_size_t row_count);
399
399
 
@@ -406,7 +406,7 @@ NK_PUBLIC void nk_dots_pack_bf16_serial(nk_bf16_t const *b, nk_size_t width, nk_
406
406
  NK_PUBLIC void nk_dots_packed_bf16_serial(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
407
407
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
408
408
  /** @copydoc nk_dots_symmetric_bf16 */
409
- NK_PUBLIC void nk_dots_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
409
+ NK_PUBLIC void nk_dots_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
410
410
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
411
411
  nk_size_t row_start, nk_size_t row_count);
412
412
 
@@ -419,7 +419,7 @@ NK_PUBLIC void nk_dots_pack_i8_serial(nk_i8_t const *b, nk_size_t width, nk_size
419
419
  NK_PUBLIC void nk_dots_packed_i8_serial(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
420
420
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
421
421
  /** @copydoc nk_dots_symmetric_i8 */
422
- NK_PUBLIC void nk_dots_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
422
+ NK_PUBLIC void nk_dots_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
423
423
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
424
424
  nk_size_t row_start, nk_size_t row_count);
425
425
 
@@ -432,7 +432,7 @@ NK_PUBLIC void nk_dots_pack_u8_serial(nk_u8_t const *b, nk_size_t width, nk_size
432
432
  NK_PUBLIC void nk_dots_packed_u8_serial(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
433
433
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
434
434
  /** @copydoc nk_dots_symmetric_u8 */
435
- NK_PUBLIC void nk_dots_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
435
+ NK_PUBLIC void nk_dots_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
436
436
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
437
437
  nk_size_t row_start, nk_size_t row_count);
438
438
 
@@ -445,7 +445,7 @@ NK_PUBLIC void nk_dots_pack_u4_serial(nk_u4x2_t const *b, nk_size_t width, nk_si
445
445
  NK_PUBLIC void nk_dots_packed_u4_serial(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
446
446
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
447
447
  /** @copydoc nk_dots_symmetric_u4 */
448
- NK_PUBLIC void nk_dots_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
448
+ NK_PUBLIC void nk_dots_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
449
449
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
450
450
  nk_size_t row_start, nk_size_t row_count);
451
451
 
@@ -458,7 +458,7 @@ NK_PUBLIC void nk_dots_pack_u1_serial(nk_u1x8_t const *b, nk_size_t width, nk_si
458
458
  NK_PUBLIC void nk_dots_packed_u1_serial(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
459
459
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
460
460
  /** @copydoc nk_dots_symmetric_u1 */
461
- NK_PUBLIC void nk_dots_symmetric_u1_serial(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
461
+ NK_PUBLIC void nk_dots_symmetric_u1_serial(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
462
462
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
463
463
  nk_size_t row_start, nk_size_t row_count);
464
464
 
@@ -471,23 +471,23 @@ NK_PUBLIC void nk_dots_pack_i4_serial(nk_i4x2_t const *b, nk_size_t width, nk_si
471
471
  NK_PUBLIC void nk_dots_packed_i4_serial(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
472
472
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
473
473
  /** @copydoc nk_dots_symmetric_i4 */
474
- NK_PUBLIC void nk_dots_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
474
+ NK_PUBLIC void nk_dots_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
475
475
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
476
476
  nk_size_t row_start, nk_size_t row_count);
477
477
  /** @copydoc nk_dots_symmetric_e4m3 */
478
- NK_PUBLIC void nk_dots_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
478
+ NK_PUBLIC void nk_dots_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
479
479
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
480
480
  nk_size_t row_start, nk_size_t row_count);
481
481
  /** @copydoc nk_dots_symmetric_e5m2 */
482
- NK_PUBLIC void nk_dots_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
482
+ NK_PUBLIC void nk_dots_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
483
483
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
484
484
  nk_size_t row_start, nk_size_t row_count);
485
485
  /** @copydoc nk_dots_symmetric_e2m3 */
486
- NK_PUBLIC void nk_dots_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
486
+ NK_PUBLIC void nk_dots_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
487
487
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
488
488
  nk_size_t row_start, nk_size_t row_count);
489
489
  /** @copydoc nk_dots_symmetric_e3m2 */
490
- NK_PUBLIC void nk_dots_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
490
+ NK_PUBLIC void nk_dots_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
491
491
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
492
492
  nk_size_t row_start, nk_size_t row_count);
493
493
  /** @copydoc nk_dots_packed_size_e2m3 */
@@ -521,7 +521,7 @@ NK_PUBLIC void nk_dots_pack_bf16_genoa(nk_bf16_t const *b, nk_size_t width, nk_s
521
521
  NK_PUBLIC void nk_dots_packed_bf16_genoa(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
522
522
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
523
523
  /** @copydoc nk_dots_symmetric_bf16 */
524
- NK_PUBLIC void nk_dots_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
524
+ NK_PUBLIC void nk_dots_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
525
525
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
526
526
  nk_size_t row_start, nk_size_t row_count);
527
527
 
@@ -542,15 +542,42 @@ NK_PUBLIC void nk_dots_pack_e5m2_genoa(nk_e5m2_t const *b, nk_size_t width, nk_s
542
542
  NK_PUBLIC void nk_dots_packed_e5m2_genoa(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
543
543
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
544
544
  /** @copydoc nk_dots_symmetric_e4m3 */
545
- NK_PUBLIC void nk_dots_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
545
+ NK_PUBLIC void nk_dots_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
546
546
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
547
547
  nk_size_t row_start, nk_size_t row_count);
548
548
  /** @copydoc nk_dots_symmetric_e5m2 */
549
- NK_PUBLIC void nk_dots_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
549
+ NK_PUBLIC void nk_dots_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
550
550
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
551
551
  nk_size_t row_start, nk_size_t row_count);
552
552
  #endif // NK_TARGET_GENOA
553
553
 
554
+ #if NK_TARGET_DIAMOND
555
+ /** @copydoc nk_dots_packed_size_e4m3 */
556
+ NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3_diamond(nk_size_t width, nk_size_t depth);
557
+ /** @copydoc nk_dots_pack_e4m3 */
558
+ NK_PUBLIC void nk_dots_pack_e4m3_diamond(nk_e4m3_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
559
+ void *b_packed);
560
+ /** @copydoc nk_dots_packed_e4m3 */
561
+ NK_PUBLIC void nk_dots_packed_e4m3_diamond(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
562
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
563
+ /** @copydoc nk_dots_packed_size_e5m2 */
564
+ NK_PUBLIC nk_size_t nk_dots_packed_size_e5m2_diamond(nk_size_t width, nk_size_t depth);
565
+ /** @copydoc nk_dots_pack_e5m2 */
566
+ NK_PUBLIC void nk_dots_pack_e5m2_diamond(nk_e5m2_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
567
+ void *b_packed);
568
+ /** @copydoc nk_dots_packed_e5m2 */
569
+ NK_PUBLIC void nk_dots_packed_e5m2_diamond(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
570
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
571
+ /** @copydoc nk_dots_symmetric_e4m3 */
572
+ NK_PUBLIC void nk_dots_symmetric_e4m3_diamond(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
573
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
574
+ nk_size_t row_start, nk_size_t row_count);
575
+ /** @copydoc nk_dots_symmetric_e5m2 */
576
+ NK_PUBLIC void nk_dots_symmetric_e5m2_diamond(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
577
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
578
+ nk_size_t row_start, nk_size_t row_count);
579
+ #endif // NK_TARGET_DIAMOND
580
+
554
581
  /* Sapphire Rapids backends using Intel AMX (Advanced Matrix Extensions).
555
582
  * AMX provides 8 tile registers (TMM0-TMM7), each holding up to 1KB of data.
556
583
  * Tiles are configured as 16 rows × 64 bytes, enabling (16 × 32) BF16 or (16 × 64) INT8 tiles.
@@ -567,7 +594,7 @@ NK_PUBLIC void nk_dots_packed_bf16_sapphireamx(nk_bf16_t const *a, void const *b
567
594
  nk_size_t width, nk_size_t depth, nk_size_t a_stride,
568
595
  nk_size_t c_stride);
569
596
  /** @copydoc nk_dots_symmetric_bf16 */
570
- NK_PUBLIC void nk_dots_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
597
+ NK_PUBLIC void nk_dots_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
571
598
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
572
599
  nk_size_t row_start, nk_size_t row_count);
573
600
 
@@ -580,7 +607,7 @@ NK_PUBLIC void nk_dots_pack_i8_sapphireamx(nk_i8_t const *b, nk_size_t width, nk
580
607
  NK_PUBLIC void nk_dots_packed_i8_sapphireamx(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
581
608
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
582
609
  /** @copydoc nk_dots_symmetric_i8 */
583
- NK_PUBLIC void nk_dots_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
610
+ NK_PUBLIC void nk_dots_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
584
611
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
585
612
  nk_size_t row_start, nk_size_t row_count);
586
613
 
@@ -595,7 +622,7 @@ NK_PUBLIC void nk_dots_packed_e4m3_sapphireamx(nk_e4m3_t const *a, void const *b
595
622
  nk_size_t c_stride);
596
623
 
597
624
  /** @copydoc nk_dots_symmetric_e4m3 */
598
- NK_PUBLIC void nk_dots_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
625
+ NK_PUBLIC void nk_dots_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
599
626
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
600
627
  nk_size_t row_start, nk_size_t row_count);
601
628
 
@@ -609,7 +636,7 @@ NK_PUBLIC void nk_dots_packed_e5m2_sapphireamx(nk_e5m2_t const *a, void const *b
609
636
  nk_size_t width, nk_size_t depth, nk_size_t a_stride,
610
637
  nk_size_t c_stride);
611
638
  /** @copydoc nk_dots_symmetric_e5m2 */
612
- NK_PUBLIC void nk_dots_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
639
+ NK_PUBLIC void nk_dots_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
613
640
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
614
641
  nk_size_t row_start, nk_size_t row_count);
615
642
  /** @copydoc nk_dots_packed_size_e2m3 */
@@ -622,7 +649,7 @@ NK_PUBLIC void nk_dots_packed_e2m3_sapphireamx(nk_e2m3_t const *a, void const *b
622
649
  nk_size_t width, nk_size_t depth, nk_size_t a_stride,
623
650
  nk_size_t c_stride);
624
651
  /** @copydoc nk_dots_symmetric_e2m3 */
625
- NK_PUBLIC void nk_dots_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
652
+ NK_PUBLIC void nk_dots_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
626
653
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
627
654
  nk_size_t row_start, nk_size_t row_count);
628
655
 
@@ -636,7 +663,7 @@ NK_PUBLIC void nk_dots_packed_e3m2_sapphireamx(nk_e3m2_t const *a, void const *b
636
663
  nk_size_t width, nk_size_t depth, nk_size_t a_stride,
637
664
  nk_size_t c_stride);
638
665
  /** @copydoc nk_dots_symmetric_e3m2 */
639
- NK_PUBLIC void nk_dots_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
666
+ NK_PUBLIC void nk_dots_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
640
667
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
641
668
  nk_size_t row_start, nk_size_t row_count);
642
669
 
@@ -649,7 +676,7 @@ NK_PUBLIC void nk_dots_pack_u8_sapphireamx(nk_u8_t const *b, nk_size_t width, nk
649
676
  NK_PUBLIC void nk_dots_packed_u8_sapphireamx(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
650
677
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
651
678
  /** @copydoc nk_dots_symmetric_u8 */
652
- NK_PUBLIC void nk_dots_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
679
+ NK_PUBLIC void nk_dots_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
653
680
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
654
681
  nk_size_t row_start, nk_size_t row_count);
655
682
  #endif // NK_TARGET_SAPPHIREAMX
@@ -668,7 +695,7 @@ NK_PUBLIC void nk_dots_pack_f16_sme(nk_f16_t const *b, nk_size_t width, nk_size_
668
695
  NK_PUBLIC void nk_dots_packed_f16_sme(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
669
696
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
670
697
  /** @copydoc nk_dots_symmetric_f16 */
671
- NK_PUBLIC void nk_dots_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
698
+ NK_PUBLIC void nk_dots_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
672
699
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
673
700
  nk_size_t row_start, nk_size_t row_count);
674
701
 
@@ -681,7 +708,7 @@ NK_PUBLIC void nk_dots_pack_bf16_sme(nk_bf16_t const *b, nk_size_t width, nk_siz
681
708
  NK_PUBLIC void nk_dots_packed_bf16_sme(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
682
709
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
683
710
  /** @copydoc nk_dots_symmetric_bf16 */
684
- NK_PUBLIC void nk_dots_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
711
+ NK_PUBLIC void nk_dots_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
685
712
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
686
713
  nk_size_t row_start, nk_size_t row_count);
687
714
 
@@ -694,9 +721,9 @@ NK_PUBLIC void nk_dots_pack_i8_sme(nk_i8_t const *b, nk_size_t width, nk_size_t
694
721
  NK_PUBLIC void nk_dots_packed_i8_sme(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
695
722
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
696
723
  /** @copydoc nk_dots_symmetric_i8 */
697
- NK_PUBLIC void nk_dots_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
698
- nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
699
- nk_size_t row_count);
724
+ NK_PUBLIC void nk_dots_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
725
+ nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
726
+ nk_size_t row_start, nk_size_t row_count);
700
727
 
701
728
  /** @copydoc nk_dots_packed_size_u8 */
702
729
  NK_PUBLIC nk_size_t nk_dots_packed_size_u8_sme(nk_size_t width, nk_size_t depth);
@@ -707,9 +734,9 @@ NK_PUBLIC void nk_dots_pack_u8_sme(nk_u8_t const *b, nk_size_t width, nk_size_t
707
734
  NK_PUBLIC void nk_dots_packed_u8_sme(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
708
735
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
709
736
  /** @copydoc nk_dots_symmetric_u8 */
710
- NK_PUBLIC void nk_dots_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
711
- nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
712
- nk_size_t row_count);
737
+ NK_PUBLIC void nk_dots_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
738
+ nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
739
+ nk_size_t row_start, nk_size_t row_count);
713
740
 
714
741
  /** @copydoc nk_dots_packed_size_e4m3 */
715
742
  NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3_sme(nk_size_t width, nk_size_t depth);
@@ -720,7 +747,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_sme(nk_e4m3_t const *b, nk_size_t width, nk_siz
720
747
  NK_PUBLIC void nk_dots_packed_e4m3_sme(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
721
748
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
722
749
  /** @copydoc nk_dots_symmetric_e4m3 */
723
- NK_PUBLIC void nk_dots_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
750
+ NK_PUBLIC void nk_dots_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
724
751
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
725
752
  nk_size_t row_start, nk_size_t row_count);
726
753
 
@@ -733,7 +760,7 @@ NK_PUBLIC void nk_dots_pack_e5m2_sme(nk_e5m2_t const *b, nk_size_t width, nk_siz
733
760
  NK_PUBLIC void nk_dots_packed_e5m2_sme(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
734
761
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
735
762
  /** @copydoc nk_dots_symmetric_e5m2 */
736
- NK_PUBLIC void nk_dots_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
763
+ NK_PUBLIC void nk_dots_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
737
764
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
738
765
  nk_size_t row_start, nk_size_t row_count);
739
766
 
@@ -746,7 +773,7 @@ NK_PUBLIC void nk_dots_pack_u4_sme(nk_u4x2_t const *b, nk_size_t width, nk_size_
746
773
  NK_PUBLIC void nk_dots_packed_u4_sme(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
747
774
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
748
775
  /** @copydoc nk_dots_symmetric_u4 */
749
- NK_PUBLIC void nk_dots_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
776
+ NK_PUBLIC void nk_dots_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
750
777
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
751
778
  nk_size_t row_start, nk_size_t row_count);
752
779
 
@@ -759,7 +786,7 @@ NK_PUBLIC void nk_dots_pack_i4_sme(nk_i4x2_t const *b, nk_size_t width, nk_size_
759
786
  NK_PUBLIC void nk_dots_packed_i4_sme(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
760
787
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
761
788
  /** @copydoc nk_dots_symmetric_i4 */
762
- NK_PUBLIC void nk_dots_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
789
+ NK_PUBLIC void nk_dots_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
763
790
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
764
791
  nk_size_t row_start, nk_size_t row_count);
765
792
 
@@ -772,7 +799,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_sme(nk_e2m3_t const *b, nk_size_t width, nk_siz
772
799
  NK_PUBLIC void nk_dots_packed_e2m3_sme(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
773
800
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
774
801
  /** @copydoc nk_dots_symmetric_e2m3 */
775
- NK_PUBLIC void nk_dots_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
802
+ NK_PUBLIC void nk_dots_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
776
803
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
777
804
  nk_size_t row_start, nk_size_t row_count);
778
805
 
@@ -785,7 +812,7 @@ NK_PUBLIC void nk_dots_pack_e3m2_sme(nk_e3m2_t const *b, nk_size_t width, nk_siz
785
812
  NK_PUBLIC void nk_dots_packed_e3m2_sme(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
786
813
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
787
814
  /** @copydoc nk_dots_symmetric_e3m2 */
788
- NK_PUBLIC void nk_dots_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
815
+ NK_PUBLIC void nk_dots_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
789
816
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
790
817
  nk_size_t row_start, nk_size_t row_count);
791
818
  #endif // NK_TARGET_SME
@@ -803,7 +830,7 @@ NK_PUBLIC void nk_dots_pack_u1_smebi32(nk_u1x8_t const *b, nk_size_t width, nk_s
803
830
  NK_PUBLIC void nk_dots_packed_u1_smebi32(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
804
831
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
805
832
  /** @copydoc nk_dots_symmetric_u1 */
806
- NK_PUBLIC void nk_dots_symmetric_u1_smebi32(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
833
+ NK_PUBLIC void nk_dots_symmetric_u1_smebi32(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
807
834
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
808
835
  nk_size_t row_start, nk_size_t row_count);
809
836
  #endif // NK_TARGET_SMEBI32
@@ -821,7 +848,7 @@ NK_PUBLIC void nk_dots_pack_f32_smef64(nk_f32_t const *b, nk_size_t width, nk_si
821
848
  NK_PUBLIC void nk_dots_packed_f32_smef64(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
822
849
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
823
850
  /** @copydoc nk_dots_symmetric_f32 */
824
- NK_PUBLIC void nk_dots_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
851
+ NK_PUBLIC void nk_dots_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
825
852
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
826
853
  nk_size_t row_start, nk_size_t row_count);
827
854
 
@@ -834,7 +861,7 @@ NK_PUBLIC void nk_dots_pack_f64_smef64(nk_f64_t const *b, nk_size_t width, nk_si
834
861
  NK_PUBLIC void nk_dots_packed_f64_smef64(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
835
862
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
836
863
  /** @copydoc nk_dots_symmetric_f64 */
837
- NK_PUBLIC void nk_dots_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
864
+ NK_PUBLIC void nk_dots_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
838
865
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
839
866
  nk_size_t row_start, nk_size_t row_count);
840
867
  #endif // NK_TARGET_SMEF64
@@ -852,7 +879,7 @@ NK_PUBLIC void nk_dots_pack_f32_haswell(nk_f32_t const *b, nk_size_t width, nk_s
852
879
  NK_PUBLIC void nk_dots_packed_f32_haswell(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
853
880
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
854
881
  /** @copydoc nk_dots_symmetric_f32 */
855
- NK_PUBLIC void nk_dots_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
882
+ NK_PUBLIC void nk_dots_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
856
883
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
857
884
  nk_size_t row_start, nk_size_t row_count);
858
885
  /** @copydoc nk_dots_packed_size_f64 */
@@ -864,7 +891,7 @@ NK_PUBLIC void nk_dots_pack_f64_haswell(nk_f64_t const *b, nk_size_t width, nk_s
864
891
  NK_PUBLIC void nk_dots_packed_f64_haswell(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
865
892
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
866
893
  /** @copydoc nk_dots_symmetric_f64 */
867
- NK_PUBLIC void nk_dots_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
894
+ NK_PUBLIC void nk_dots_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
868
895
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
869
896
  nk_size_t row_start, nk_size_t row_count);
870
897
  /** @copydoc nk_dots_packed_size_f16 */
@@ -876,7 +903,7 @@ NK_PUBLIC void nk_dots_pack_f16_haswell(nk_f16_t const *b, nk_size_t width, nk_s
876
903
  NK_PUBLIC void nk_dots_packed_f16_haswell(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
877
904
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
878
905
  /** @copydoc nk_dots_symmetric_f16 */
879
- NK_PUBLIC void nk_dots_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
906
+ NK_PUBLIC void nk_dots_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
880
907
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
881
908
  nk_size_t row_start, nk_size_t row_count);
882
909
  /** @copydoc nk_dots_packed_size_bf16 */
@@ -888,7 +915,7 @@ NK_PUBLIC void nk_dots_pack_bf16_haswell(nk_bf16_t const *b, nk_size_t width, nk
888
915
  NK_PUBLIC void nk_dots_packed_bf16_haswell(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
889
916
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
890
917
  /** @copydoc nk_dots_symmetric_bf16 */
891
- NK_PUBLIC void nk_dots_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
918
+ NK_PUBLIC void nk_dots_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
892
919
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
893
920
  nk_size_t row_start, nk_size_t row_count);
894
921
  /** @copydoc nk_dots_packed_size_e4m3 */
@@ -900,7 +927,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_haswell(nk_e4m3_t const *b, nk_size_t width, nk
900
927
  NK_PUBLIC void nk_dots_packed_e4m3_haswell(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
901
928
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
902
929
  /** @copydoc nk_dots_symmetric_e4m3 */
903
- NK_PUBLIC void nk_dots_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
930
+ NK_PUBLIC void nk_dots_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
904
931
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
905
932
  nk_size_t row_start, nk_size_t row_count);
906
933
  /** @copydoc nk_dots_packed_size_e5m2 */
@@ -912,7 +939,7 @@ NK_PUBLIC void nk_dots_pack_e5m2_haswell(nk_e5m2_t const *b, nk_size_t width, nk
912
939
  NK_PUBLIC void nk_dots_packed_e5m2_haswell(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
913
940
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
914
941
  /** @copydoc nk_dots_symmetric_e5m2 */
915
- NK_PUBLIC void nk_dots_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
942
+ NK_PUBLIC void nk_dots_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
916
943
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
917
944
  nk_size_t row_start, nk_size_t row_count);
918
945
  /** @copydoc nk_dots_packed_size_e2m3 */
@@ -924,7 +951,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_haswell(nk_e2m3_t const *b, nk_size_t width, nk
924
951
  NK_PUBLIC void nk_dots_packed_e2m3_haswell(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
925
952
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
926
953
  /** @copydoc nk_dots_symmetric_e2m3 */
927
- NK_PUBLIC void nk_dots_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
954
+ NK_PUBLIC void nk_dots_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
928
955
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
929
956
  nk_size_t row_start, nk_size_t row_count);
930
957
  /** @copydoc nk_dots_packed_size_e3m2 */
@@ -936,7 +963,7 @@ NK_PUBLIC void nk_dots_pack_e3m2_haswell(nk_e3m2_t const *b, nk_size_t width, nk
936
963
  NK_PUBLIC void nk_dots_packed_e3m2_haswell(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
937
964
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
938
965
  /** @copydoc nk_dots_symmetric_e3m2 */
939
- NK_PUBLIC void nk_dots_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
966
+ NK_PUBLIC void nk_dots_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
940
967
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
941
968
  nk_size_t row_start, nk_size_t row_count);
942
969
  /** @copydoc nk_dots_packed_size_i8 */
@@ -948,7 +975,7 @@ NK_PUBLIC void nk_dots_pack_i8_haswell(nk_i8_t const *b, nk_size_t width, nk_siz
948
975
  NK_PUBLIC void nk_dots_packed_i8_haswell(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
949
976
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
950
977
  /** @copydoc nk_dots_symmetric_i8 */
951
- NK_PUBLIC void nk_dots_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
978
+ NK_PUBLIC void nk_dots_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
952
979
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
953
980
  nk_size_t row_start, nk_size_t row_count);
954
981
  /** @copydoc nk_dots_packed_size_u8 */
@@ -960,7 +987,7 @@ NK_PUBLIC void nk_dots_pack_u8_haswell(nk_u8_t const *b, nk_size_t width, nk_siz
960
987
  NK_PUBLIC void nk_dots_packed_u8_haswell(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
961
988
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
962
989
  /** @copydoc nk_dots_symmetric_u8 */
963
- NK_PUBLIC void nk_dots_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
990
+ NK_PUBLIC void nk_dots_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
964
991
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
965
992
  nk_size_t row_start, nk_size_t row_count);
966
993
  /** @copydoc nk_dots_packed_size_u1 */
@@ -972,7 +999,7 @@ NK_PUBLIC void nk_dots_pack_u1_haswell(nk_u1x8_t const *b, nk_size_t width, nk_s
972
999
  NK_PUBLIC void nk_dots_packed_u1_haswell(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
973
1000
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
974
1001
  /** @copydoc nk_dots_symmetric_u1 */
975
- NK_PUBLIC void nk_dots_symmetric_u1_haswell(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1002
+ NK_PUBLIC void nk_dots_symmetric_u1_haswell(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
976
1003
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
977
1004
  nk_size_t row_start, nk_size_t row_count);
978
1005
  /** @copydoc nk_dots_packed_size_i4 */
@@ -984,7 +1011,7 @@ NK_PUBLIC void nk_dots_pack_i4_haswell(nk_i4x2_t const *b, nk_size_t width, nk_s
984
1011
  NK_PUBLIC void nk_dots_packed_i4_haswell(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
985
1012
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
986
1013
  /** @copydoc nk_dots_symmetric_i4 */
987
- NK_PUBLIC void nk_dots_symmetric_i4_haswell(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1014
+ NK_PUBLIC void nk_dots_symmetric_i4_haswell(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
988
1015
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
989
1016
  nk_size_t row_start, nk_size_t row_count);
990
1017
  /** @copydoc nk_dots_packed_size_u4 */
@@ -996,7 +1023,7 @@ NK_PUBLIC void nk_dots_pack_u4_haswell(nk_u4x2_t const *b, nk_size_t width, nk_s
996
1023
  NK_PUBLIC void nk_dots_packed_u4_haswell(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
997
1024
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
998
1025
  /** @copydoc nk_dots_symmetric_u4 */
999
- NK_PUBLIC void nk_dots_symmetric_u4_haswell(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1026
+ NK_PUBLIC void nk_dots_symmetric_u4_haswell(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1000
1027
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1001
1028
  nk_size_t row_start, nk_size_t row_count);
1002
1029
  #endif // NK_TARGET_HASWELL
@@ -1014,7 +1041,7 @@ NK_PUBLIC void nk_dots_pack_f64_skylake(nk_f64_t const *b, nk_size_t width, nk_s
1014
1041
  NK_PUBLIC void nk_dots_packed_f64_skylake(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1015
1042
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1016
1043
  /** @copydoc nk_dots_symmetric_f64 */
1017
- NK_PUBLIC void nk_dots_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1044
+ NK_PUBLIC void nk_dots_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1018
1045
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1019
1046
  nk_size_t row_start, nk_size_t row_count);
1020
1047
  /** @copydoc nk_dots_packed_size_f32 */
@@ -1026,7 +1053,7 @@ NK_PUBLIC void nk_dots_pack_f32_skylake(nk_f32_t const *b, nk_size_t width, nk_s
1026
1053
  NK_PUBLIC void nk_dots_packed_f32_skylake(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1027
1054
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1028
1055
  /** @copydoc nk_dots_symmetric_f32 */
1029
- NK_PUBLIC void nk_dots_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1056
+ NK_PUBLIC void nk_dots_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1030
1057
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1031
1058
  nk_size_t row_start, nk_size_t row_count);
1032
1059
  /** @copydoc nk_dots_packed_size_bf16 */
@@ -1038,7 +1065,7 @@ NK_PUBLIC void nk_dots_pack_bf16_skylake(nk_bf16_t const *b, nk_size_t width, nk
1038
1065
  NK_PUBLIC void nk_dots_packed_bf16_skylake(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1039
1066
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1040
1067
  /** @copydoc nk_dots_symmetric_bf16 */
1041
- NK_PUBLIC void nk_dots_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1068
+ NK_PUBLIC void nk_dots_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1042
1069
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1043
1070
  nk_size_t row_start, nk_size_t row_count);
1044
1071
  /** @copydoc nk_dots_packed_size_f16 */
@@ -1050,7 +1077,7 @@ NK_PUBLIC void nk_dots_pack_f16_skylake(nk_f16_t const *b, nk_size_t width, nk_s
1050
1077
  NK_PUBLIC void nk_dots_packed_f16_skylake(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1051
1078
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1052
1079
  /** @copydoc nk_dots_symmetric_f16 */
1053
- NK_PUBLIC void nk_dots_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1080
+ NK_PUBLIC void nk_dots_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1054
1081
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1055
1082
  nk_size_t row_start, nk_size_t row_count);
1056
1083
  /** @copydoc nk_dots_packed_size_e4m3 */
@@ -1062,7 +1089,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_skylake(nk_e4m3_t const *b, nk_size_t width, nk
1062
1089
  NK_PUBLIC void nk_dots_packed_e4m3_skylake(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1063
1090
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1064
1091
  /** @copydoc nk_dots_symmetric_e4m3 */
1065
- NK_PUBLIC void nk_dots_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1092
+ NK_PUBLIC void nk_dots_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1066
1093
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1067
1094
  nk_size_t row_start, nk_size_t row_count);
1068
1095
  /** @copydoc nk_dots_packed_size_e5m2 */
@@ -1074,7 +1101,7 @@ NK_PUBLIC void nk_dots_pack_e5m2_skylake(nk_e5m2_t const *b, nk_size_t width, nk
1074
1101
  NK_PUBLIC void nk_dots_packed_e5m2_skylake(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1075
1102
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1076
1103
  /** @copydoc nk_dots_symmetric_e5m2 */
1077
- NK_PUBLIC void nk_dots_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1104
+ NK_PUBLIC void nk_dots_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1078
1105
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1079
1106
  nk_size_t row_start, nk_size_t row_count);
1080
1107
  /** @copydoc nk_dots_packed_size_e2m3 */
@@ -1086,7 +1113,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_skylake(nk_e2m3_t const *b, nk_size_t width, nk
1086
1113
  NK_PUBLIC void nk_dots_packed_e2m3_skylake(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1087
1114
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1088
1115
  /** @copydoc nk_dots_symmetric_e2m3 */
1089
- NK_PUBLIC void nk_dots_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1116
+ NK_PUBLIC void nk_dots_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1090
1117
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1091
1118
  nk_size_t row_start, nk_size_t row_count);
1092
1119
  /** @copydoc nk_dots_packed_size_e3m2 */
@@ -1098,7 +1125,7 @@ NK_PUBLIC void nk_dots_pack_e3m2_skylake(nk_e3m2_t const *b, nk_size_t width, nk
1098
1125
  NK_PUBLIC void nk_dots_packed_e3m2_skylake(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1099
1126
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1100
1127
  /** @copydoc nk_dots_symmetric_e3m2 */
1101
- NK_PUBLIC void nk_dots_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1128
+ NK_PUBLIC void nk_dots_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1102
1129
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1103
1130
  nk_size_t row_start, nk_size_t row_count);
1104
1131
  #endif // NK_TARGET_SKYLAKE
@@ -1116,7 +1143,7 @@ NK_PUBLIC void nk_dots_pack_i8_icelake(nk_i8_t const *b, nk_size_t width, nk_siz
1116
1143
  NK_PUBLIC void nk_dots_packed_i8_icelake(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1117
1144
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1118
1145
  /** @copydoc nk_dots_symmetric_i8 */
1119
- NK_PUBLIC void nk_dots_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1146
+ NK_PUBLIC void nk_dots_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1120
1147
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1121
1148
  nk_size_t row_start, nk_size_t row_count);
1122
1149
  /** @copydoc nk_dots_packed_size_u8 */
@@ -1128,7 +1155,7 @@ NK_PUBLIC void nk_dots_pack_u8_icelake(nk_u8_t const *b, nk_size_t width, nk_siz
1128
1155
  NK_PUBLIC void nk_dots_packed_u8_icelake(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1129
1156
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1130
1157
  /** @copydoc nk_dots_symmetric_u8 */
1131
- NK_PUBLIC void nk_dots_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1158
+ NK_PUBLIC void nk_dots_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1132
1159
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1133
1160
  nk_size_t row_start, nk_size_t row_count);
1134
1161
  /** @copydoc nk_dots_packed_size_i4 */
@@ -1140,7 +1167,7 @@ NK_PUBLIC void nk_dots_pack_i4_icelake(nk_i4x2_t const *b, nk_size_t width, nk_s
1140
1167
  NK_PUBLIC void nk_dots_packed_i4_icelake(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1141
1168
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1142
1169
  /** @copydoc nk_dots_symmetric_i4 */
1143
- NK_PUBLIC void nk_dots_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1170
+ NK_PUBLIC void nk_dots_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1144
1171
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1145
1172
  nk_size_t row_start, nk_size_t row_count);
1146
1173
  /** @copydoc nk_dots_packed_size_u4 */
@@ -1152,7 +1179,7 @@ NK_PUBLIC void nk_dots_pack_u4_icelake(nk_u4x2_t const *b, nk_size_t width, nk_s
1152
1179
  NK_PUBLIC void nk_dots_packed_u4_icelake(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1153
1180
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1154
1181
  /** @copydoc nk_dots_symmetric_u4 */
1155
- NK_PUBLIC void nk_dots_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1182
+ NK_PUBLIC void nk_dots_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1156
1183
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1157
1184
  nk_size_t row_start, nk_size_t row_count);
1158
1185
  /** @copydoc nk_dots_packed_size_u1 */
@@ -1164,7 +1191,7 @@ NK_PUBLIC void nk_dots_pack_u1_icelake(nk_u1x8_t const *b, nk_size_t width, nk_s
1164
1191
  NK_PUBLIC void nk_dots_packed_u1_icelake(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1165
1192
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1166
1193
  /** @copydoc nk_dots_symmetric_u1 */
1167
- NK_PUBLIC void nk_dots_symmetric_u1_icelake(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1194
+ NK_PUBLIC void nk_dots_symmetric_u1_icelake(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1168
1195
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1169
1196
  nk_size_t row_start, nk_size_t row_count);
1170
1197
  #endif // NK_TARGET_ICELAKE
@@ -1182,7 +1209,7 @@ NK_PUBLIC void nk_dots_pack_i8_alder(nk_i8_t const *b, nk_size_t width, nk_size_
1182
1209
  NK_PUBLIC void nk_dots_packed_i8_alder(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1183
1210
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1184
1211
  /** @copydoc nk_dots_symmetric_i8 */
1185
- NK_PUBLIC void nk_dots_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1212
+ NK_PUBLIC void nk_dots_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1186
1213
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1187
1214
  nk_size_t row_start, nk_size_t row_count);
1188
1215
  /** @copydoc nk_dots_packed_size_u8 */
@@ -1194,7 +1221,7 @@ NK_PUBLIC void nk_dots_pack_u8_alder(nk_u8_t const *b, nk_size_t width, nk_size_
1194
1221
  NK_PUBLIC void nk_dots_packed_u8_alder(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1195
1222
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1196
1223
  /** @copydoc nk_dots_symmetric_u8 */
1197
- NK_PUBLIC void nk_dots_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1224
+ NK_PUBLIC void nk_dots_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1198
1225
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1199
1226
  nk_size_t row_start, nk_size_t row_count);
1200
1227
  /** @copydoc nk_dots_packed_size_e2m3 */
@@ -1206,7 +1233,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_alder(nk_e2m3_t const *b, nk_size_t width, nk_s
1206
1233
  NK_PUBLIC void nk_dots_packed_e2m3_alder(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1207
1234
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1208
1235
  /** @copydoc nk_dots_symmetric_e2m3 */
1209
- NK_PUBLIC void nk_dots_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1236
+ NK_PUBLIC void nk_dots_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1210
1237
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1211
1238
  nk_size_t row_start, nk_size_t row_count);
1212
1239
  #endif // NK_TARGET_ALDER
@@ -1224,7 +1251,7 @@ NK_PUBLIC void nk_dots_pack_i8_sierra(nk_i8_t const *b, nk_size_t width, nk_size
1224
1251
  NK_PUBLIC void nk_dots_packed_i8_sierra(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1225
1252
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1226
1253
  /** @copydoc nk_dots_symmetric_i8 */
1227
- NK_PUBLIC void nk_dots_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1254
+ NK_PUBLIC void nk_dots_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1228
1255
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1229
1256
  nk_size_t row_start, nk_size_t row_count);
1230
1257
  /** @copydoc nk_dots_packed_size_u8 */
@@ -1236,7 +1263,7 @@ NK_PUBLIC void nk_dots_pack_u8_sierra(nk_u8_t const *b, nk_size_t width, nk_size
1236
1263
  NK_PUBLIC void nk_dots_packed_u8_sierra(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1237
1264
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1238
1265
  /** @copydoc nk_dots_symmetric_u8 */
1239
- NK_PUBLIC void nk_dots_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1266
+ NK_PUBLIC void nk_dots_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1240
1267
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1241
1268
  nk_size_t row_start, nk_size_t row_count);
1242
1269
  /** @copydoc nk_dots_packed_size_e2m3 */
@@ -1248,7 +1275,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_sierra(nk_e2m3_t const *b, nk_size_t width, nk_
1248
1275
  NK_PUBLIC void nk_dots_packed_e2m3_sierra(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1249
1276
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1250
1277
  /** @copydoc nk_dots_symmetric_e2m3 */
1251
- NK_PUBLIC void nk_dots_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1278
+ NK_PUBLIC void nk_dots_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1252
1279
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1253
1280
  nk_size_t row_start, nk_size_t row_count);
1254
1281
  #endif // NK_TARGET_SIERRA
@@ -1266,7 +1293,7 @@ NK_PUBLIC void nk_dots_pack_i8_v128relaxed(nk_i8_t const *b, nk_size_t width, nk
1266
1293
  NK_PUBLIC void nk_dots_packed_i8_v128relaxed(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1267
1294
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1268
1295
  /** @copydoc nk_dots_symmetric_i8 */
1269
- NK_PUBLIC void nk_dots_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1296
+ NK_PUBLIC void nk_dots_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1270
1297
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1271
1298
  nk_size_t row_start, nk_size_t row_count);
1272
1299
  /** @copydoc nk_dots_packed_size_u8 */
@@ -1278,7 +1305,7 @@ NK_PUBLIC void nk_dots_pack_u8_v128relaxed(nk_u8_t const *b, nk_size_t width, nk
1278
1305
  NK_PUBLIC void nk_dots_packed_u8_v128relaxed(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1279
1306
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1280
1307
  /** @copydoc nk_dots_symmetric_u8 */
1281
- NK_PUBLIC void nk_dots_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1308
+ NK_PUBLIC void nk_dots_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1282
1309
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1283
1310
  nk_size_t row_start, nk_size_t row_count);
1284
1311
  /** @copydoc nk_dots_packed_size_e2m3 */
@@ -1291,7 +1318,7 @@ NK_PUBLIC void nk_dots_packed_e2m3_v128relaxed(nk_e2m3_t const *a, void const *b
1291
1318
  nk_size_t width, nk_size_t depth, nk_size_t a_stride,
1292
1319
  nk_size_t c_stride);
1293
1320
  /** @copydoc nk_dots_symmetric_e2m3 */
1294
- NK_PUBLIC void nk_dots_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1321
+ NK_PUBLIC void nk_dots_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1295
1322
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1296
1323
  nk_size_t row_start, nk_size_t row_count);
1297
1324
  /** @copydoc nk_dots_packed_size_bf16 */
@@ -1304,7 +1331,7 @@ NK_PUBLIC void nk_dots_packed_bf16_v128relaxed(nk_bf16_t const *a, void const *b
1304
1331
  nk_size_t width, nk_size_t depth, nk_size_t a_stride,
1305
1332
  nk_size_t c_stride);
1306
1333
  /** @copydoc nk_dots_symmetric_bf16 */
1307
- NK_PUBLIC void nk_dots_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1334
+ NK_PUBLIC void nk_dots_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1308
1335
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1309
1336
  nk_size_t row_start, nk_size_t row_count);
1310
1337
  /** @copydoc nk_dots_packed_size_f32 */
@@ -1316,7 +1343,7 @@ NK_PUBLIC void nk_dots_pack_f32_v128relaxed(nk_f32_t const *b, nk_size_t width,
1316
1343
  NK_PUBLIC void nk_dots_packed_f32_v128relaxed(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1317
1344
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1318
1345
  /** @copydoc nk_dots_symmetric_f32 */
1319
- NK_PUBLIC void nk_dots_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1346
+ NK_PUBLIC void nk_dots_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1320
1347
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1321
1348
  nk_size_t row_start, nk_size_t row_count);
1322
1349
  /** @copydoc nk_dots_packed_size_f64 */
@@ -1328,7 +1355,7 @@ NK_PUBLIC void nk_dots_pack_f64_v128relaxed(nk_f64_t const *b, nk_size_t width,
1328
1355
  NK_PUBLIC void nk_dots_packed_f64_v128relaxed(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1329
1356
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1330
1357
  /** @copydoc nk_dots_symmetric_f64 */
1331
- NK_PUBLIC void nk_dots_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1358
+ NK_PUBLIC void nk_dots_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1332
1359
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1333
1360
  nk_size_t row_start, nk_size_t row_count);
1334
1361
  /** @copydoc nk_dots_packed_size_e4m3 */
@@ -1341,7 +1368,7 @@ NK_PUBLIC void nk_dots_packed_e4m3_v128relaxed(nk_e4m3_t const *a, void const *b
1341
1368
  nk_size_t width, nk_size_t depth, nk_size_t a_stride,
1342
1369
  nk_size_t c_stride);
1343
1370
  /** @copydoc nk_dots_symmetric_e4m3 */
1344
- NK_PUBLIC void nk_dots_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1371
+ NK_PUBLIC void nk_dots_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1345
1372
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1346
1373
  nk_size_t row_start, nk_size_t row_count);
1347
1374
  /** @copydoc nk_dots_packed_size_e5m2 */
@@ -1354,7 +1381,7 @@ NK_PUBLIC void nk_dots_packed_e5m2_v128relaxed(nk_e5m2_t const *a, void const *b
1354
1381
  nk_size_t width, nk_size_t depth, nk_size_t a_stride,
1355
1382
  nk_size_t c_stride);
1356
1383
  /** @copydoc nk_dots_symmetric_e5m2 */
1357
- NK_PUBLIC void nk_dots_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1384
+ NK_PUBLIC void nk_dots_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1358
1385
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1359
1386
  nk_size_t row_start, nk_size_t row_count);
1360
1387
  /** @copydoc nk_dots_packed_size_u4 */
@@ -1366,7 +1393,7 @@ NK_PUBLIC void nk_dots_pack_u4_v128relaxed(nk_u4x2_t const *b, nk_size_t width,
1366
1393
  NK_PUBLIC void nk_dots_packed_u4_v128relaxed(nk_u4x2_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1367
1394
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1368
1395
  /** @copydoc nk_dots_symmetric_u4 */
1369
- NK_PUBLIC void nk_dots_symmetric_u4_v128relaxed(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1396
+ NK_PUBLIC void nk_dots_symmetric_u4_v128relaxed(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1370
1397
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1371
1398
  nk_size_t row_start, nk_size_t row_count);
1372
1399
  /** @copydoc nk_dots_packed_size_i4 */
@@ -1378,7 +1405,7 @@ NK_PUBLIC void nk_dots_pack_i4_v128relaxed(nk_i4x2_t const *b, nk_size_t width,
1378
1405
  NK_PUBLIC void nk_dots_packed_i4_v128relaxed(nk_i4x2_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1379
1406
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1380
1407
  /** @copydoc nk_dots_symmetric_i4 */
1381
- NK_PUBLIC void nk_dots_symmetric_i4_v128relaxed(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1408
+ NK_PUBLIC void nk_dots_symmetric_i4_v128relaxed(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1382
1409
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1383
1410
  nk_size_t row_start, nk_size_t row_count);
1384
1411
  /** @copydoc nk_dots_packed_size_u1 */
@@ -1390,7 +1417,7 @@ NK_PUBLIC void nk_dots_pack_u1_v128relaxed(nk_u1x8_t const *b, nk_size_t width,
1390
1417
  NK_PUBLIC void nk_dots_packed_u1_v128relaxed(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1391
1418
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1392
1419
  /** @copydoc nk_dots_symmetric_u1 */
1393
- NK_PUBLIC void nk_dots_symmetric_u1_v128relaxed(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1420
+ NK_PUBLIC void nk_dots_symmetric_u1_v128relaxed(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1394
1421
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1395
1422
  nk_size_t row_start, nk_size_t row_count);
1396
1423
  #endif // NK_TARGET_V128RELAXED
@@ -1408,7 +1435,7 @@ NK_PUBLIC void nk_dots_pack_f32_neon(nk_f32_t const *b, nk_size_t width, nk_size
1408
1435
  NK_PUBLIC void nk_dots_packed_f32_neon(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1409
1436
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1410
1437
  /** @copydoc nk_dots_symmetric_f32 */
1411
- NK_PUBLIC void nk_dots_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1438
+ NK_PUBLIC void nk_dots_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1412
1439
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1413
1440
  nk_size_t row_start, nk_size_t row_count);
1414
1441
  /** @copydoc nk_dots_packed_size_f64 */
@@ -1420,7 +1447,7 @@ NK_PUBLIC void nk_dots_pack_f64_neon(nk_f64_t const *b, nk_size_t width, nk_size
1420
1447
  NK_PUBLIC void nk_dots_packed_f64_neon(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1421
1448
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1422
1449
  /** @copydoc nk_dots_symmetric_f64 */
1423
- NK_PUBLIC void nk_dots_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1450
+ NK_PUBLIC void nk_dots_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1424
1451
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1425
1452
  nk_size_t row_start, nk_size_t row_count);
1426
1453
  /** @copydoc nk_dots_packed_size_u1 */
@@ -1432,7 +1459,7 @@ NK_PUBLIC void nk_dots_pack_u1_neon(nk_u1x8_t const *b, nk_size_t width, nk_size
1432
1459
  NK_PUBLIC void nk_dots_packed_u1_neon(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1433
1460
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1434
1461
  /** @copydoc nk_dots_symmetric_u1 */
1435
- NK_PUBLIC void nk_dots_symmetric_u1_neon(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1462
+ NK_PUBLIC void nk_dots_symmetric_u1_neon(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1436
1463
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1437
1464
  nk_size_t row_start, nk_size_t row_count);
1438
1465
  /** @copydoc nk_dots_packed_size_f16 */
@@ -1444,7 +1471,7 @@ NK_PUBLIC void nk_dots_pack_f16_neon(nk_f16_t const *b, nk_size_t width, nk_size
1444
1471
  NK_PUBLIC void nk_dots_packed_f16_neon(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1445
1472
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1446
1473
  /** @copydoc nk_dots_symmetric_f16 */
1447
- NK_PUBLIC void nk_dots_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1474
+ NK_PUBLIC void nk_dots_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1448
1475
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1449
1476
  nk_size_t row_start, nk_size_t row_count);
1450
1477
  /** @copydoc nk_dots_packed_size_bf16 */
@@ -1456,29 +1483,11 @@ NK_PUBLIC void nk_dots_pack_bf16_neon(nk_bf16_t const *b, nk_size_t width, nk_si
1456
1483
  NK_PUBLIC void nk_dots_packed_bf16_neon(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1457
1484
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1458
1485
  /** @copydoc nk_dots_symmetric_bf16 */
1459
- NK_PUBLIC void nk_dots_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1486
+ NK_PUBLIC void nk_dots_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1460
1487
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1461
1488
  nk_size_t row_start, nk_size_t row_count);
1462
1489
  #endif // NK_TARGET_NEON
1463
1490
 
1464
- /* ARM NEON with F16 arithmetic (ARMv8.2-A FP16).
1465
- * Provides native F16 FMLA for half-precision dot products.
1466
- */
1467
- #if NK_TARGET_NEONHALF
1468
- /** @copydoc nk_dots_packed_size_f16 */
1469
- NK_PUBLIC nk_size_t nk_dots_packed_size_f16_neonhalf(nk_size_t width, nk_size_t depth);
1470
- /** @copydoc nk_dots_pack_f16 */
1471
- NK_PUBLIC void nk_dots_pack_f16_neonhalf(nk_f16_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1472
- void *b_packed);
1473
- /** @copydoc nk_dots_packed_f16 */
1474
- NK_PUBLIC void nk_dots_packed_f16_neonhalf(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1475
- nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1476
- /** @copydoc nk_dots_symmetric_f16 */
1477
- NK_PUBLIC void nk_dots_symmetric_f16_neonhalf(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1478
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1479
- nk_size_t row_start, nk_size_t row_count);
1480
- #endif // NK_TARGET_NEONHALF
1481
-
1482
1491
  /* ARM NEON with BF16 dot product (ARMv8.6-A BF16).
1483
1492
  * Uses BFDOT/BFMMLA for efficient BF16 matrix operations.
1484
1493
  */
@@ -1492,7 +1501,7 @@ NK_PUBLIC void nk_dots_pack_bf16_neonbfdot(nk_bf16_t const *b, nk_size_t width,
1492
1501
  NK_PUBLIC void nk_dots_packed_bf16_neonbfdot(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1493
1502
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1494
1503
  /** @copydoc nk_dots_symmetric_bf16 */
1495
- NK_PUBLIC void nk_dots_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1504
+ NK_PUBLIC void nk_dots_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1496
1505
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1497
1506
  nk_size_t row_start, nk_size_t row_count);
1498
1507
  #endif // NK_TARGET_NEONBFDOT
@@ -1510,7 +1519,7 @@ NK_PUBLIC void nk_dots_pack_i8_neonsdot(nk_i8_t const *b, nk_size_t width, nk_si
1510
1519
  NK_PUBLIC void nk_dots_packed_i8_neonsdot(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1511
1520
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1512
1521
  /** @copydoc nk_dots_symmetric_i8 */
1513
- NK_PUBLIC void nk_dots_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1522
+ NK_PUBLIC void nk_dots_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1514
1523
  nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1515
1524
  nk_size_t row_start, nk_size_t row_count);
1516
1525
  /** @copydoc nk_dots_packed_size_u8 */
@@ -1522,7 +1531,7 @@ NK_PUBLIC void nk_dots_pack_u8_neonsdot(nk_u8_t const *b, nk_size_t width, nk_si
1522
1531
  NK_PUBLIC void nk_dots_packed_u8_neonsdot(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1523
1532
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1524
1533
  /** @copydoc nk_dots_symmetric_u8 */
1525
- NK_PUBLIC void nk_dots_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1534
+ NK_PUBLIC void nk_dots_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1526
1535
  nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1527
1536
  nk_size_t row_start, nk_size_t row_count);
1528
1537
  #endif // NK_TARGET_NEONSDOT
@@ -1540,7 +1549,7 @@ NK_PUBLIC void nk_dots_pack_f16_neonfhm(nk_f16_t const *b, nk_size_t width, nk_s
1540
1549
  NK_PUBLIC void nk_dots_packed_f16_neonfhm(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1541
1550
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1542
1551
  /** @copydoc nk_dots_symmetric_f16 */
1543
- NK_PUBLIC void nk_dots_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1552
+ NK_PUBLIC void nk_dots_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1544
1553
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1545
1554
  nk_size_t row_start, nk_size_t row_count);
1546
1555
  /** @copydoc nk_dots_packed_size_e4m3 */
@@ -1552,7 +1561,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_neonfhm(nk_e4m3_t const *b, nk_size_t width, nk
1552
1561
  NK_PUBLIC void nk_dots_packed_e4m3_neonfhm(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1553
1562
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1554
1563
  /** @copydoc nk_dots_symmetric_e4m3 */
1555
- NK_PUBLIC void nk_dots_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1564
+ NK_PUBLIC void nk_dots_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1556
1565
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1557
1566
  nk_size_t row_start, nk_size_t row_count);
1558
1567
  /** @copydoc nk_dots_packed_size_e5m2 */
@@ -1564,11 +1573,65 @@ NK_PUBLIC void nk_dots_pack_e5m2_neonfhm(nk_e5m2_t const *b, nk_size_t width, nk
1564
1573
  NK_PUBLIC void nk_dots_packed_e5m2_neonfhm(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1565
1574
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1566
1575
  /** @copydoc nk_dots_symmetric_e5m2 */
1567
- NK_PUBLIC void nk_dots_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1576
+ NK_PUBLIC void nk_dots_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1568
1577
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1569
1578
  nk_size_t row_start, nk_size_t row_count);
1570
1579
  #endif // NK_TARGET_NEONFHM
1571
1580
 
1581
+ /* ARM NEON with FP8 (ARMv9.2-A FP8).
1582
+ * Uses native FP8 dot-product instructions for E4M3/E5M2/E2M3/E3M2 operations.
1583
+ */
1584
+ #if NK_TARGET_NEONFP8
1585
+ /** @copydoc nk_dots_packed_size_e4m3 */
1586
+ NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3_neonfp8(nk_size_t width, nk_size_t depth);
1587
+ /** @copydoc nk_dots_pack_e4m3 */
1588
+ NK_PUBLIC void nk_dots_pack_e4m3_neonfp8(nk_e4m3_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1589
+ void *b_packed);
1590
+ /** @copydoc nk_dots_packed_e4m3 */
1591
+ NK_PUBLIC void nk_dots_packed_e4m3_neonfp8(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1592
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1593
+ /** @copydoc nk_dots_symmetric_e4m3 */
1594
+ NK_PUBLIC void nk_dots_symmetric_e4m3_neonfp8(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1595
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1596
+ nk_size_t row_start, nk_size_t row_count);
1597
+ /** @copydoc nk_dots_packed_size_e5m2 */
1598
+ NK_PUBLIC nk_size_t nk_dots_packed_size_e5m2_neonfp8(nk_size_t width, nk_size_t depth);
1599
+ /** @copydoc nk_dots_pack_e5m2 */
1600
+ NK_PUBLIC void nk_dots_pack_e5m2_neonfp8(nk_e5m2_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1601
+ void *b_packed);
1602
+ /** @copydoc nk_dots_packed_e5m2 */
1603
+ NK_PUBLIC void nk_dots_packed_e5m2_neonfp8(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1604
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1605
+ /** @copydoc nk_dots_symmetric_e5m2 */
1606
+ NK_PUBLIC void nk_dots_symmetric_e5m2_neonfp8(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1607
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1608
+ nk_size_t row_start, nk_size_t row_count);
1609
+ /** @copydoc nk_dots_packed_size_e2m3 */
1610
+ NK_PUBLIC nk_size_t nk_dots_packed_size_e2m3_neonfp8(nk_size_t width, nk_size_t depth);
1611
+ /** @copydoc nk_dots_pack_e2m3 */
1612
+ NK_PUBLIC void nk_dots_pack_e2m3_neonfp8(nk_e2m3_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1613
+ void *b_packed);
1614
+ /** @copydoc nk_dots_packed_e2m3 */
1615
+ NK_PUBLIC void nk_dots_packed_e2m3_neonfp8(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1616
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1617
+ /** @copydoc nk_dots_symmetric_e2m3 */
1618
+ NK_PUBLIC void nk_dots_symmetric_e2m3_neonfp8(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1619
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1620
+ nk_size_t row_start, nk_size_t row_count);
1621
+ /** @copydoc nk_dots_packed_size_e3m2 */
1622
+ NK_PUBLIC nk_size_t nk_dots_packed_size_e3m2_neonfp8(nk_size_t width, nk_size_t depth);
1623
+ /** @copydoc nk_dots_pack_e3m2 */
1624
+ NK_PUBLIC void nk_dots_pack_e3m2_neonfp8(nk_e3m2_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1625
+ void *b_packed);
1626
+ /** @copydoc nk_dots_packed_e3m2 */
1627
+ NK_PUBLIC void nk_dots_packed_e3m2_neonfp8(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1628
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1629
+ /** @copydoc nk_dots_symmetric_e3m2 */
1630
+ NK_PUBLIC void nk_dots_symmetric_e3m2_neonfp8(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1631
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1632
+ nk_size_t row_start, nk_size_t row_count);
1633
+ #endif // NK_TARGET_NEONFP8
1634
+
1572
1635
  #if NK_TARGET_RVV
1573
1636
  /** @copydoc nk_dots_packed_size_e2m3 */
1574
1637
  NK_PUBLIC nk_size_t nk_dots_packed_size_e2m3_rvv(nk_size_t width, nk_size_t depth);
@@ -1579,7 +1642,7 @@ NK_PUBLIC void nk_dots_pack_e2m3_rvv(nk_e2m3_t const *b, nk_size_t width, nk_siz
1579
1642
  NK_PUBLIC void nk_dots_packed_e2m3_rvv(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1580
1643
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1581
1644
  /** @copydoc nk_dots_symmetric_e2m3 */
1582
- NK_PUBLIC void nk_dots_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1645
+ NK_PUBLIC void nk_dots_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1583
1646
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1584
1647
  nk_size_t row_start, nk_size_t row_count);
1585
1648
  /** @copydoc nk_dots_packed_size_e3m2 */
@@ -1591,7 +1654,7 @@ NK_PUBLIC void nk_dots_pack_e3m2_rvv(nk_e3m2_t const *b, nk_size_t width, nk_siz
1591
1654
  NK_PUBLIC void nk_dots_packed_e3m2_rvv(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1592
1655
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1593
1656
  /** @copydoc nk_dots_symmetric_e3m2 */
1594
- NK_PUBLIC void nk_dots_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1657
+ NK_PUBLIC void nk_dots_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1595
1658
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1596
1659
  nk_size_t row_start, nk_size_t row_count);
1597
1660
  /** @copydoc nk_dots_packed_size_f32 */
@@ -1603,7 +1666,7 @@ NK_PUBLIC void nk_dots_pack_f32_rvv(nk_f32_t const *b, nk_size_t width, nk_size_
1603
1666
  NK_PUBLIC void nk_dots_packed_f32_rvv(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1604
1667
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1605
1668
  /** @copydoc nk_dots_symmetric_f32 */
1606
- NK_PUBLIC void nk_dots_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1669
+ NK_PUBLIC void nk_dots_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1607
1670
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1608
1671
  nk_size_t row_start, nk_size_t row_count);
1609
1672
  /** @copydoc nk_dots_packed_size_f64 */
@@ -1615,7 +1678,7 @@ NK_PUBLIC void nk_dots_pack_f64_rvv(nk_f64_t const *b, nk_size_t width, nk_size_
1615
1678
  NK_PUBLIC void nk_dots_packed_f64_rvv(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1616
1679
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1617
1680
  /** @copydoc nk_dots_symmetric_f64 */
1618
- NK_PUBLIC void nk_dots_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1681
+ NK_PUBLIC void nk_dots_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1619
1682
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1620
1683
  nk_size_t row_start, nk_size_t row_count);
1621
1684
  /** @copydoc nk_dots_packed_size_bf16 */
@@ -1627,7 +1690,7 @@ NK_PUBLIC void nk_dots_pack_bf16_rvv(nk_bf16_t const *b, nk_size_t width, nk_siz
1627
1690
  NK_PUBLIC void nk_dots_packed_bf16_rvv(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1628
1691
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1629
1692
  /** @copydoc nk_dots_symmetric_bf16 */
1630
- NK_PUBLIC void nk_dots_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1693
+ NK_PUBLIC void nk_dots_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1631
1694
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1632
1695
  nk_size_t row_start, nk_size_t row_count);
1633
1696
  /** @copydoc nk_dots_packed_size_f16 */
@@ -1639,7 +1702,7 @@ NK_PUBLIC void nk_dots_pack_f16_rvv(nk_f16_t const *b, nk_size_t width, nk_size_
1639
1702
  NK_PUBLIC void nk_dots_packed_f16_rvv(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1640
1703
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1641
1704
  /** @copydoc nk_dots_symmetric_f16 */
1642
- NK_PUBLIC void nk_dots_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1705
+ NK_PUBLIC void nk_dots_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1643
1706
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1644
1707
  nk_size_t row_start, nk_size_t row_count);
1645
1708
  /** @copydoc nk_dots_packed_size_i8 */
@@ -1651,9 +1714,9 @@ NK_PUBLIC void nk_dots_pack_i8_rvv(nk_i8_t const *b, nk_size_t width, nk_size_t
1651
1714
  NK_PUBLIC void nk_dots_packed_i8_rvv(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1652
1715
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1653
1716
  /** @copydoc nk_dots_symmetric_i8 */
1654
- NK_PUBLIC void nk_dots_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
1655
- nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
1656
- nk_size_t row_count);
1717
+ NK_PUBLIC void nk_dots_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1718
+ nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1719
+ nk_size_t row_start, nk_size_t row_count);
1657
1720
  /** @copydoc nk_dots_packed_size_u8 */
1658
1721
  NK_PUBLIC nk_size_t nk_dots_packed_size_u8_rvv(nk_size_t width, nk_size_t depth);
1659
1722
  /** @copydoc nk_dots_pack_u8 */
@@ -1663,9 +1726,9 @@ NK_PUBLIC void nk_dots_pack_u8_rvv(nk_u8_t const *b, nk_size_t width, nk_size_t
1663
1726
  NK_PUBLIC void nk_dots_packed_u8_rvv(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1664
1727
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1665
1728
  /** @copydoc nk_dots_symmetric_u8 */
1666
- NK_PUBLIC void nk_dots_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
1667
- nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
1668
- nk_size_t row_count);
1729
+ NK_PUBLIC void nk_dots_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1730
+ nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1731
+ nk_size_t row_start, nk_size_t row_count);
1669
1732
  /** @copydoc nk_dots_packed_size_e4m3 */
1670
1733
  NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3_rvv(nk_size_t width, nk_size_t depth);
1671
1734
  /** @copydoc nk_dots_pack_e4m3 */
@@ -1675,7 +1738,7 @@ NK_PUBLIC void nk_dots_pack_e4m3_rvv(nk_e4m3_t const *b, nk_size_t width, nk_siz
1675
1738
  NK_PUBLIC void nk_dots_packed_e4m3_rvv(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1676
1739
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1677
1740
  /** @copydoc nk_dots_symmetric_e4m3 */
1678
- NK_PUBLIC void nk_dots_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1741
+ NK_PUBLIC void nk_dots_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1679
1742
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1680
1743
  nk_size_t row_start, nk_size_t row_count);
1681
1744
  /** @copydoc nk_dots_packed_size_e5m2 */
@@ -1687,11 +1750,101 @@ NK_PUBLIC void nk_dots_pack_e5m2_rvv(nk_e5m2_t const *b, nk_size_t width, nk_siz
1687
1750
  NK_PUBLIC void nk_dots_packed_e5m2_rvv(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1688
1751
  nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1689
1752
  /** @copydoc nk_dots_symmetric_e5m2 */
1690
- NK_PUBLIC void nk_dots_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1753
+ NK_PUBLIC void nk_dots_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1691
1754
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1692
1755
  nk_size_t row_start, nk_size_t row_count);
1693
1756
  #endif // NK_TARGET_RVV
1694
1757
 
1758
+ /* Loongson LASX backends using 256-bit SIMD (LoongArch).
1759
+ */
1760
+ #if NK_TARGET_LOONGSONASX
1761
+ /** @copydoc nk_dots_packed_size_f32 */
1762
+ NK_PUBLIC nk_size_t nk_dots_packed_size_f32_loongsonasx(nk_size_t width, nk_size_t depth);
1763
+ /** @copydoc nk_dots_pack_f32 */
1764
+ NK_PUBLIC void nk_dots_pack_f32_loongsonasx(nk_f32_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1765
+ void *b_packed);
1766
+ /** @copydoc nk_dots_packed_f32 */
1767
+ NK_PUBLIC void nk_dots_packed_f32_loongsonasx(nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1768
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1769
+ /** @copydoc nk_dots_symmetric_f32 */
1770
+ NK_PUBLIC void nk_dots_symmetric_f32_loongsonasx(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1771
+ nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1772
+ nk_size_t row_start, nk_size_t row_count);
1773
+ /** @copydoc nk_dots_packed_size_f64 */
1774
+ NK_PUBLIC nk_size_t nk_dots_packed_size_f64_loongsonasx(nk_size_t width, nk_size_t depth);
1775
+ /** @copydoc nk_dots_pack_f64 */
1776
+ NK_PUBLIC void nk_dots_pack_f64_loongsonasx(nk_f64_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1777
+ void *b_packed);
1778
+ /** @copydoc nk_dots_packed_f64 */
1779
+ NK_PUBLIC void nk_dots_packed_f64_loongsonasx(nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t height,
1780
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1781
+ /** @copydoc nk_dots_symmetric_f64 */
1782
+ NK_PUBLIC void nk_dots_symmetric_f64_loongsonasx(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1783
+ nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1784
+ nk_size_t row_start, nk_size_t row_count);
1785
+ /** @copydoc nk_dots_packed_size_f16 */
1786
+ NK_PUBLIC nk_size_t nk_dots_packed_size_f16_loongsonasx(nk_size_t width, nk_size_t depth);
1787
+ /** @copydoc nk_dots_pack_f16 */
1788
+ NK_PUBLIC void nk_dots_pack_f16_loongsonasx(nk_f16_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1789
+ void *b_packed);
1790
+ /** @copydoc nk_dots_packed_f16 */
1791
+ NK_PUBLIC void nk_dots_packed_f16_loongsonasx(nk_f16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1792
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1793
+ /** @copydoc nk_dots_symmetric_f16 */
1794
+ NK_PUBLIC void nk_dots_symmetric_f16_loongsonasx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1795
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1796
+ nk_size_t row_start, nk_size_t row_count);
1797
+ /** @copydoc nk_dots_packed_size_bf16 */
1798
+ NK_PUBLIC nk_size_t nk_dots_packed_size_bf16_loongsonasx(nk_size_t width, nk_size_t depth);
1799
+ /** @copydoc nk_dots_pack_bf16 */
1800
+ NK_PUBLIC void nk_dots_pack_bf16_loongsonasx(nk_bf16_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1801
+ void *b_packed);
1802
+ /** @copydoc nk_dots_packed_bf16 */
1803
+ NK_PUBLIC void nk_dots_packed_bf16_loongsonasx(nk_bf16_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t height,
1804
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride,
1805
+ nk_size_t c_stride);
1806
+ /** @copydoc nk_dots_symmetric_bf16 */
1807
+ NK_PUBLIC void nk_dots_symmetric_bf16_loongsonasx(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1808
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1809
+ nk_size_t row_start, nk_size_t row_count);
1810
+ /** @copydoc nk_dots_packed_size_i8 */
1811
+ NK_PUBLIC nk_size_t nk_dots_packed_size_i8_loongsonasx(nk_size_t width, nk_size_t depth);
1812
+ /** @copydoc nk_dots_pack_i8 */
1813
+ NK_PUBLIC void nk_dots_pack_i8_loongsonasx(nk_i8_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1814
+ void *b_packed);
1815
+ /** @copydoc nk_dots_packed_i8 */
1816
+ NK_PUBLIC void nk_dots_packed_i8_loongsonasx(nk_i8_t const *a, void const *b_packed, nk_i32_t *c, nk_size_t height,
1817
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1818
+ /** @copydoc nk_dots_symmetric_i8 */
1819
+ NK_PUBLIC void nk_dots_symmetric_i8_loongsonasx(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1820
+ nk_size_t stride, nk_i32_t *result, nk_size_t result_stride,
1821
+ nk_size_t row_start, nk_size_t row_count);
1822
+ /** @copydoc nk_dots_packed_size_u8 */
1823
+ NK_PUBLIC nk_size_t nk_dots_packed_size_u8_loongsonasx(nk_size_t width, nk_size_t depth);
1824
+ /** @copydoc nk_dots_pack_u8 */
1825
+ NK_PUBLIC void nk_dots_pack_u8_loongsonasx(nk_u8_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1826
+ void *b_packed);
1827
+ /** @copydoc nk_dots_packed_u8 */
1828
+ NK_PUBLIC void nk_dots_packed_u8_loongsonasx(nk_u8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1829
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1830
+ /** @copydoc nk_dots_symmetric_u8 */
1831
+ NK_PUBLIC void nk_dots_symmetric_u8_loongsonasx(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1832
+ nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1833
+ nk_size_t row_start, nk_size_t row_count);
1834
+ /** @copydoc nk_dots_packed_size_u1 */
1835
+ NK_PUBLIC nk_size_t nk_dots_packed_size_u1_loongsonasx(nk_size_t width, nk_size_t depth);
1836
+ /** @copydoc nk_dots_pack_u1 */
1837
+ NK_PUBLIC void nk_dots_pack_u1_loongsonasx(nk_u1x8_t const *b, nk_size_t width, nk_size_t depth, nk_size_t b_stride,
1838
+ void *b_packed);
1839
+ /** @copydoc nk_dots_packed_u1 */
1840
+ NK_PUBLIC void nk_dots_packed_u1_loongsonasx(nk_u1x8_t const *a, void const *b_packed, nk_u32_t *c, nk_size_t height,
1841
+ nk_size_t width, nk_size_t depth, nk_size_t a_stride, nk_size_t c_stride);
1842
+ /** @copydoc nk_dots_symmetric_u1 */
1843
+ NK_PUBLIC void nk_dots_symmetric_u1_loongsonasx(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1844
+ nk_size_t stride, nk_u32_t *result, nk_size_t result_stride,
1845
+ nk_size_t row_start, nk_size_t row_count);
1846
+ #endif // NK_TARGET_LOONGSONASX
1847
+
1695
1848
  #if defined(__cplusplus)
1696
1849
  } // extern "C"
1697
1850
  #endif
@@ -1703,17 +1856,20 @@ NK_PUBLIC void nk_dots_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t n_
1703
1856
  #include "numkong/dots/alder.h"
1704
1857
  #include "numkong/dots/sierra.h"
1705
1858
  #include "numkong/dots/genoa.h"
1859
+ #include "numkong/dots/diamond.h"
1706
1860
  #include "numkong/dots/sapphireamx.h"
1707
1861
  #include "numkong/dots/neon.h"
1708
1862
  #include "numkong/dots/neonsdot.h"
1709
- #include "numkong/dots/neonhalf.h"
1710
1863
  #include "numkong/dots/neonfhm.h"
1864
+ #include "numkong/dots/neonfp8.h"
1711
1865
  #include "numkong/dots/neonbfdot.h"
1712
1866
  #include "numkong/dots/sme.h"
1713
1867
  #include "numkong/dots/smef64.h"
1714
1868
  #include "numkong/dots/smebi32.h"
1715
1869
  #include "numkong/dots/rvv.h"
1870
+ #include "numkong/dots/powervsx.h"
1716
1871
  #include "numkong/dots/v128relaxed.h"
1872
+ #include "numkong/dots/loongsonasx.h"
1717
1873
 
1718
1874
  #if defined(__cplusplus)
1719
1875
  extern "C" {
@@ -1730,6 +1886,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_f32(nk_size_t width, nk_size_t depth) {
1730
1886
  return nk_dots_packed_size_f32_haswell(width, depth);
1731
1887
  #elif NK_TARGET_NEON
1732
1888
  return nk_dots_packed_size_f32_neon(width, depth);
1889
+ #elif NK_TARGET_POWERVSX
1890
+ return nk_dots_packed_size_f32_powervsx(width, depth);
1733
1891
  #elif NK_TARGET_RVV
1734
1892
  return nk_dots_packed_size_f32_rvv(width, depth);
1735
1893
  #elif NK_TARGET_V128RELAXED
@@ -1749,6 +1907,8 @@ NK_PUBLIC void nk_dots_pack_f32(nk_f32_t const *b, nk_size_t width, nk_size_t de
1749
1907
  nk_dots_pack_f32_haswell(b, width, depth, b_stride, b_packed);
1750
1908
  #elif NK_TARGET_NEON
1751
1909
  nk_dots_pack_f32_neon(b, width, depth, b_stride, b_packed);
1910
+ #elif NK_TARGET_POWERVSX
1911
+ nk_dots_pack_f32_powervsx(b, width, depth, b_stride, b_packed);
1752
1912
  #elif NK_TARGET_RVV
1753
1913
  nk_dots_pack_f32_rvv(b, width, depth, b_stride, b_packed);
1754
1914
  #elif NK_TARGET_V128RELAXED
@@ -1768,6 +1928,8 @@ NK_PUBLIC void nk_dots_packed_f32(nk_f32_t const *a, void const *b_packed, nk_f6
1768
1928
  nk_dots_packed_f32_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
1769
1929
  #elif NK_TARGET_NEON
1770
1930
  nk_dots_packed_f32_neon(a, b_packed, c, height, width, depth, a_stride, c_stride);
1931
+ #elif NK_TARGET_POWERVSX
1932
+ nk_dots_packed_f32_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
1771
1933
  #elif NK_TARGET_RVV
1772
1934
  nk_dots_packed_f32_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
1773
1935
  #elif NK_TARGET_V128RELAXED
@@ -1786,6 +1948,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_f64(nk_size_t width, nk_size_t depth) {
1786
1948
  return nk_dots_packed_size_f64_haswell(width, depth);
1787
1949
  #elif NK_TARGET_NEON
1788
1950
  return nk_dots_packed_size_f64_neon(width, depth);
1951
+ #elif NK_TARGET_POWERVSX
1952
+ return nk_dots_packed_size_f64_powervsx(width, depth);
1789
1953
  #elif NK_TARGET_RVV
1790
1954
  return nk_dots_packed_size_f64_rvv(width, depth);
1791
1955
  #elif NK_TARGET_V128RELAXED
@@ -1805,6 +1969,8 @@ NK_PUBLIC void nk_dots_pack_f64(nk_f64_t const *b, nk_size_t width, nk_size_t de
1805
1969
  nk_dots_pack_f64_haswell(b, width, depth, b_stride, b_packed);
1806
1970
  #elif NK_TARGET_NEON
1807
1971
  nk_dots_pack_f64_neon(b, width, depth, b_stride, b_packed);
1972
+ #elif NK_TARGET_POWERVSX
1973
+ nk_dots_pack_f64_powervsx(b, width, depth, b_stride, b_packed);
1808
1974
  #elif NK_TARGET_RVV
1809
1975
  nk_dots_pack_f64_rvv(b, width, depth, b_stride, b_packed);
1810
1976
  #elif NK_TARGET_V128RELAXED
@@ -1824,6 +1990,8 @@ NK_PUBLIC void nk_dots_packed_f64(nk_f64_t const *a, void const *b_packed, nk_f6
1824
1990
  nk_dots_packed_f64_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
1825
1991
  #elif NK_TARGET_NEON
1826
1992
  nk_dots_packed_f64_neon(a, b_packed, c, height, width, depth, a_stride, c_stride);
1993
+ #elif NK_TARGET_POWERVSX
1994
+ nk_dots_packed_f64_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
1827
1995
  #elif NK_TARGET_RVV
1828
1996
  nk_dots_packed_f64_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
1829
1997
  #elif NK_TARGET_V128RELAXED
@@ -1838,14 +2006,14 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_f16(nk_size_t width, nk_size_t depth) {
1838
2006
  return nk_dots_packed_size_f16_sme(width, depth);
1839
2007
  #elif NK_TARGET_NEONFHM
1840
2008
  return nk_dots_packed_size_f16_neonfhm(width, depth);
1841
- #elif NK_TARGET_NEONHALF
1842
- return nk_dots_packed_size_f16_neonhalf(width, depth);
1843
2009
  #elif NK_TARGET_NEON
1844
2010
  return nk_dots_packed_size_f16_neon(width, depth);
1845
2011
  #elif NK_TARGET_SKYLAKE
1846
2012
  return nk_dots_packed_size_f16_skylake(width, depth);
1847
2013
  #elif NK_TARGET_HASWELL
1848
2014
  return nk_dots_packed_size_f16_haswell(width, depth);
2015
+ #elif NK_TARGET_POWERVSX
2016
+ return nk_dots_packed_size_f16_powervsx(width, depth);
1849
2017
  #elif NK_TARGET_RVV
1850
2018
  return nk_dots_packed_size_f16_rvv(width, depth);
1851
2019
  #else
@@ -1859,14 +2027,14 @@ NK_PUBLIC void nk_dots_pack_f16(nk_f16_t const *b, nk_size_t width, nk_size_t de
1859
2027
  nk_dots_pack_f16_sme(b, width, depth, b_stride, b_packed);
1860
2028
  #elif NK_TARGET_NEONFHM
1861
2029
  nk_dots_pack_f16_neonfhm(b, width, depth, b_stride, b_packed);
1862
- #elif NK_TARGET_NEONHALF
1863
- nk_dots_pack_f16_neonhalf(b, width, depth, b_stride, b_packed);
1864
2030
  #elif NK_TARGET_NEON
1865
2031
  nk_dots_pack_f16_neon(b, width, depth, b_stride, b_packed);
1866
2032
  #elif NK_TARGET_SKYLAKE
1867
2033
  nk_dots_pack_f16_skylake(b, width, depth, b_stride, b_packed);
1868
2034
  #elif NK_TARGET_HASWELL
1869
2035
  nk_dots_pack_f16_haswell(b, width, depth, b_stride, b_packed);
2036
+ #elif NK_TARGET_POWERVSX
2037
+ nk_dots_pack_f16_powervsx(b, width, depth, b_stride, b_packed);
1870
2038
  #elif NK_TARGET_RVV
1871
2039
  nk_dots_pack_f16_rvv(b, width, depth, b_stride, b_packed);
1872
2040
  #else
@@ -1880,14 +2048,14 @@ NK_PUBLIC void nk_dots_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f3
1880
2048
  nk_dots_packed_f16_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
1881
2049
  #elif NK_TARGET_NEONFHM
1882
2050
  nk_dots_packed_f16_neonfhm(a, b_packed, c, height, width, depth, a_stride, c_stride);
1883
- #elif NK_TARGET_NEONHALF
1884
- nk_dots_packed_f16_neonhalf(a, b_packed, c, height, width, depth, a_stride, c_stride);
1885
2051
  #elif NK_TARGET_NEON
1886
2052
  nk_dots_packed_f16_neon(a, b_packed, c, height, width, depth, a_stride, c_stride);
1887
2053
  #elif NK_TARGET_SKYLAKE
1888
2054
  nk_dots_packed_f16_skylake(a, b_packed, c, height, width, depth, a_stride, c_stride);
1889
2055
  #elif NK_TARGET_HASWELL
1890
2056
  nk_dots_packed_f16_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
2057
+ #elif NK_TARGET_POWERVSX
2058
+ nk_dots_packed_f16_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
1891
2059
  #elif NK_TARGET_RVV
1892
2060
  nk_dots_packed_f16_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
1893
2061
  #else
@@ -1908,6 +2076,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_bf16(nk_size_t width, nk_size_t depth) {
1908
2076
  return nk_dots_packed_size_bf16_skylake(width, depth);
1909
2077
  #elif NK_TARGET_HASWELL
1910
2078
  return nk_dots_packed_size_bf16_haswell(width, depth);
2079
+ #elif NK_TARGET_POWERVSX
2080
+ return nk_dots_packed_size_bf16_powervsx(width, depth);
1911
2081
  #elif NK_TARGET_RVV
1912
2082
  return nk_dots_packed_size_bf16_rvv(width, depth);
1913
2083
  #elif NK_TARGET_V128RELAXED
@@ -1931,6 +2101,8 @@ NK_PUBLIC void nk_dots_pack_bf16(nk_bf16_t const *b, nk_size_t width, nk_size_t
1931
2101
  nk_dots_pack_bf16_skylake(b, width, depth, b_stride, b_packed);
1932
2102
  #elif NK_TARGET_HASWELL
1933
2103
  nk_dots_pack_bf16_haswell(b, width, depth, b_stride, b_packed);
2104
+ #elif NK_TARGET_POWERVSX
2105
+ nk_dots_pack_bf16_powervsx(b, width, depth, b_stride, b_packed);
1934
2106
  #elif NK_TARGET_RVV
1935
2107
  nk_dots_pack_bf16_rvv(b, width, depth, b_stride, b_packed);
1936
2108
  #elif NK_TARGET_V128RELAXED
@@ -1954,6 +2126,8 @@ NK_PUBLIC void nk_dots_packed_bf16(nk_bf16_t const *a, void const *b_packed, nk_
1954
2126
  nk_dots_packed_bf16_skylake(a, b_packed, c, height, width, depth, a_stride, c_stride);
1955
2127
  #elif NK_TARGET_HASWELL
1956
2128
  nk_dots_packed_bf16_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
2129
+ #elif NK_TARGET_POWERVSX
2130
+ nk_dots_packed_bf16_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
1957
2131
  #elif NK_TARGET_RVV
1958
2132
  nk_dots_packed_bf16_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
1959
2133
  #elif NK_TARGET_V128RELAXED
@@ -1978,6 +2152,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_i8(nk_size_t width, nk_size_t depth) {
1978
2152
  return nk_dots_packed_size_i8_alder(width, depth);
1979
2153
  #elif NK_TARGET_HASWELL
1980
2154
  return nk_dots_packed_size_i8_haswell(width, depth);
2155
+ #elif NK_TARGET_POWERVSX
2156
+ return nk_dots_packed_size_i8_powervsx(width, depth);
1981
2157
  #elif NK_TARGET_RVV
1982
2158
  return nk_dots_packed_size_i8_rvv(width, depth);
1983
2159
  #elif NK_TARGET_V128RELAXED
@@ -2002,6 +2178,8 @@ NK_PUBLIC void nk_dots_pack_i8(nk_i8_t const *b, nk_size_t width, nk_size_t dept
2002
2178
  nk_dots_pack_i8_alder(b, width, depth, b_stride, b_packed);
2003
2179
  #elif NK_TARGET_HASWELL
2004
2180
  nk_dots_pack_i8_haswell(b, width, depth, b_stride, b_packed);
2181
+ #elif NK_TARGET_POWERVSX
2182
+ nk_dots_pack_i8_powervsx(b, width, depth, b_stride, b_packed);
2005
2183
  #elif NK_TARGET_RVV
2006
2184
  nk_dots_pack_i8_rvv(b, width, depth, b_stride, b_packed);
2007
2185
  #elif NK_TARGET_V128RELAXED
@@ -2027,6 +2205,8 @@ NK_PUBLIC void nk_dots_packed_i8(nk_i8_t const *a, void const *b_packed, nk_i32_
2027
2205
  nk_dots_packed_i8_alder(a, b_packed, c, height, width, depth, a_stride, c_stride);
2028
2206
  #elif NK_TARGET_HASWELL
2029
2207
  nk_dots_packed_i8_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
2208
+ #elif NK_TARGET_POWERVSX
2209
+ nk_dots_packed_i8_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
2030
2210
  #elif NK_TARGET_RVV
2031
2211
  nk_dots_packed_i8_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
2032
2212
  #elif NK_TARGET_V128RELAXED
@@ -2051,6 +2231,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_u8(nk_size_t width, nk_size_t depth) {
2051
2231
  return nk_dots_packed_size_u8_alder(width, depth);
2052
2232
  #elif NK_TARGET_HASWELL
2053
2233
  return nk_dots_packed_size_u8_haswell(width, depth);
2234
+ #elif NK_TARGET_POWERVSX
2235
+ return nk_dots_packed_size_u8_powervsx(width, depth);
2054
2236
  #elif NK_TARGET_RVV
2055
2237
  return nk_dots_packed_size_u8_rvv(width, depth);
2056
2238
  #elif NK_TARGET_V128RELAXED
@@ -2075,6 +2257,8 @@ NK_PUBLIC void nk_dots_pack_u8(nk_u8_t const *b, nk_size_t width, nk_size_t dept
2075
2257
  nk_dots_pack_u8_alder(b, width, depth, b_stride, b_packed);
2076
2258
  #elif NK_TARGET_HASWELL
2077
2259
  nk_dots_pack_u8_haswell(b, width, depth, b_stride, b_packed);
2260
+ #elif NK_TARGET_POWERVSX
2261
+ nk_dots_pack_u8_powervsx(b, width, depth, b_stride, b_packed);
2078
2262
  #elif NK_TARGET_RVV
2079
2263
  nk_dots_pack_u8_rvv(b, width, depth, b_stride, b_packed);
2080
2264
  #elif NK_TARGET_V128RELAXED
@@ -2100,6 +2284,8 @@ NK_PUBLIC void nk_dots_packed_u8(nk_u8_t const *a, void const *b_packed, nk_u32_
2100
2284
  nk_dots_packed_u8_alder(a, b_packed, c, height, width, depth, a_stride, c_stride);
2101
2285
  #elif NK_TARGET_HASWELL
2102
2286
  nk_dots_packed_u8_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
2287
+ #elif NK_TARGET_POWERVSX
2288
+ nk_dots_packed_u8_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
2103
2289
  #elif NK_TARGET_RVV
2104
2290
  nk_dots_packed_u8_rvv(a, b_packed, c, height, width, depth, a_stride, c_stride);
2105
2291
  #elif NK_TARGET_V128RELAXED
@@ -2114,8 +2300,12 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_e4m3(nk_size_t width, nk_size_t depth) {
2114
2300
  return nk_dots_packed_size_e4m3_sme(width, depth);
2115
2301
  #elif NK_TARGET_SAPPHIREAMX
2116
2302
  return nk_dots_packed_size_e4m3_sapphireamx(width, depth);
2303
+ #elif NK_TARGET_NEONFP8
2304
+ return nk_dots_packed_size_e4m3_neonfp8(width, depth);
2117
2305
  #elif NK_TARGET_NEONFHM
2118
2306
  return nk_dots_packed_size_e4m3_neonfhm(width, depth);
2307
+ #elif NK_TARGET_DIAMOND
2308
+ return nk_dots_packed_size_e4m3_diamond(width, depth);
2119
2309
  #elif NK_TARGET_GENOA
2120
2310
  return nk_dots_packed_size_e4m3_genoa(width, depth);
2121
2311
  #elif NK_TARGET_SKYLAKE
@@ -2137,8 +2327,12 @@ NK_PUBLIC void nk_dots_pack_e4m3(nk_e4m3_t const *b, nk_size_t width, nk_size_t
2137
2327
  nk_dots_pack_e4m3_sme(b, width, depth, b_stride, b_packed);
2138
2328
  #elif NK_TARGET_SAPPHIREAMX
2139
2329
  nk_dots_pack_e4m3_sapphireamx(b, width, depth, b_stride, b_packed);
2330
+ #elif NK_TARGET_NEONFP8
2331
+ nk_dots_pack_e4m3_neonfp8(b, width, depth, b_stride, b_packed);
2140
2332
  #elif NK_TARGET_NEONFHM
2141
2333
  nk_dots_pack_e4m3_neonfhm(b, width, depth, b_stride, b_packed);
2334
+ #elif NK_TARGET_DIAMOND
2335
+ nk_dots_pack_e4m3_diamond(b, width, depth, b_stride, b_packed);
2142
2336
  #elif NK_TARGET_GENOA
2143
2337
  nk_dots_pack_e4m3_genoa(b, width, depth, b_stride, b_packed);
2144
2338
  #elif NK_TARGET_SKYLAKE
@@ -2160,8 +2354,12 @@ NK_PUBLIC void nk_dots_packed_e4m3(nk_e4m3_t const *a, void const *b_packed, nk_
2160
2354
  nk_dots_packed_e4m3_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
2161
2355
  #elif NK_TARGET_SAPPHIREAMX
2162
2356
  nk_dots_packed_e4m3_sapphireamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
2357
+ #elif NK_TARGET_NEONFP8
2358
+ nk_dots_packed_e4m3_neonfp8(a, b_packed, c, height, width, depth, a_stride, c_stride);
2163
2359
  #elif NK_TARGET_NEONFHM
2164
2360
  nk_dots_packed_e4m3_neonfhm(a, b_packed, c, height, width, depth, a_stride, c_stride);
2361
+ #elif NK_TARGET_DIAMOND
2362
+ nk_dots_packed_e4m3_diamond(a, b_packed, c, height, width, depth, a_stride, c_stride);
2165
2363
  #elif NK_TARGET_GENOA
2166
2364
  nk_dots_packed_e4m3_genoa(a, b_packed, c, height, width, depth, a_stride, c_stride);
2167
2365
  #elif NK_TARGET_SKYLAKE
@@ -2182,8 +2380,12 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_e5m2(nk_size_t width, nk_size_t depth) {
2182
2380
  return nk_dots_packed_size_e5m2_sme(width, depth);
2183
2381
  #elif NK_TARGET_SAPPHIREAMX
2184
2382
  return nk_dots_packed_size_e5m2_sapphireamx(width, depth);
2383
+ #elif NK_TARGET_NEONFP8
2384
+ return nk_dots_packed_size_e5m2_neonfp8(width, depth);
2185
2385
  #elif NK_TARGET_NEONFHM
2186
2386
  return nk_dots_packed_size_e5m2_neonfhm(width, depth);
2387
+ #elif NK_TARGET_DIAMOND
2388
+ return nk_dots_packed_size_e5m2_diamond(width, depth);
2187
2389
  #elif NK_TARGET_GENOA
2188
2390
  return nk_dots_packed_size_e5m2_genoa(width, depth);
2189
2391
  #elif NK_TARGET_SKYLAKE
@@ -2205,8 +2407,12 @@ NK_PUBLIC void nk_dots_pack_e5m2(nk_e5m2_t const *b, nk_size_t width, nk_size_t
2205
2407
  nk_dots_pack_e5m2_sme(b, width, depth, b_stride, b_packed);
2206
2408
  #elif NK_TARGET_SAPPHIREAMX
2207
2409
  nk_dots_pack_e5m2_sapphireamx(b, width, depth, b_stride, b_packed);
2410
+ #elif NK_TARGET_NEONFP8
2411
+ nk_dots_pack_e5m2_neonfp8(b, width, depth, b_stride, b_packed);
2208
2412
  #elif NK_TARGET_NEONFHM
2209
2413
  nk_dots_pack_e5m2_neonfhm(b, width, depth, b_stride, b_packed);
2414
+ #elif NK_TARGET_DIAMOND
2415
+ nk_dots_pack_e5m2_diamond(b, width, depth, b_stride, b_packed);
2210
2416
  #elif NK_TARGET_GENOA
2211
2417
  nk_dots_pack_e5m2_genoa(b, width, depth, b_stride, b_packed);
2212
2418
  #elif NK_TARGET_SKYLAKE
@@ -2228,8 +2434,12 @@ NK_PUBLIC void nk_dots_packed_e5m2(nk_e5m2_t const *a, void const *b_packed, nk_
2228
2434
  nk_dots_packed_e5m2_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
2229
2435
  #elif NK_TARGET_SAPPHIREAMX
2230
2436
  nk_dots_packed_e5m2_sapphireamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
2437
+ #elif NK_TARGET_NEONFP8
2438
+ nk_dots_packed_e5m2_neonfp8(a, b_packed, c, height, width, depth, a_stride, c_stride);
2231
2439
  #elif NK_TARGET_NEONFHM
2232
2440
  nk_dots_packed_e5m2_neonfhm(a, b_packed, c, height, width, depth, a_stride, c_stride);
2441
+ #elif NK_TARGET_DIAMOND
2442
+ nk_dots_packed_e5m2_diamond(a, b_packed, c, height, width, depth, a_stride, c_stride);
2233
2443
  #elif NK_TARGET_GENOA
2234
2444
  nk_dots_packed_e5m2_genoa(a, b_packed, c, height, width, depth, a_stride, c_stride);
2235
2445
  #elif NK_TARGET_SKYLAKE
@@ -2250,6 +2460,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_e2m3(nk_size_t width, nk_size_t depth) {
2250
2460
  return nk_dots_packed_size_e2m3_sme(width, depth);
2251
2461
  #elif NK_TARGET_SAPPHIREAMX
2252
2462
  return nk_dots_packed_size_e2m3_sapphireamx(width, depth);
2463
+ #elif NK_TARGET_NEONFP8
2464
+ return nk_dots_packed_size_e2m3_neonfp8(width, depth);
2253
2465
  #elif NK_TARGET_SKYLAKE
2254
2466
  return nk_dots_packed_size_e2m3_skylake(width, depth);
2255
2467
  #elif NK_TARGET_SIERRA
@@ -2273,6 +2485,8 @@ NK_PUBLIC void nk_dots_pack_e2m3(nk_e2m3_t const *b, nk_size_t width, nk_size_t
2273
2485
  nk_dots_pack_e2m3_sme(b, width, depth, b_stride, b_packed);
2274
2486
  #elif NK_TARGET_SAPPHIREAMX
2275
2487
  nk_dots_pack_e2m3_sapphireamx(b, width, depth, b_stride, b_packed);
2488
+ #elif NK_TARGET_NEONFP8
2489
+ nk_dots_pack_e2m3_neonfp8(b, width, depth, b_stride, b_packed);
2276
2490
  #elif NK_TARGET_SKYLAKE
2277
2491
  nk_dots_pack_e2m3_skylake(b, width, depth, b_stride, b_packed);
2278
2492
  #elif NK_TARGET_SIERRA
@@ -2296,6 +2510,8 @@ NK_PUBLIC void nk_dots_packed_e2m3(nk_e2m3_t const *a, void const *b_packed, nk_
2296
2510
  nk_dots_packed_e2m3_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
2297
2511
  #elif NK_TARGET_SAPPHIREAMX
2298
2512
  nk_dots_packed_e2m3_sapphireamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
2513
+ #elif NK_TARGET_NEONFP8
2514
+ nk_dots_packed_e2m3_neonfp8(a, b_packed, c, height, width, depth, a_stride, c_stride);
2299
2515
  #elif NK_TARGET_SKYLAKE
2300
2516
  nk_dots_packed_e2m3_skylake(a, b_packed, c, height, width, depth, a_stride, c_stride);
2301
2517
  #elif NK_TARGET_SIERRA
@@ -2318,6 +2534,8 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_e3m2(nk_size_t width, nk_size_t depth) {
2318
2534
  return nk_dots_packed_size_e3m2_sme(width, depth);
2319
2535
  #elif NK_TARGET_SAPPHIREAMX
2320
2536
  return nk_dots_packed_size_e3m2_sapphireamx(width, depth);
2537
+ #elif NK_TARGET_NEONFP8
2538
+ return nk_dots_packed_size_e3m2_neonfp8(width, depth);
2321
2539
  #elif NK_TARGET_SKYLAKE
2322
2540
  return nk_dots_packed_size_e3m2_skylake(width, depth);
2323
2541
  #elif NK_TARGET_HASWELL
@@ -2335,6 +2553,8 @@ NK_PUBLIC void nk_dots_pack_e3m2(nk_e3m2_t const *b, nk_size_t width, nk_size_t
2335
2553
  nk_dots_pack_e3m2_sme(b, width, depth, b_stride, b_packed);
2336
2554
  #elif NK_TARGET_SAPPHIREAMX
2337
2555
  nk_dots_pack_e3m2_sapphireamx(b, width, depth, b_stride, b_packed);
2556
+ #elif NK_TARGET_NEONFP8
2557
+ nk_dots_pack_e3m2_neonfp8(b, width, depth, b_stride, b_packed);
2338
2558
  #elif NK_TARGET_SKYLAKE
2339
2559
  nk_dots_pack_e3m2_skylake(b, width, depth, b_stride, b_packed);
2340
2560
  #elif NK_TARGET_HASWELL
@@ -2352,6 +2572,8 @@ NK_PUBLIC void nk_dots_packed_e3m2(nk_e3m2_t const *a, void const *b_packed, nk_
2352
2572
  nk_dots_packed_e3m2_sme(a, b_packed, c, height, width, depth, a_stride, c_stride);
2353
2573
  #elif NK_TARGET_SAPPHIREAMX
2354
2574
  nk_dots_packed_e3m2_sapphireamx(a, b_packed, c, height, width, depth, a_stride, c_stride);
2575
+ #elif NK_TARGET_NEONFP8
2576
+ nk_dots_packed_e3m2_neonfp8(a, b_packed, c, height, width, depth, a_stride, c_stride);
2355
2577
  #elif NK_TARGET_SKYLAKE
2356
2578
  nk_dots_packed_e3m2_skylake(a, b_packed, c, height, width, depth, a_stride, c_stride);
2357
2579
  #elif NK_TARGET_HASWELL
@@ -2422,6 +2644,10 @@ NK_PUBLIC nk_size_t nk_dots_packed_size_u1(nk_size_t width, nk_size_t depth) {
2422
2644
  return nk_dots_packed_size_u1_haswell(width, depth);
2423
2645
  #elif NK_TARGET_NEON
2424
2646
  return nk_dots_packed_size_u1_neon(width, depth);
2647
+ #elif NK_TARGET_POWERVSX
2648
+ return nk_dots_packed_size_u1_powervsx(width, depth);
2649
+ #elif NK_TARGET_LOONGSONASX
2650
+ return nk_dots_packed_size_u1_loongsonasx(width, depth);
2425
2651
  #elif NK_TARGET_V128RELAXED
2426
2652
  return nk_dots_packed_size_u1_v128relaxed(width, depth);
2427
2653
  #else
@@ -2439,6 +2665,10 @@ NK_PUBLIC void nk_dots_pack_u1(nk_u1x8_t const *b, nk_size_t width, nk_size_t de
2439
2665
  nk_dots_pack_u1_haswell(b, width, depth, b_stride, b_packed);
2440
2666
  #elif NK_TARGET_NEON
2441
2667
  nk_dots_pack_u1_neon(b, width, depth, b_stride, b_packed);
2668
+ #elif NK_TARGET_POWERVSX
2669
+ nk_dots_pack_u1_powervsx(b, width, depth, b_stride, b_packed);
2670
+ #elif NK_TARGET_LOONGSONASX
2671
+ nk_dots_pack_u1_loongsonasx(b, width, depth, b_stride, b_packed);
2442
2672
  #elif NK_TARGET_V128RELAXED
2443
2673
  nk_dots_pack_u1_v128relaxed(b, width, depth, b_stride, b_packed);
2444
2674
  #else
@@ -2456,6 +2686,10 @@ NK_PUBLIC void nk_dots_packed_u1(nk_u1x8_t const *a, void const *b_packed, nk_u3
2456
2686
  nk_dots_packed_u1_haswell(a, b_packed, c, height, width, depth, a_stride, c_stride);
2457
2687
  #elif NK_TARGET_NEON
2458
2688
  nk_dots_packed_u1_neon(a, b_packed, c, height, width, depth, a_stride, c_stride);
2689
+ #elif NK_TARGET_POWERVSX
2690
+ nk_dots_packed_u1_powervsx(a, b_packed, c, height, width, depth, a_stride, c_stride);
2691
+ #elif NK_TARGET_LOONGSONASX
2692
+ nk_dots_packed_u1_loongsonasx(a, b_packed, c, height, width, depth, a_stride, c_stride);
2459
2693
  #elif NK_TARGET_V128RELAXED
2460
2694
  nk_dots_packed_u1_v128relaxed(a, b_packed, c, height, width, depth, a_stride, c_stride);
2461
2695
  #else
@@ -2513,285 +2747,331 @@ NK_PUBLIC void nk_dots_packed_i4(nk_i4x2_t const *a, void const *b_packed, nk_i3
2513
2747
  #endif
2514
2748
  }
2515
2749
 
2516
- NK_PUBLIC void nk_dots_symmetric_f16(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2517
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2750
+ NK_PUBLIC void nk_dots_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2751
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2518
2752
  nk_size_t row_count) {
2519
2753
  #if NK_TARGET_SME
2520
- nk_dots_symmetric_f16_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2521
- #elif NK_TARGET_NEONHALF
2522
- nk_dots_symmetric_f16_neonhalf(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2754
+ nk_dots_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2523
2755
  #elif NK_TARGET_NEON
2524
- nk_dots_symmetric_f16_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2756
+ nk_dots_symmetric_f16_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2525
2757
  #elif NK_TARGET_NEONFHM
2526
- nk_dots_symmetric_f16_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2758
+ nk_dots_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2527
2759
  #elif NK_TARGET_SKYLAKE
2528
- nk_dots_symmetric_f16_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2760
+ nk_dots_symmetric_f16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2529
2761
  #elif NK_TARGET_HASWELL
2530
- nk_dots_symmetric_f16_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2762
+ nk_dots_symmetric_f16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2763
+ #elif NK_TARGET_POWERVSX
2764
+ nk_dots_symmetric_f16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2531
2765
  #elif NK_TARGET_RVV
2532
- nk_dots_symmetric_f16_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2766
+ nk_dots_symmetric_f16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2533
2767
  #else
2534
- nk_dots_symmetric_f16_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2768
+ nk_dots_symmetric_f16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2535
2769
  #endif
2536
2770
  }
2537
2771
 
2538
- NK_PUBLIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2539
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2772
+ NK_PUBLIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2773
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2540
2774
  nk_size_t row_count) {
2541
2775
  #if NK_TARGET_SME
2542
- nk_dots_symmetric_bf16_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2776
+ nk_dots_symmetric_bf16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2543
2777
  #elif NK_TARGET_SAPPHIREAMX
2544
- nk_dots_symmetric_bf16_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2778
+ nk_dots_symmetric_bf16_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2779
+ row_count);
2545
2780
  #elif NK_TARGET_NEONBFDOT
2546
- nk_dots_symmetric_bf16_neonbfdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2781
+ nk_dots_symmetric_bf16_neonbfdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2782
+ row_count);
2547
2783
  #elif NK_TARGET_GENOA
2548
- nk_dots_symmetric_bf16_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2784
+ nk_dots_symmetric_bf16_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2549
2785
  #elif NK_TARGET_SKYLAKE
2550
- nk_dots_symmetric_bf16_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2786
+ nk_dots_symmetric_bf16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2551
2787
  #elif NK_TARGET_HASWELL
2552
- nk_dots_symmetric_bf16_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2788
+ nk_dots_symmetric_bf16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2789
+ #elif NK_TARGET_POWERVSX
2790
+ nk_dots_symmetric_bf16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2553
2791
  #elif NK_TARGET_RVV
2554
- nk_dots_symmetric_bf16_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2792
+ nk_dots_symmetric_bf16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2555
2793
  #elif NK_TARGET_V128RELAXED
2556
- nk_dots_symmetric_bf16_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2794
+ nk_dots_symmetric_bf16_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2795
+ row_count);
2557
2796
  #else
2558
- nk_dots_symmetric_bf16_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2797
+ nk_dots_symmetric_bf16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2559
2798
  #endif
2560
2799
  }
2561
2800
 
2562
- NK_PUBLIC void nk_dots_symmetric_i8(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2801
+ NK_PUBLIC void nk_dots_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride,
2563
2802
  nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
2564
2803
  nk_size_t row_count) {
2565
2804
  #if NK_TARGET_SME
2566
- nk_dots_symmetric_i8_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2805
+ nk_dots_symmetric_i8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2567
2806
  #elif NK_TARGET_SAPPHIREAMX
2568
- nk_dots_symmetric_i8_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2807
+ nk_dots_symmetric_i8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2808
+ row_count);
2569
2809
  #elif NK_TARGET_NEONSDOT
2570
- nk_dots_symmetric_i8_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2810
+ nk_dots_symmetric_i8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2571
2811
  #elif NK_TARGET_ICELAKE
2572
- nk_dots_symmetric_i8_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2812
+ nk_dots_symmetric_i8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2573
2813
  #elif NK_TARGET_SIERRA
2574
- nk_dots_symmetric_i8_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2814
+ nk_dots_symmetric_i8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2575
2815
  #elif NK_TARGET_ALDER
2576
- nk_dots_symmetric_i8_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2816
+ nk_dots_symmetric_i8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2577
2817
  #elif NK_TARGET_HASWELL
2578
- nk_dots_symmetric_i8_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2818
+ nk_dots_symmetric_i8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2819
+ #elif NK_TARGET_POWERVSX
2820
+ nk_dots_symmetric_i8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2579
2821
  #elif NK_TARGET_RVV
2580
- nk_dots_symmetric_i8_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2822
+ nk_dots_symmetric_i8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2581
2823
  #elif NK_TARGET_V128RELAXED
2582
- nk_dots_symmetric_i8_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2824
+ nk_dots_symmetric_i8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2825
+ row_count);
2583
2826
  #else
2584
- nk_dots_symmetric_i8_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2827
+ nk_dots_symmetric_i8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2585
2828
  #endif
2586
2829
  }
2587
2830
 
2588
- NK_PUBLIC void nk_dots_symmetric_u8(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2831
+ NK_PUBLIC void nk_dots_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride,
2589
2832
  nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
2590
2833
  nk_size_t row_count) {
2591
2834
  #if NK_TARGET_SME
2592
- nk_dots_symmetric_u8_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2835
+ nk_dots_symmetric_u8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2593
2836
  #elif NK_TARGET_SAPPHIREAMX
2594
- nk_dots_symmetric_u8_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2837
+ nk_dots_symmetric_u8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2838
+ row_count);
2595
2839
  #elif NK_TARGET_ICELAKE
2596
- nk_dots_symmetric_u8_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2840
+ nk_dots_symmetric_u8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2597
2841
  #elif NK_TARGET_SIERRA
2598
- nk_dots_symmetric_u8_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2842
+ nk_dots_symmetric_u8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2599
2843
  #elif NK_TARGET_ALDER
2600
- nk_dots_symmetric_u8_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2844
+ nk_dots_symmetric_u8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2601
2845
  #elif NK_TARGET_NEONSDOT
2602
- nk_dots_symmetric_u8_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2846
+ nk_dots_symmetric_u8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2603
2847
  #elif NK_TARGET_HASWELL
2604
- nk_dots_symmetric_u8_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2848
+ nk_dots_symmetric_u8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2849
+ #elif NK_TARGET_POWERVSX
2850
+ nk_dots_symmetric_u8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2605
2851
  #elif NK_TARGET_RVV
2606
- nk_dots_symmetric_u8_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2852
+ nk_dots_symmetric_u8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2607
2853
  #elif NK_TARGET_V128RELAXED
2608
- nk_dots_symmetric_u8_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2854
+ nk_dots_symmetric_u8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2855
+ row_count);
2609
2856
  #else
2610
- nk_dots_symmetric_u8_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2857
+ nk_dots_symmetric_u8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2611
2858
  #endif
2612
2859
  }
2613
2860
 
2614
- NK_PUBLIC void nk_dots_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2615
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2861
+ NK_PUBLIC void nk_dots_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2862
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2616
2863
  nk_size_t row_count) {
2617
2864
  #if NK_TARGET_SME
2618
- nk_dots_symmetric_e4m3_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2865
+ nk_dots_symmetric_e4m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2866
+ #elif NK_TARGET_NEONFP8
2867
+ nk_dots_symmetric_e4m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2619
2868
  #elif NK_TARGET_NEONFHM
2620
- nk_dots_symmetric_e4m3_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2869
+ nk_dots_symmetric_e4m3_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2621
2870
  #elif NK_TARGET_SAPPHIREAMX
2622
- nk_dots_symmetric_e4m3_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2871
+ nk_dots_symmetric_e4m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2872
+ row_count);
2873
+ #elif NK_TARGET_DIAMOND
2874
+ nk_dots_symmetric_e4m3_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2623
2875
  #elif NK_TARGET_GENOA
2624
- nk_dots_symmetric_e4m3_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2876
+ nk_dots_symmetric_e4m3_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2625
2877
  #elif NK_TARGET_SKYLAKE
2626
- nk_dots_symmetric_e4m3_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2878
+ nk_dots_symmetric_e4m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2627
2879
  #elif NK_TARGET_HASWELL
2628
- nk_dots_symmetric_e4m3_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2880
+ nk_dots_symmetric_e4m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2629
2881
  #elif NK_TARGET_RVV
2630
- nk_dots_symmetric_e4m3_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2882
+ nk_dots_symmetric_e4m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2631
2883
  #elif NK_TARGET_V128RELAXED
2632
- nk_dots_symmetric_e4m3_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2884
+ nk_dots_symmetric_e4m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2885
+ row_count);
2633
2886
  #else
2634
- nk_dots_symmetric_e4m3_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2887
+ nk_dots_symmetric_e4m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2635
2888
  #endif
2636
2889
  }
2637
2890
 
2638
- NK_PUBLIC void nk_dots_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2639
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2891
+ NK_PUBLIC void nk_dots_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2892
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2640
2893
  nk_size_t row_count) {
2641
2894
  #if NK_TARGET_SME
2642
- nk_dots_symmetric_e5m2_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2895
+ nk_dots_symmetric_e5m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2896
+ #elif NK_TARGET_NEONFP8
2897
+ nk_dots_symmetric_e5m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2643
2898
  #elif NK_TARGET_NEONFHM
2644
- nk_dots_symmetric_e5m2_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2899
+ nk_dots_symmetric_e5m2_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2645
2900
  #elif NK_TARGET_SAPPHIREAMX
2646
- nk_dots_symmetric_e5m2_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2901
+ nk_dots_symmetric_e5m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2902
+ row_count);
2903
+ #elif NK_TARGET_DIAMOND
2904
+ nk_dots_symmetric_e5m2_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2647
2905
  #elif NK_TARGET_GENOA
2648
- nk_dots_symmetric_e5m2_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2906
+ nk_dots_symmetric_e5m2_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2649
2907
  #elif NK_TARGET_SKYLAKE
2650
- nk_dots_symmetric_e5m2_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2908
+ nk_dots_symmetric_e5m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2651
2909
  #elif NK_TARGET_HASWELL
2652
- nk_dots_symmetric_e5m2_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2910
+ nk_dots_symmetric_e5m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2653
2911
  #elif NK_TARGET_RVV
2654
- nk_dots_symmetric_e5m2_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2912
+ nk_dots_symmetric_e5m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2655
2913
  #elif NK_TARGET_V128RELAXED
2656
- nk_dots_symmetric_e5m2_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2914
+ nk_dots_symmetric_e5m2_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2915
+ row_count);
2657
2916
  #else
2658
- nk_dots_symmetric_e5m2_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2917
+ nk_dots_symmetric_e5m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2659
2918
  #endif
2660
2919
  }
2661
2920
 
2662
- NK_PUBLIC void nk_dots_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2663
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2921
+ NK_PUBLIC void nk_dots_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2922
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2664
2923
  nk_size_t row_count) {
2665
2924
  #if NK_TARGET_SME
2666
- nk_dots_symmetric_e2m3_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2925
+ nk_dots_symmetric_e2m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2667
2926
  #elif NK_TARGET_SAPPHIREAMX
2668
- nk_dots_symmetric_e2m3_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2927
+ nk_dots_symmetric_e2m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2928
+ row_count);
2929
+ #elif NK_TARGET_NEONFP8
2930
+ nk_dots_symmetric_e2m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2669
2931
  #elif NK_TARGET_SKYLAKE
2670
- nk_dots_symmetric_e2m3_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2932
+ nk_dots_symmetric_e2m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2671
2933
  #elif NK_TARGET_SIERRA
2672
- nk_dots_symmetric_e2m3_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2934
+ nk_dots_symmetric_e2m3_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2673
2935
  #elif NK_TARGET_ALDER
2674
- nk_dots_symmetric_e2m3_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2936
+ nk_dots_symmetric_e2m3_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2675
2937
  #elif NK_TARGET_HASWELL
2676
- nk_dots_symmetric_e2m3_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2938
+ nk_dots_symmetric_e2m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2677
2939
  #elif NK_TARGET_RVV
2678
- nk_dots_symmetric_e2m3_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2940
+ nk_dots_symmetric_e2m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2679
2941
  #elif NK_TARGET_V128RELAXED
2680
- nk_dots_symmetric_e2m3_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2942
+ nk_dots_symmetric_e2m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2943
+ row_count);
2681
2944
  #else
2682
- nk_dots_symmetric_e2m3_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2945
+ nk_dots_symmetric_e2m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2683
2946
  #endif
2684
2947
  }
2685
2948
 
2686
- NK_PUBLIC void nk_dots_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2687
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2949
+ NK_PUBLIC void nk_dots_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2950
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2688
2951
  nk_size_t row_count) {
2689
2952
  #if NK_TARGET_SME
2690
- nk_dots_symmetric_e3m2_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2953
+ nk_dots_symmetric_e3m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2691
2954
  #elif NK_TARGET_SAPPHIREAMX
2692
- nk_dots_symmetric_e3m2_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2955
+ nk_dots_symmetric_e3m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2956
+ row_count);
2957
+ #elif NK_TARGET_NEONFP8
2958
+ nk_dots_symmetric_e3m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2693
2959
  #elif NK_TARGET_SKYLAKE
2694
- nk_dots_symmetric_e3m2_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2960
+ nk_dots_symmetric_e3m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2695
2961
  #elif NK_TARGET_HASWELL
2696
- nk_dots_symmetric_e3m2_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2962
+ nk_dots_symmetric_e3m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2697
2963
  #elif NK_TARGET_RVV
2698
- nk_dots_symmetric_e3m2_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2964
+ nk_dots_symmetric_e3m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2699
2965
  #else
2700
- nk_dots_symmetric_e3m2_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2966
+ nk_dots_symmetric_e3m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2701
2967
  #endif
2702
2968
  }
2703
2969
 
2704
- NK_PUBLIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2705
- nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
2970
+ NK_PUBLIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2971
+ nk_size_t stride, nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
2706
2972
  nk_size_t row_count) {
2707
2973
  #if NK_TARGET_SME
2708
- nk_dots_symmetric_u4_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2974
+ nk_dots_symmetric_u4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2709
2975
  #elif NK_TARGET_ICELAKE
2710
- nk_dots_symmetric_u4_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2976
+ nk_dots_symmetric_u4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2711
2977
  #elif NK_TARGET_NEONSDOT
2712
- nk_dots_symmetric_u4_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2978
+ nk_dots_symmetric_u4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2713
2979
  #elif NK_TARGET_HASWELL
2714
- nk_dots_symmetric_u4_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2980
+ nk_dots_symmetric_u4_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2715
2981
  #elif NK_TARGET_V128RELAXED
2716
- nk_dots_symmetric_u4_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2982
+ nk_dots_symmetric_u4_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2983
+ row_count);
2717
2984
  #else
2718
- nk_dots_symmetric_u4_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2985
+ nk_dots_symmetric_u4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2719
2986
  #endif
2720
2987
  }
2721
2988
 
2722
- NK_PUBLIC void nk_dots_symmetric_u1(nk_u1x8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2723
- nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
2989
+ NK_PUBLIC void nk_dots_symmetric_u1(nk_u1x8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2990
+ nk_size_t stride, nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
2724
2991
  nk_size_t row_count) {
2725
2992
  #if NK_TARGET_SMEBI32
2726
- nk_dots_symmetric_u1_smebi32(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2993
+ nk_dots_symmetric_u1_smebi32(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2727
2994
  #elif NK_TARGET_ICELAKE
2728
- nk_dots_symmetric_u1_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2995
+ nk_dots_symmetric_u1_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2729
2996
  #elif NK_TARGET_HASWELL
2730
- nk_dots_symmetric_u1_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2997
+ nk_dots_symmetric_u1_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2731
2998
  #elif NK_TARGET_NEON
2732
- nk_dots_symmetric_u1_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2999
+ nk_dots_symmetric_u1_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
3000
+ #elif NK_TARGET_POWERVSX
3001
+ nk_dots_symmetric_u1_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
3002
+ #elif NK_TARGET_LOONGSONASX
3003
+ nk_dots_symmetric_u1_loongsonasx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3004
+ row_count);
2733
3005
  #elif NK_TARGET_V128RELAXED
2734
- nk_dots_symmetric_u1_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3006
+ nk_dots_symmetric_u1_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3007
+ row_count);
2735
3008
  #else
2736
- nk_dots_symmetric_u1_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3009
+ nk_dots_symmetric_u1_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2737
3010
  #endif
2738
3011
  }
2739
3012
 
2740
- NK_PUBLIC void nk_dots_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2741
- nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
3013
+ NK_PUBLIC void nk_dots_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
3014
+ nk_size_t stride, nk_i32_t *result, nk_size_t result_stride, nk_size_t row_start,
2742
3015
  nk_size_t row_count) {
2743
3016
  #if NK_TARGET_SME
2744
- nk_dots_symmetric_i4_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3017
+ nk_dots_symmetric_i4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2745
3018
  #elif NK_TARGET_ICELAKE
2746
- nk_dots_symmetric_i4_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3019
+ nk_dots_symmetric_i4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2747
3020
  #elif NK_TARGET_NEONSDOT
2748
- nk_dots_symmetric_i4_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3021
+ nk_dots_symmetric_i4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2749
3022
  #elif NK_TARGET_HASWELL
2750
- nk_dots_symmetric_i4_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3023
+ nk_dots_symmetric_i4_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2751
3024
  #elif NK_TARGET_V128RELAXED
2752
- nk_dots_symmetric_i4_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3025
+ nk_dots_symmetric_i4_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3026
+ row_count);
2753
3027
  #else
2754
- nk_dots_symmetric_i4_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3028
+ nk_dots_symmetric_i4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2755
3029
  #endif
2756
3030
  }
2757
3031
 
2758
- NK_PUBLIC void nk_dots_symmetric_f32(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2759
- nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
3032
+ NK_PUBLIC void nk_dots_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
3033
+ nk_size_t stride, nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
2760
3034
  nk_size_t row_count) {
2761
3035
  #if NK_TARGET_SMEF64
2762
- nk_dots_symmetric_f32_smef64(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3036
+ nk_dots_symmetric_f32_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2763
3037
  #elif NK_TARGET_SKYLAKE
2764
- nk_dots_symmetric_f32_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3038
+ nk_dots_symmetric_f32_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2765
3039
  #elif NK_TARGET_HASWELL
2766
- nk_dots_symmetric_f32_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3040
+ nk_dots_symmetric_f32_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2767
3041
  #elif NK_TARGET_NEON
2768
- nk_dots_symmetric_f32_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3042
+ nk_dots_symmetric_f32_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
3043
+ #elif NK_TARGET_POWERVSX
3044
+ nk_dots_symmetric_f32_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2769
3045
  #elif NK_TARGET_RVV
2770
- nk_dots_symmetric_f32_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3046
+ nk_dots_symmetric_f32_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2771
3047
  #elif NK_TARGET_V128RELAXED
2772
- nk_dots_symmetric_f32_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3048
+ nk_dots_symmetric_f32_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3049
+ row_count);
2773
3050
  #else
2774
- nk_dots_symmetric_f32_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3051
+ nk_dots_symmetric_f32_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2775
3052
  #endif
2776
3053
  }
2777
3054
 
2778
- NK_PUBLIC void nk_dots_symmetric_f64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2779
- nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
3055
+ NK_PUBLIC void nk_dots_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
3056
+ nk_size_t stride, nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start,
2780
3057
  nk_size_t row_count) {
2781
3058
  #if NK_TARGET_SMEF64
2782
- nk_dots_symmetric_f64_smef64(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3059
+ nk_dots_symmetric_f64_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2783
3060
  #elif NK_TARGET_SKYLAKE
2784
- nk_dots_symmetric_f64_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3061
+ nk_dots_symmetric_f64_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2785
3062
  #elif NK_TARGET_HASWELL
2786
- nk_dots_symmetric_f64_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3063
+ nk_dots_symmetric_f64_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2787
3064
  #elif NK_TARGET_NEON
2788
- nk_dots_symmetric_f64_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3065
+ nk_dots_symmetric_f64_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
3066
+ #elif NK_TARGET_POWERVSX
3067
+ nk_dots_symmetric_f64_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2789
3068
  #elif NK_TARGET_RVV
2790
- nk_dots_symmetric_f64_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3069
+ nk_dots_symmetric_f64_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2791
3070
  #elif NK_TARGET_V128RELAXED
2792
- nk_dots_symmetric_f64_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3071
+ nk_dots_symmetric_f64_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3072
+ row_count);
2793
3073
  #else
2794
- nk_dots_symmetric_f64_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3074
+ nk_dots_symmetric_f64_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2795
3075
  #endif
2796
3076
  }
2797
3077