numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -76,15 +76,15 @@ NK_DYNAMIC void nk_angulars_packed_f32(nk_f32_t const *a, void const *b_packed,
76
76
  /**
77
77
  * @brief Computes symmetric angular distance matrix (Gram-style) for a set of vectors.
78
78
  * @param[in] vectors Input matrix of row vectors in row-major order.
79
- * @param[in] n_vectors Number of vectors (rows) in the input matrix.
79
+ * @param[in] vectors_count Number of vectors (rows) in the input matrix.
80
80
  * @param[in] depth Dimension of each vector (columns).
81
81
  * @param[in] stride Row stride in bytes for the input matrix.
82
- * @param[out] result Output symmetric matrix (n_vectors x n_vectors).
82
+ * @param[out] result Output symmetric matrix (vectors_count x vectors_count).
83
83
  * @param[in] result_stride Row stride in bytes for the result matrix.
84
84
  * @param[in] row_start Starting row offset of results to compute (for parallelism).
85
85
  * @param[in] row_count Number of rows of results to compute (for parallelism).
86
86
  */
87
- NK_DYNAMIC void nk_angulars_symmetric_f32(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
87
+ NK_DYNAMIC void nk_angulars_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
88
88
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
89
89
  nk_size_t row_start, nk_size_t row_count);
90
90
 
@@ -106,15 +106,15 @@ NK_DYNAMIC void nk_euclideans_packed_f32(nk_f32_t const *a, void const *b_packed
106
106
  /**
107
107
  * @brief Computes symmetric euclidean distance matrix (Gram-style) for a set of vectors.
108
108
  * @param[in] vectors Input matrix of row vectors in row-major order.
109
- * @param[in] n_vectors Number of vectors (rows) in the input matrix.
109
+ * @param[in] vectors_count Number of vectors (rows) in the input matrix.
110
110
  * @param[in] depth Dimension of each vector (columns).
111
111
  * @param[in] stride Row stride in bytes for the input matrix.
112
- * @param[out] result Output symmetric matrix (n_vectors x n_vectors).
112
+ * @param[out] result Output symmetric matrix (vectors_count x vectors_count).
113
113
  * @param[in] result_stride Row stride in bytes for the result matrix.
114
114
  * @param[in] row_start Starting row offset of results to compute (for parallelism).
115
115
  * @param[in] row_count Number of rows of results to compute (for parallelism).
116
116
  */
117
- NK_DYNAMIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
117
+ NK_DYNAMIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
118
118
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
119
119
  nk_size_t row_start, nk_size_t row_count);
120
120
 
@@ -123,7 +123,7 @@ NK_DYNAMIC void nk_angulars_packed_f64(nk_f64_t const *a, void const *b_packed,
123
123
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
124
124
  nk_size_t r_stride_in_bytes);
125
125
  /** @copydoc nk_angulars_symmetric_f32 */
126
- NK_DYNAMIC void nk_angulars_symmetric_f64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
126
+ NK_DYNAMIC void nk_angulars_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
127
127
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
128
128
  nk_size_t row_start, nk_size_t row_count);
129
129
  /** @copydoc nk_euclideans_packed_f32 */
@@ -131,7 +131,7 @@ NK_DYNAMIC void nk_euclideans_packed_f64(nk_f64_t const *a, void const *b_packed
131
131
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
132
132
  nk_size_t r_stride_in_bytes);
133
133
  /** @copydoc nk_euclideans_symmetric_f32 */
134
- NK_DYNAMIC void nk_euclideans_symmetric_f64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
134
+ NK_DYNAMIC void nk_euclideans_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
135
135
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
136
136
  nk_size_t row_start, nk_size_t row_count);
137
137
 
@@ -140,7 +140,7 @@ NK_DYNAMIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed,
140
140
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
141
141
  nk_size_t r_stride_in_bytes);
142
142
  /** @copydoc nk_angulars_symmetric_f32 */
143
- NK_DYNAMIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
143
+ NK_DYNAMIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
144
144
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
145
145
  nk_size_t row_start, nk_size_t row_count);
146
146
  /** @copydoc nk_euclideans_packed_f32 */
@@ -148,7 +148,7 @@ NK_DYNAMIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed
148
148
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
149
149
  nk_size_t r_stride_in_bytes);
150
150
  /** @copydoc nk_euclideans_symmetric_f32 */
151
- NK_DYNAMIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
151
+ NK_DYNAMIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
152
152
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
153
153
  nk_size_t row_start, nk_size_t row_count);
154
154
 
@@ -157,7 +157,7 @@ NK_DYNAMIC void nk_angulars_packed_bf16(nk_bf16_t const *a, void const *b_packed
157
157
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
158
158
  nk_size_t r_stride_in_bytes);
159
159
  /** @copydoc nk_angulars_symmetric_f32 */
160
- NK_DYNAMIC void nk_angulars_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
160
+ NK_DYNAMIC void nk_angulars_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
161
161
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
162
162
  nk_size_t row_start, nk_size_t row_count);
163
163
  /** @copydoc nk_euclideans_packed_f32 */
@@ -165,7 +165,7 @@ NK_DYNAMIC void nk_euclideans_packed_bf16(nk_bf16_t const *a, void const *b_pack
165
165
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
166
166
  nk_size_t r_stride_in_bytes);
167
167
  /** @copydoc nk_euclideans_symmetric_f32 */
168
- NK_DYNAMIC void nk_euclideans_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
168
+ NK_DYNAMIC void nk_euclideans_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
169
169
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
170
170
  nk_size_t row_start, nk_size_t row_count);
171
171
 
@@ -174,7 +174,7 @@ NK_DYNAMIC void nk_angulars_packed_e4m3(nk_e4m3_t const *a, void const *b_packed
174
174
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
175
175
  nk_size_t r_stride_in_bytes);
176
176
  /** @copydoc nk_angulars_symmetric_f32 */
177
- NK_DYNAMIC void nk_angulars_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
177
+ NK_DYNAMIC void nk_angulars_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
178
178
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
179
179
  nk_size_t row_start, nk_size_t row_count);
180
180
  /** @copydoc nk_euclideans_packed_f32 */
@@ -182,7 +182,7 @@ NK_DYNAMIC void nk_euclideans_packed_e4m3(nk_e4m3_t const *a, void const *b_pack
182
182
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
183
183
  nk_size_t r_stride_in_bytes);
184
184
  /** @copydoc nk_euclideans_symmetric_f32 */
185
- NK_DYNAMIC void nk_euclideans_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
185
+ NK_DYNAMIC void nk_euclideans_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
186
186
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
187
187
  nk_size_t row_start, nk_size_t row_count);
188
188
 
@@ -191,7 +191,7 @@ NK_DYNAMIC void nk_angulars_packed_e5m2(nk_e5m2_t const *a, void const *b_packed
191
191
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
192
192
  nk_size_t r_stride_in_bytes);
193
193
  /** @copydoc nk_angulars_symmetric_f32 */
194
- NK_DYNAMIC void nk_angulars_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
194
+ NK_DYNAMIC void nk_angulars_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
195
195
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
196
196
  nk_size_t row_start, nk_size_t row_count);
197
197
  /** @copydoc nk_euclideans_packed_f32 */
@@ -199,7 +199,7 @@ NK_DYNAMIC void nk_euclideans_packed_e5m2(nk_e5m2_t const *a, void const *b_pack
199
199
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
200
200
  nk_size_t r_stride_in_bytes);
201
201
  /** @copydoc nk_euclideans_symmetric_f32 */
202
- NK_DYNAMIC void nk_euclideans_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
202
+ NK_DYNAMIC void nk_euclideans_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
203
203
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
204
204
  nk_size_t row_start, nk_size_t row_count);
205
205
 
@@ -208,7 +208,7 @@ NK_DYNAMIC void nk_angulars_packed_e2m3(nk_e2m3_t const *a, void const *b_packed
208
208
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
209
209
  nk_size_t r_stride_in_bytes);
210
210
  /** @copydoc nk_angulars_symmetric_f32 */
211
- NK_DYNAMIC void nk_angulars_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
211
+ NK_DYNAMIC void nk_angulars_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
212
212
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
213
213
  nk_size_t row_start, nk_size_t row_count);
214
214
  /** @copydoc nk_euclideans_packed_f32 */
@@ -216,7 +216,7 @@ NK_DYNAMIC void nk_euclideans_packed_e2m3(nk_e2m3_t const *a, void const *b_pack
216
216
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
217
217
  nk_size_t r_stride_in_bytes);
218
218
  /** @copydoc nk_euclideans_symmetric_f32 */
219
- NK_DYNAMIC void nk_euclideans_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
219
+ NK_DYNAMIC void nk_euclideans_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
220
220
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
221
221
  nk_size_t row_start, nk_size_t row_count);
222
222
 
@@ -225,7 +225,7 @@ NK_DYNAMIC void nk_angulars_packed_e3m2(nk_e3m2_t const *a, void const *b_packed
225
225
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
226
226
  nk_size_t r_stride_in_bytes);
227
227
  /** @copydoc nk_angulars_symmetric_f32 */
228
- NK_DYNAMIC void nk_angulars_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
228
+ NK_DYNAMIC void nk_angulars_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
229
229
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
230
230
  nk_size_t row_start, nk_size_t row_count);
231
231
  /** @copydoc nk_euclideans_packed_f32 */
@@ -233,7 +233,7 @@ NK_DYNAMIC void nk_euclideans_packed_e3m2(nk_e3m2_t const *a, void const *b_pack
233
233
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
234
234
  nk_size_t r_stride_in_bytes);
235
235
  /** @copydoc nk_euclideans_symmetric_f32 */
236
- NK_DYNAMIC void nk_euclideans_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
236
+ NK_DYNAMIC void nk_euclideans_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
237
237
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
238
238
  nk_size_t row_start, nk_size_t row_count);
239
239
 
@@ -242,15 +242,15 @@ NK_DYNAMIC void nk_angulars_packed_i8(nk_i8_t const *a, void const *b_packed, nk
242
242
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
243
243
  nk_size_t r_stride_in_bytes);
244
244
  /** @copydoc nk_angulars_symmetric_f32 */
245
- NK_DYNAMIC void nk_angulars_symmetric_i8(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
246
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
247
- nk_size_t row_count);
245
+ NK_DYNAMIC void nk_angulars_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
246
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
247
+ nk_size_t row_start, nk_size_t row_count);
248
248
  /** @copydoc nk_euclideans_packed_f32 */
249
249
  NK_DYNAMIC void nk_euclideans_packed_i8(nk_i8_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
250
250
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
251
251
  nk_size_t r_stride_in_bytes);
252
252
  /** @copydoc nk_euclideans_symmetric_f32 */
253
- NK_DYNAMIC void nk_euclideans_symmetric_i8(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
253
+ NK_DYNAMIC void nk_euclideans_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
254
254
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
255
255
  nk_size_t row_start, nk_size_t row_count);
256
256
 
@@ -259,15 +259,15 @@ NK_DYNAMIC void nk_angulars_packed_u8(nk_u8_t const *a, void const *b_packed, nk
259
259
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
260
260
  nk_size_t r_stride_in_bytes);
261
261
  /** @copydoc nk_angulars_symmetric_f32 */
262
- NK_DYNAMIC void nk_angulars_symmetric_u8(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
263
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
264
- nk_size_t row_count);
262
+ NK_DYNAMIC void nk_angulars_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
263
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
264
+ nk_size_t row_start, nk_size_t row_count);
265
265
  /** @copydoc nk_euclideans_packed_f32 */
266
266
  NK_DYNAMIC void nk_euclideans_packed_u8(nk_u8_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
267
267
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
268
268
  nk_size_t r_stride_in_bytes);
269
269
  /** @copydoc nk_euclideans_symmetric_f32 */
270
- NK_DYNAMIC void nk_euclideans_symmetric_u8(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
270
+ NK_DYNAMIC void nk_euclideans_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
271
271
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
272
272
  nk_size_t row_start, nk_size_t row_count);
273
273
 
@@ -276,7 +276,7 @@ NK_DYNAMIC void nk_angulars_packed_i4(nk_i4x2_t const *a, void const *b_packed,
276
276
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
277
277
  nk_size_t r_stride_in_bytes);
278
278
  /** @copydoc nk_angulars_symmetric_f32 */
279
- NK_DYNAMIC void nk_angulars_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
279
+ NK_DYNAMIC void nk_angulars_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
280
280
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
281
281
  nk_size_t row_start, nk_size_t row_count);
282
282
  /** @copydoc nk_euclideans_packed_f32 */
@@ -284,7 +284,7 @@ NK_DYNAMIC void nk_euclideans_packed_i4(nk_i4x2_t const *a, void const *b_packed
284
284
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
285
285
  nk_size_t r_stride_in_bytes);
286
286
  /** @copydoc nk_euclideans_symmetric_f32 */
287
- NK_DYNAMIC void nk_euclideans_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
287
+ NK_DYNAMIC void nk_euclideans_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
288
288
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
289
289
  nk_size_t row_start, nk_size_t row_count);
290
290
 
@@ -293,7 +293,7 @@ NK_DYNAMIC void nk_angulars_packed_u4(nk_u4x2_t const *a, void const *b_packed,
293
293
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
294
294
  nk_size_t r_stride_in_bytes);
295
295
  /** @copydoc nk_angulars_symmetric_f32 */
296
- NK_DYNAMIC void nk_angulars_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
296
+ NK_DYNAMIC void nk_angulars_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
297
297
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
298
298
  nk_size_t row_start, nk_size_t row_count);
299
299
  /** @copydoc nk_euclideans_packed_f32 */
@@ -301,7 +301,7 @@ NK_DYNAMIC void nk_euclideans_packed_u4(nk_u4x2_t const *a, void const *b_packed
301
301
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
302
302
  nk_size_t r_stride_in_bytes);
303
303
  /** @copydoc nk_euclideans_symmetric_f32 */
304
- NK_DYNAMIC void nk_euclideans_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
304
+ NK_DYNAMIC void nk_euclideans_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
305
305
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
306
306
  nk_size_t row_start, nk_size_t row_count);
307
307
 
@@ -310,7 +310,7 @@ NK_PUBLIC void nk_angulars_packed_f32_serial(nk_f32_t const *a, void const *b_pa
310
310
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
311
311
  nk_size_t r_stride_in_bytes);
312
312
  /** @copydoc nk_angulars_symmetric_f32 */
313
- NK_PUBLIC void nk_angulars_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
313
+ NK_PUBLIC void nk_angulars_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
314
314
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
315
315
  nk_size_t row_start, nk_size_t row_count);
316
316
  /** @copydoc nk_euclideans_packed_f32 */
@@ -318,7 +318,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_serial(nk_f32_t const *a, void const *b_
318
318
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
319
319
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
320
320
  /** @copydoc nk_euclideans_symmetric_f32 */
321
- NK_PUBLIC void nk_euclideans_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
321
+ NK_PUBLIC void nk_euclideans_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
322
322
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
323
323
  nk_size_t row_start, nk_size_t row_count);
324
324
 
@@ -327,7 +327,7 @@ NK_PUBLIC void nk_angulars_packed_f64_serial(nk_f64_t const *a, void const *b_pa
327
327
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
328
328
  nk_size_t r_stride_in_bytes);
329
329
  /** @copydoc nk_angulars_symmetric_f64 */
330
- NK_PUBLIC void nk_angulars_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
330
+ NK_PUBLIC void nk_angulars_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
331
331
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
332
332
  nk_size_t row_start, nk_size_t row_count);
333
333
  /** @copydoc nk_euclideans_packed_f64 */
@@ -335,7 +335,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_serial(nk_f64_t const *a, void const *b_
335
335
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
336
336
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
337
337
  /** @copydoc nk_euclideans_symmetric_f64 */
338
- NK_PUBLIC void nk_euclideans_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
338
+ NK_PUBLIC void nk_euclideans_symmetric_f64_serial(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
339
339
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
340
340
  nk_size_t row_start, nk_size_t row_count);
341
341
 
@@ -344,7 +344,7 @@ NK_PUBLIC void nk_angulars_packed_f16_serial(nk_f16_t const *a, void const *b_pa
344
344
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
345
345
  nk_size_t r_stride_in_bytes);
346
346
  /** @copydoc nk_angulars_symmetric_f16 */
347
- NK_PUBLIC void nk_angulars_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
347
+ NK_PUBLIC void nk_angulars_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
348
348
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
349
349
  nk_size_t row_start, nk_size_t row_count);
350
350
  /** @copydoc nk_euclideans_packed_f16 */
@@ -352,7 +352,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_serial(nk_f16_t const *a, void const *b_
352
352
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
353
353
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
354
354
  /** @copydoc nk_euclideans_symmetric_f16 */
355
- NK_PUBLIC void nk_euclideans_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
355
+ NK_PUBLIC void nk_euclideans_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
356
356
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
357
357
  nk_size_t row_start, nk_size_t row_count);
358
358
 
@@ -361,7 +361,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_serial(nk_bf16_t const *a, void const *b_
361
361
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
362
362
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
363
363
  /** @copydoc nk_angulars_symmetric_bf16 */
364
- NK_PUBLIC void nk_angulars_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
364
+ NK_PUBLIC void nk_angulars_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
365
365
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
366
366
  nk_size_t row_start, nk_size_t row_count);
367
367
  /** @copydoc nk_euclideans_packed_bf16 */
@@ -369,7 +369,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_serial(nk_bf16_t const *a, void const *
369
369
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
370
370
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
371
371
  /** @copydoc nk_euclideans_symmetric_bf16 */
372
- NK_PUBLIC void nk_euclideans_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
372
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_serial(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
373
373
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
374
374
  nk_size_t row_start, nk_size_t row_count);
375
375
 
@@ -378,7 +378,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_serial(nk_e4m3_t const *a, void const *b_
378
378
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
379
379
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
380
380
  /** @copydoc nk_angulars_symmetric_e4m3 */
381
- NK_PUBLIC void nk_angulars_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
381
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
382
382
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
383
383
  nk_size_t row_start, nk_size_t row_count);
384
384
  /** @copydoc nk_euclideans_packed_e4m3 */
@@ -386,7 +386,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_serial(nk_e4m3_t const *a, void const *
386
386
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
387
387
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
388
388
  /** @copydoc nk_euclideans_symmetric_e4m3 */
389
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
389
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_serial(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
390
390
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
391
391
  nk_size_t row_start, nk_size_t row_count);
392
392
 
@@ -395,7 +395,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_serial(nk_e5m2_t const *a, void const *b_
395
395
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
396
396
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
397
397
  /** @copydoc nk_angulars_symmetric_e5m2 */
398
- NK_PUBLIC void nk_angulars_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
398
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
399
399
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
400
400
  nk_size_t row_start, nk_size_t row_count);
401
401
  /** @copydoc nk_euclideans_packed_e5m2 */
@@ -403,7 +403,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_serial(nk_e5m2_t const *a, void const *
403
403
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
404
404
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
405
405
  /** @copydoc nk_euclideans_symmetric_e5m2 */
406
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
406
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_serial(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
407
407
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
408
408
  nk_size_t row_start, nk_size_t row_count);
409
409
 
@@ -412,7 +412,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_serial(nk_e2m3_t const *a, void const *b_
412
412
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
413
413
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
414
414
  /** @copydoc nk_angulars_symmetric_e2m3 */
415
- NK_PUBLIC void nk_angulars_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
415
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
416
416
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
417
417
  nk_size_t row_start, nk_size_t row_count);
418
418
  /** @copydoc nk_euclideans_packed_e2m3 */
@@ -420,7 +420,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_serial(nk_e2m3_t const *a, void const *
420
420
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
421
421
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
422
422
  /** @copydoc nk_euclideans_symmetric_e2m3 */
423
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
423
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_serial(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
424
424
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
425
425
  nk_size_t row_start, nk_size_t row_count);
426
426
 
@@ -429,7 +429,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_serial(nk_e3m2_t const *a, void const *b_
429
429
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
430
430
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
431
431
  /** @copydoc nk_angulars_symmetric_e3m2 */
432
- NK_PUBLIC void nk_angulars_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
432
+ NK_PUBLIC void nk_angulars_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
433
433
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
434
434
  nk_size_t row_start, nk_size_t row_count);
435
435
  /** @copydoc nk_euclideans_packed_e3m2 */
@@ -437,7 +437,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_serial(nk_e3m2_t const *a, void const *
437
437
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
438
438
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
439
439
  /** @copydoc nk_euclideans_symmetric_e3m2 */
440
- NK_PUBLIC void nk_euclideans_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
440
+ NK_PUBLIC void nk_euclideans_symmetric_e3m2_serial(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
441
441
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
442
442
  nk_size_t row_start, nk_size_t row_count);
443
443
 
@@ -446,7 +446,7 @@ NK_PUBLIC void nk_angulars_packed_i8_serial(nk_i8_t const *a, void const *b_pack
446
446
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
447
447
  nk_size_t r_stride_in_bytes);
448
448
  /** @copydoc nk_angulars_symmetric_i8 */
449
- NK_PUBLIC void nk_angulars_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
449
+ NK_PUBLIC void nk_angulars_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
450
450
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
451
451
  nk_size_t row_start, nk_size_t row_count);
452
452
  /** @copydoc nk_euclideans_packed_i8 */
@@ -454,7 +454,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_serial(nk_i8_t const *a, void const *b_pa
454
454
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
455
455
  nk_size_t r_stride_in_bytes);
456
456
  /** @copydoc nk_euclideans_symmetric_i8 */
457
- NK_PUBLIC void nk_euclideans_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
457
+ NK_PUBLIC void nk_euclideans_symmetric_i8_serial(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
458
458
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
459
459
  nk_size_t row_start, nk_size_t row_count);
460
460
 
@@ -463,7 +463,7 @@ NK_PUBLIC void nk_angulars_packed_u8_serial(nk_u8_t const *a, void const *b_pack
463
463
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
464
464
  nk_size_t r_stride_in_bytes);
465
465
  /** @copydoc nk_angulars_symmetric_u8 */
466
- NK_PUBLIC void nk_angulars_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
466
+ NK_PUBLIC void nk_angulars_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
467
467
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
468
468
  nk_size_t row_start, nk_size_t row_count);
469
469
  /** @copydoc nk_euclideans_packed_u8 */
@@ -471,7 +471,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_serial(nk_u8_t const *a, void const *b_pa
471
471
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
472
472
  nk_size_t r_stride_in_bytes);
473
473
  /** @copydoc nk_euclideans_symmetric_u8 */
474
- NK_PUBLIC void nk_euclideans_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
474
+ NK_PUBLIC void nk_euclideans_symmetric_u8_serial(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
475
475
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
476
476
  nk_size_t row_start, nk_size_t row_count);
477
477
 
@@ -480,7 +480,7 @@ NK_PUBLIC void nk_angulars_packed_i4_serial(nk_i4x2_t const *a, void const *b_pa
480
480
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
481
481
  nk_size_t r_stride_in_bytes);
482
482
  /** @copydoc nk_angulars_symmetric_i4 */
483
- NK_PUBLIC void nk_angulars_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
483
+ NK_PUBLIC void nk_angulars_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
484
484
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
485
485
  nk_size_t row_start, nk_size_t row_count);
486
486
  /** @copydoc nk_euclideans_packed_i4 */
@@ -488,7 +488,7 @@ NK_PUBLIC void nk_euclideans_packed_i4_serial(nk_i4x2_t const *a, void const *b_
488
488
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
489
489
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
490
490
  /** @copydoc nk_euclideans_symmetric_i4 */
491
- NK_PUBLIC void nk_euclideans_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
491
+ NK_PUBLIC void nk_euclideans_symmetric_i4_serial(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
492
492
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
493
493
  nk_size_t row_start, nk_size_t row_count);
494
494
 
@@ -497,7 +497,7 @@ NK_PUBLIC void nk_angulars_packed_u4_serial(nk_u4x2_t const *a, void const *b_pa
497
497
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
498
498
  nk_size_t r_stride_in_bytes);
499
499
  /** @copydoc nk_angulars_symmetric_u4 */
500
- NK_PUBLIC void nk_angulars_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
500
+ NK_PUBLIC void nk_angulars_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
501
501
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
502
502
  nk_size_t row_start, nk_size_t row_count);
503
503
  /** @copydoc nk_euclideans_packed_u4 */
@@ -505,7 +505,7 @@ NK_PUBLIC void nk_euclideans_packed_u4_serial(nk_u4x2_t const *a, void const *b_
505
505
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
506
506
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
507
507
  /** @copydoc nk_euclideans_symmetric_u4 */
508
- NK_PUBLIC void nk_euclideans_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
508
+ NK_PUBLIC void nk_euclideans_symmetric_u4_serial(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
509
509
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
510
510
  nk_size_t row_start, nk_size_t row_count);
511
511
 
@@ -519,7 +519,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_genoa(nk_bf16_t const *a, void const *b_p
519
519
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
520
520
  nk_size_t r_stride_in_bytes);
521
521
  /** @copydoc nk_angulars_symmetric_bf16 */
522
- NK_PUBLIC void nk_angulars_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
522
+ NK_PUBLIC void nk_angulars_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
523
523
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
524
524
  nk_size_t row_start, nk_size_t row_count);
525
525
  /** @copydoc nk_euclideans_packed_bf16 */
@@ -527,7 +527,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_genoa(nk_bf16_t const *a, void const *b
527
527
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
528
528
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
529
529
  /** @copydoc nk_euclideans_symmetric_bf16 */
530
- NK_PUBLIC void nk_euclideans_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
530
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_genoa(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
531
531
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
532
532
  nk_size_t row_start, nk_size_t row_count);
533
533
 
@@ -536,7 +536,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_genoa(nk_e4m3_t const *a, void const *b_p
536
536
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
537
537
  nk_size_t r_stride_in_bytes);
538
538
  /** @copydoc nk_angulars_symmetric_e4m3 */
539
- NK_PUBLIC void nk_angulars_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
539
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
540
540
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
541
541
  nk_size_t row_start, nk_size_t row_count);
542
542
  /** @copydoc nk_euclideans_packed_e4m3 */
@@ -544,7 +544,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_genoa(nk_e4m3_t const *a, void const *b
544
544
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
545
545
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
546
546
  /** @copydoc nk_euclideans_symmetric_e4m3 */
547
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
547
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_genoa(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
548
548
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
549
549
  nk_size_t row_start, nk_size_t row_count);
550
550
 
@@ -553,7 +553,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_genoa(nk_e5m2_t const *a, void const *b_p
553
553
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
554
554
  nk_size_t r_stride_in_bytes);
555
555
  /** @copydoc nk_angulars_symmetric_e5m2 */
556
- NK_PUBLIC void nk_angulars_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
556
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
557
557
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
558
558
  nk_size_t row_start, nk_size_t row_count);
559
559
  /** @copydoc nk_euclideans_packed_e5m2 */
@@ -561,12 +561,48 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_genoa(nk_e5m2_t const *a, void const *b
561
561
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
562
562
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
563
563
  /** @copydoc nk_euclideans_symmetric_e5m2 */
564
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
564
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_genoa(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
565
565
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
566
566
  nk_size_t row_start, nk_size_t row_count);
567
567
 
568
568
  #endif // NK_TARGET_GENOA
569
569
 
570
+ #if NK_TARGET_DIAMOND
571
+ /** @copydoc nk_angulars_packed_e4m3 */
572
+ NK_PUBLIC void nk_angulars_packed_e4m3_diamond(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
573
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
574
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
575
+ /** @copydoc nk_angulars_symmetric_e4m3 */
576
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_diamond(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
577
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
578
+ nk_size_t row_start, nk_size_t row_count);
579
+ /** @copydoc nk_euclideans_packed_e4m3 */
580
+ NK_PUBLIC void nk_euclideans_packed_e4m3_diamond(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
581
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
582
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
583
+ /** @copydoc nk_euclideans_symmetric_e4m3 */
584
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_diamond(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
585
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
586
+ nk_size_t row_start, nk_size_t row_count);
587
+
588
+ /** @copydoc nk_angulars_packed_e5m2 */
589
+ NK_PUBLIC void nk_angulars_packed_e5m2_diamond(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
590
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
591
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
592
+ /** @copydoc nk_angulars_symmetric_e5m2 */
593
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_diamond(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
594
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
595
+ nk_size_t row_start, nk_size_t row_count);
596
+ /** @copydoc nk_euclideans_packed_e5m2 */
597
+ NK_PUBLIC void nk_euclideans_packed_e5m2_diamond(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
598
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
599
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
600
+ /** @copydoc nk_euclideans_symmetric_e5m2 */
601
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_diamond(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
602
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
603
+ nk_size_t row_start, nk_size_t row_count);
604
+ #endif // NK_TARGET_DIAMOND
605
+
570
606
  /* Sapphire Rapids backends using Intel AMX (Advanced Matrix Extensions).
571
607
  * AMX provides 8 tile registers (TMM0-TMM7), each holding up to 1KB of data.
572
608
  * Tiles are configured as 16 rows x 64 bytes, enabling (16 x 32) BF16 or (16 x 64) INT8 tiles.
@@ -578,92 +614,102 @@ NK_PUBLIC void nk_angulars_packed_bf16_sapphireamx(nk_bf16_t const *a, void cons
578
614
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
579
615
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
580
616
  /** @copydoc nk_angulars_symmetric_bf16 */
581
- NK_PUBLIC void nk_angulars_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
582
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
583
- nk_size_t row_start, nk_size_t row_count);
617
+ NK_PUBLIC void nk_angulars_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t vectors_count,
618
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
619
+ nk_size_t result_stride, nk_size_t row_start,
620
+ nk_size_t row_count);
584
621
  /** @copydoc nk_euclideans_packed_bf16 */
585
622
  NK_PUBLIC void nk_euclideans_packed_bf16_sapphireamx(nk_bf16_t const *a, void const *b_packed, nk_f32_t *result,
586
623
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
587
624
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
588
625
  /** @copydoc nk_euclideans_symmetric_bf16 */
589
- NK_PUBLIC void nk_euclideans_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
590
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
591
- nk_size_t row_start, nk_size_t row_count);
626
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_sapphireamx(nk_bf16_t const *vectors, nk_size_t vectors_count,
627
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
628
+ nk_size_t result_stride, nk_size_t row_start,
629
+ nk_size_t row_count);
592
630
 
593
631
  /** @copydoc nk_angulars_packed_e4m3 */
594
632
  NK_PUBLIC void nk_angulars_packed_e4m3_sapphireamx(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
595
633
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
596
634
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
597
635
  /** @copydoc nk_angulars_symmetric_e4m3 */
598
- NK_PUBLIC void nk_angulars_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
599
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
600
- nk_size_t row_start, nk_size_t row_count);
636
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t vectors_count,
637
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
638
+ nk_size_t result_stride, nk_size_t row_start,
639
+ nk_size_t row_count);
601
640
  /** @copydoc nk_euclideans_packed_e4m3 */
602
641
  NK_PUBLIC void nk_euclideans_packed_e4m3_sapphireamx(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
603
642
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
604
643
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
605
644
  /** @copydoc nk_euclideans_symmetric_e4m3 */
606
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
607
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
608
- nk_size_t row_start, nk_size_t row_count);
645
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_sapphireamx(nk_e4m3_t const *vectors, nk_size_t vectors_count,
646
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
647
+ nk_size_t result_stride, nk_size_t row_start,
648
+ nk_size_t row_count);
609
649
 
610
650
  /** @copydoc nk_angulars_packed_e5m2 */
611
651
  NK_PUBLIC void nk_angulars_packed_e5m2_sapphireamx(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
612
652
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
613
653
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
614
654
  /** @copydoc nk_angulars_symmetric_e5m2 */
615
- NK_PUBLIC void nk_angulars_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
616
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
617
- nk_size_t row_start, nk_size_t row_count);
655
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t vectors_count,
656
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
657
+ nk_size_t result_stride, nk_size_t row_start,
658
+ nk_size_t row_count);
618
659
  /** @copydoc nk_euclideans_packed_e5m2 */
619
660
  NK_PUBLIC void nk_euclideans_packed_e5m2_sapphireamx(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
620
661
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
621
662
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
622
663
  /** @copydoc nk_euclideans_symmetric_e5m2 */
623
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
624
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
625
- nk_size_t row_start, nk_size_t row_count);
664
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_sapphireamx(nk_e5m2_t const *vectors, nk_size_t vectors_count,
665
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
666
+ nk_size_t result_stride, nk_size_t row_start,
667
+ nk_size_t row_count);
626
668
 
627
669
  /** @copydoc nk_angulars_packed_e2m3 */
628
670
  NK_PUBLIC void nk_angulars_packed_e2m3_sapphireamx(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
629
671
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
630
672
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
631
673
  /** @copydoc nk_angulars_symmetric_e2m3 */
632
- NK_PUBLIC void nk_angulars_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
633
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
634
- nk_size_t row_start, nk_size_t row_count);
674
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t vectors_count,
675
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
676
+ nk_size_t result_stride, nk_size_t row_start,
677
+ nk_size_t row_count);
635
678
  /** @copydoc nk_euclideans_packed_e2m3 */
636
679
  NK_PUBLIC void nk_euclideans_packed_e2m3_sapphireamx(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
637
680
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
638
681
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
639
682
  /** @copydoc nk_euclideans_symmetric_e2m3 */
640
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
641
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
642
- nk_size_t row_start, nk_size_t row_count);
683
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_sapphireamx(nk_e2m3_t const *vectors, nk_size_t vectors_count,
684
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
685
+ nk_size_t result_stride, nk_size_t row_start,
686
+ nk_size_t row_count);
643
687
 
644
688
  /** @copydoc nk_angulars_packed_e3m2 */
645
689
  NK_PUBLIC void nk_angulars_packed_e3m2_sapphireamx(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result,
646
690
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
647
691
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
648
692
  /** @copydoc nk_angulars_symmetric_e3m2 */
649
- NK_PUBLIC void nk_angulars_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
650
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
651
- nk_size_t row_start, nk_size_t row_count);
693
+ NK_PUBLIC void nk_angulars_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t vectors_count,
694
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
695
+ nk_size_t result_stride, nk_size_t row_start,
696
+ nk_size_t row_count);
652
697
  /** @copydoc nk_euclideans_packed_e3m2 */
653
698
  NK_PUBLIC void nk_euclideans_packed_e3m2_sapphireamx(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result,
654
699
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
655
700
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
656
701
  /** @copydoc nk_euclideans_symmetric_e3m2 */
657
- NK_PUBLIC void nk_euclideans_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
658
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
659
- nk_size_t row_start, nk_size_t row_count);
702
+ NK_PUBLIC void nk_euclideans_symmetric_e3m2_sapphireamx(nk_e3m2_t const *vectors, nk_size_t vectors_count,
703
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
704
+ nk_size_t result_stride, nk_size_t row_start,
705
+ nk_size_t row_count);
660
706
 
661
707
  /** @copydoc nk_angulars_packed_i8 */
662
708
  NK_PUBLIC void nk_angulars_packed_i8_sapphireamx(nk_i8_t const *a, void const *b_packed, nk_f32_t *result,
663
709
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
664
710
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
665
711
  /** @copydoc nk_angulars_symmetric_i8 */
666
- NK_PUBLIC void nk_angulars_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
712
+ NK_PUBLIC void nk_angulars_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
667
713
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
668
714
  nk_size_t row_start, nk_size_t row_count);
669
715
  /** @copydoc nk_euclideans_packed_i8 */
@@ -671,7 +717,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_sapphireamx(nk_i8_t const *a, void const
671
717
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
672
718
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
673
719
  /** @copydoc nk_euclideans_symmetric_i8 */
674
- NK_PUBLIC void nk_euclideans_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
720
+ NK_PUBLIC void nk_euclideans_symmetric_i8_sapphireamx(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
675
721
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
676
722
  nk_size_t row_start, nk_size_t row_count);
677
723
 
@@ -680,7 +726,7 @@ NK_PUBLIC void nk_angulars_packed_u8_sapphireamx(nk_u8_t const *a, void const *b
680
726
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
681
727
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
682
728
  /** @copydoc nk_angulars_symmetric_u8 */
683
- NK_PUBLIC void nk_angulars_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
729
+ NK_PUBLIC void nk_angulars_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
684
730
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
685
731
  nk_size_t row_start, nk_size_t row_count);
686
732
  /** @copydoc nk_euclideans_packed_u8 */
@@ -688,7 +734,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_sapphireamx(nk_u8_t const *a, void const
688
734
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
689
735
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
690
736
  /** @copydoc nk_euclideans_symmetric_u8 */
691
- NK_PUBLIC void nk_euclideans_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
737
+ NK_PUBLIC void nk_euclideans_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
692
738
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
693
739
  nk_size_t row_start, nk_size_t row_count);
694
740
  #endif // NK_TARGET_SAPPHIREAMX
@@ -703,7 +749,7 @@ NK_PUBLIC void nk_angulars_packed_f16_sme(nk_f16_t const *a, void const *b_packe
703
749
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
704
750
  nk_size_t r_stride_in_bytes);
705
751
  /** @copydoc nk_angulars_symmetric_f16 */
706
- NK_PUBLIC void nk_angulars_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
752
+ NK_PUBLIC void nk_angulars_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
707
753
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
708
754
  nk_size_t row_start, nk_size_t row_count);
709
755
  /** @copydoc nk_euclideans_packed_f16 */
@@ -711,7 +757,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_sme(nk_f16_t const *a, void const *b_pac
711
757
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
712
758
  nk_size_t r_stride_in_bytes);
713
759
  /** @copydoc nk_euclideans_symmetric_f16 */
714
- NK_PUBLIC void nk_euclideans_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
760
+ NK_PUBLIC void nk_euclideans_symmetric_f16_sme(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
715
761
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
716
762
  nk_size_t row_start, nk_size_t row_count);
717
763
 
@@ -720,7 +766,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_sme(nk_bf16_t const *a, void const *b_pac
720
766
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
721
767
  nk_size_t r_stride_in_bytes);
722
768
  /** @copydoc nk_angulars_symmetric_bf16 */
723
- NK_PUBLIC void nk_angulars_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
769
+ NK_PUBLIC void nk_angulars_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
724
770
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
725
771
  nk_size_t row_start, nk_size_t row_count);
726
772
  /** @copydoc nk_euclideans_packed_bf16 */
@@ -728,7 +774,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_sme(nk_bf16_t const *a, void const *b_p
728
774
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
729
775
  nk_size_t r_stride_in_bytes);
730
776
  /** @copydoc nk_euclideans_symmetric_bf16 */
731
- NK_PUBLIC void nk_euclideans_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
777
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_sme(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
732
778
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
733
779
  nk_size_t row_start, nk_size_t row_count);
734
780
 
@@ -737,7 +783,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_sme(nk_e4m3_t const *a, void const *b_pac
737
783
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
738
784
  nk_size_t r_stride_in_bytes);
739
785
  /** @copydoc nk_angulars_symmetric_e4m3 */
740
- NK_PUBLIC void nk_angulars_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
786
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
741
787
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
742
788
  nk_size_t row_start, nk_size_t row_count);
743
789
  /** @copydoc nk_euclideans_packed_e4m3 */
@@ -745,7 +791,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_sme(nk_e4m3_t const *a, void const *b_p
745
791
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
746
792
  nk_size_t r_stride_in_bytes);
747
793
  /** @copydoc nk_euclideans_symmetric_e4m3 */
748
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
794
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_sme(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
749
795
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
750
796
  nk_size_t row_start, nk_size_t row_count);
751
797
 
@@ -754,7 +800,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_sme(nk_e5m2_t const *a, void const *b_pac
754
800
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
755
801
  nk_size_t r_stride_in_bytes);
756
802
  /** @copydoc nk_angulars_symmetric_e5m2 */
757
- NK_PUBLIC void nk_angulars_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
803
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
758
804
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
759
805
  nk_size_t row_start, nk_size_t row_count);
760
806
  /** @copydoc nk_euclideans_packed_e5m2 */
@@ -762,7 +808,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_sme(nk_e5m2_t const *a, void const *b_p
762
808
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
763
809
  nk_size_t r_stride_in_bytes);
764
810
  /** @copydoc nk_euclideans_symmetric_e5m2 */
765
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
811
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_sme(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
766
812
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
767
813
  nk_size_t row_start, nk_size_t row_count);
768
814
 
@@ -771,7 +817,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_sme(nk_e2m3_t const *a, void const *b_pac
771
817
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
772
818
  nk_size_t r_stride_in_bytes);
773
819
  /** @copydoc nk_angulars_symmetric_e2m3 */
774
- NK_PUBLIC void nk_angulars_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
820
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
775
821
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
776
822
  nk_size_t row_start, nk_size_t row_count);
777
823
  /** @copydoc nk_euclideans_packed_e2m3 */
@@ -779,7 +825,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_sme(nk_e2m3_t const *a, void const *b_p
779
825
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
780
826
  nk_size_t r_stride_in_bytes);
781
827
  /** @copydoc nk_euclideans_symmetric_e2m3 */
782
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
828
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_sme(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
783
829
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
784
830
  nk_size_t row_start, nk_size_t row_count);
785
831
 
@@ -788,7 +834,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_sme(nk_e3m2_t const *a, void const *b_pac
788
834
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
789
835
  nk_size_t r_stride_in_bytes);
790
836
  /** @copydoc nk_angulars_symmetric_e3m2 */
791
- NK_PUBLIC void nk_angulars_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
837
+ NK_PUBLIC void nk_angulars_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
792
838
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
793
839
  nk_size_t row_start, nk_size_t row_count);
794
840
  /** @copydoc nk_euclideans_packed_e3m2 */
@@ -796,7 +842,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_sme(nk_e3m2_t const *a, void const *b_p
796
842
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
797
843
  nk_size_t r_stride_in_bytes);
798
844
  /** @copydoc nk_euclideans_symmetric_e3m2 */
799
- NK_PUBLIC void nk_euclideans_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
845
+ NK_PUBLIC void nk_euclideans_symmetric_e3m2_sme(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
800
846
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
801
847
  nk_size_t row_start, nk_size_t row_count);
802
848
 
@@ -805,7 +851,7 @@ NK_PUBLIC void nk_angulars_packed_i8_sme(nk_i8_t const *a, void const *b_packed,
805
851
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
806
852
  nk_size_t r_stride_in_bytes);
807
853
  /** @copydoc nk_angulars_symmetric_i8 */
808
- NK_PUBLIC void nk_angulars_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
854
+ NK_PUBLIC void nk_angulars_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
809
855
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
810
856
  nk_size_t row_start, nk_size_t row_count);
811
857
  /** @copydoc nk_euclideans_packed_i8 */
@@ -813,7 +859,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_sme(nk_i8_t const *a, void const *b_packe
813
859
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
814
860
  nk_size_t r_stride_in_bytes);
815
861
  /** @copydoc nk_euclideans_symmetric_i8 */
816
- NK_PUBLIC void nk_euclideans_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
862
+ NK_PUBLIC void nk_euclideans_symmetric_i8_sme(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
817
863
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
818
864
  nk_size_t row_start, nk_size_t row_count);
819
865
 
@@ -822,7 +868,7 @@ NK_PUBLIC void nk_angulars_packed_u8_sme(nk_u8_t const *a, void const *b_packed,
822
868
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
823
869
  nk_size_t r_stride_in_bytes);
824
870
  /** @copydoc nk_angulars_symmetric_u8 */
825
- NK_PUBLIC void nk_angulars_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
871
+ NK_PUBLIC void nk_angulars_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
826
872
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
827
873
  nk_size_t row_start, nk_size_t row_count);
828
874
  /** @copydoc nk_euclideans_packed_u8 */
@@ -830,7 +876,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_sme(nk_u8_t const *a, void const *b_packe
830
876
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
831
877
  nk_size_t r_stride_in_bytes);
832
878
  /** @copydoc nk_euclideans_symmetric_u8 */
833
- NK_PUBLIC void nk_euclideans_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
879
+ NK_PUBLIC void nk_euclideans_symmetric_u8_sme(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
834
880
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
835
881
  nk_size_t row_start, nk_size_t row_count);
836
882
 
@@ -839,7 +885,7 @@ NK_PUBLIC void nk_angulars_packed_i4_sme(nk_i4x2_t const *a, void const *b_packe
839
885
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
840
886
  nk_size_t r_stride_in_bytes);
841
887
  /** @copydoc nk_angulars_symmetric_i4 */
842
- NK_PUBLIC void nk_angulars_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
888
+ NK_PUBLIC void nk_angulars_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
843
889
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
844
890
  nk_size_t row_start, nk_size_t row_count);
845
891
  /** @copydoc nk_euclideans_packed_i4 */
@@ -847,7 +893,7 @@ NK_PUBLIC void nk_euclideans_packed_i4_sme(nk_i4x2_t const *a, void const *b_pac
847
893
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
848
894
  nk_size_t r_stride_in_bytes);
849
895
  /** @copydoc nk_euclideans_symmetric_i4 */
850
- NK_PUBLIC void nk_euclideans_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
896
+ NK_PUBLIC void nk_euclideans_symmetric_i4_sme(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
851
897
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
852
898
  nk_size_t row_start, nk_size_t row_count);
853
899
 
@@ -856,7 +902,7 @@ NK_PUBLIC void nk_angulars_packed_u4_sme(nk_u4x2_t const *a, void const *b_packe
856
902
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
857
903
  nk_size_t r_stride_in_bytes);
858
904
  /** @copydoc nk_angulars_symmetric_u4 */
859
- NK_PUBLIC void nk_angulars_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
905
+ NK_PUBLIC void nk_angulars_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
860
906
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
861
907
  nk_size_t row_start, nk_size_t row_count);
862
908
  /** @copydoc nk_euclideans_packed_u4 */
@@ -864,7 +910,7 @@ NK_PUBLIC void nk_euclideans_packed_u4_sme(nk_u4x2_t const *a, void const *b_pac
864
910
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
865
911
  nk_size_t r_stride_in_bytes);
866
912
  /** @copydoc nk_euclideans_symmetric_u4 */
867
- NK_PUBLIC void nk_euclideans_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
913
+ NK_PUBLIC void nk_euclideans_symmetric_u4_sme(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
868
914
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
869
915
  nk_size_t row_start, nk_size_t row_count);
870
916
  #endif // NK_TARGET_SME
@@ -878,7 +924,7 @@ NK_PUBLIC void nk_angulars_packed_f32_smef64(nk_f32_t const *a, void const *b_pa
878
924
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
879
925
  nk_size_t r_stride_in_bytes);
880
926
  /** @copydoc nk_angulars_symmetric_f32 */
881
- NK_PUBLIC void nk_angulars_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
927
+ NK_PUBLIC void nk_angulars_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
882
928
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
883
929
  nk_size_t row_start, nk_size_t row_count);
884
930
  /** @copydoc nk_euclideans_packed_f32 */
@@ -886,7 +932,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_smef64(nk_f32_t const *a, void const *b_
886
932
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
887
933
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
888
934
  /** @copydoc nk_euclideans_symmetric_f32 */
889
- NK_PUBLIC void nk_euclideans_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
935
+ NK_PUBLIC void nk_euclideans_symmetric_f32_smef64(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
890
936
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
891
937
  nk_size_t row_start, nk_size_t row_count);
892
938
 
@@ -895,7 +941,7 @@ NK_PUBLIC void nk_angulars_packed_f64_smef64(nk_f64_t const *a, void const *b_pa
895
941
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
896
942
  nk_size_t r_stride_in_bytes);
897
943
  /** @copydoc nk_angulars_symmetric_f64 */
898
- NK_PUBLIC void nk_angulars_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
944
+ NK_PUBLIC void nk_angulars_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
899
945
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
900
946
  nk_size_t row_start, nk_size_t row_count);
901
947
  /** @copydoc nk_euclideans_packed_f64 */
@@ -903,7 +949,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_smef64(nk_f64_t const *a, void const *b_
903
949
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
904
950
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
905
951
  /** @copydoc nk_euclideans_symmetric_f64 */
906
- NK_PUBLIC void nk_euclideans_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
952
+ NK_PUBLIC void nk_euclideans_symmetric_f64_smef64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
907
953
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
908
954
  nk_size_t row_start, nk_size_t row_count);
909
955
  #endif // NK_TARGET_SMEF64
@@ -917,7 +963,7 @@ NK_PUBLIC void nk_angulars_packed_f32_haswell(nk_f32_t const *a, void const *b_p
917
963
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
918
964
  nk_size_t r_stride_in_bytes);
919
965
  /** @copydoc nk_angulars_symmetric_f32 */
920
- NK_PUBLIC void nk_angulars_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
966
+ NK_PUBLIC void nk_angulars_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
921
967
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
922
968
  nk_size_t row_start, nk_size_t row_count);
923
969
  /** @copydoc nk_euclideans_packed_f32 */
@@ -925,7 +971,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_haswell(nk_f32_t const *a, void const *b
925
971
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
926
972
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
927
973
  /** @copydoc nk_euclideans_symmetric_f32 */
928
- NK_PUBLIC void nk_euclideans_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
974
+ NK_PUBLIC void nk_euclideans_symmetric_f32_haswell(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
929
975
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
930
976
  nk_size_t row_start, nk_size_t row_count);
931
977
 
@@ -934,7 +980,7 @@ NK_PUBLIC void nk_angulars_packed_f64_haswell(nk_f64_t const *a, void const *b_p
934
980
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
935
981
  nk_size_t r_stride_in_bytes);
936
982
  /** @copydoc nk_angulars_symmetric_f64 */
937
- NK_PUBLIC void nk_angulars_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
983
+ NK_PUBLIC void nk_angulars_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
938
984
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
939
985
  nk_size_t row_start, nk_size_t row_count);
940
986
  /** @copydoc nk_euclideans_packed_f64 */
@@ -942,7 +988,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_haswell(nk_f64_t const *a, void const *b
942
988
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
943
989
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
944
990
  /** @copydoc nk_euclideans_symmetric_f64 */
945
- NK_PUBLIC void nk_euclideans_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
991
+ NK_PUBLIC void nk_euclideans_symmetric_f64_haswell(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
946
992
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
947
993
  nk_size_t row_start, nk_size_t row_count);
948
994
 
@@ -951,7 +997,7 @@ NK_PUBLIC void nk_angulars_packed_f16_haswell(nk_f16_t const *a, void const *b_p
951
997
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
952
998
  nk_size_t r_stride_in_bytes);
953
999
  /** @copydoc nk_angulars_symmetric_f16 */
954
- NK_PUBLIC void nk_angulars_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1000
+ NK_PUBLIC void nk_angulars_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
955
1001
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
956
1002
  nk_size_t row_start, nk_size_t row_count);
957
1003
  /** @copydoc nk_euclideans_packed_f16 */
@@ -959,7 +1005,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_haswell(nk_f16_t const *a, void const *b
959
1005
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
960
1006
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
961
1007
  /** @copydoc nk_euclideans_symmetric_f16 */
962
- NK_PUBLIC void nk_euclideans_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1008
+ NK_PUBLIC void nk_euclideans_symmetric_f16_haswell(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
963
1009
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
964
1010
  nk_size_t row_start, nk_size_t row_count);
965
1011
 
@@ -968,7 +1014,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_haswell(nk_bf16_t const *a, void const *b
968
1014
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
969
1015
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
970
1016
  /** @copydoc nk_angulars_symmetric_bf16 */
971
- NK_PUBLIC void nk_angulars_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1017
+ NK_PUBLIC void nk_angulars_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
972
1018
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
973
1019
  nk_size_t row_start, nk_size_t row_count);
974
1020
  /** @copydoc nk_euclideans_packed_bf16 */
@@ -976,7 +1022,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_haswell(nk_bf16_t const *a, void const
976
1022
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
977
1023
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
978
1024
  /** @copydoc nk_euclideans_symmetric_bf16 */
979
- NK_PUBLIC void nk_euclideans_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1025
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_haswell(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
980
1026
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
981
1027
  nk_size_t row_start, nk_size_t row_count);
982
1028
 
@@ -985,7 +1031,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_haswell(nk_e4m3_t const *a, void const *b
985
1031
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
986
1032
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
987
1033
  /** @copydoc nk_angulars_symmetric_e4m3 */
988
- NK_PUBLIC void nk_angulars_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1034
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
989
1035
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
990
1036
  nk_size_t row_start, nk_size_t row_count);
991
1037
  /** @copydoc nk_euclideans_packed_e4m3 */
@@ -993,7 +1039,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_haswell(nk_e4m3_t const *a, void const
993
1039
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
994
1040
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
995
1041
  /** @copydoc nk_euclideans_symmetric_e4m3 */
996
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1042
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_haswell(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
997
1043
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
998
1044
  nk_size_t row_start, nk_size_t row_count);
999
1045
 
@@ -1002,7 +1048,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_haswell(nk_e5m2_t const *a, void const *b
1002
1048
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1003
1049
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1004
1050
  /** @copydoc nk_angulars_symmetric_e5m2 */
1005
- NK_PUBLIC void nk_angulars_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1051
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1006
1052
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1007
1053
  nk_size_t row_start, nk_size_t row_count);
1008
1054
  /** @copydoc nk_euclideans_packed_e5m2 */
@@ -1010,7 +1056,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_haswell(nk_e5m2_t const *a, void const
1010
1056
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1011
1057
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1012
1058
  /** @copydoc nk_euclideans_symmetric_e5m2 */
1013
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1059
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_haswell(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1014
1060
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1015
1061
  nk_size_t row_start, nk_size_t row_count);
1016
1062
 
@@ -1019,7 +1065,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_haswell(nk_e2m3_t const *a, void const *b
1019
1065
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1020
1066
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1021
1067
  /** @copydoc nk_angulars_symmetric_e2m3 */
1022
- NK_PUBLIC void nk_angulars_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1068
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1023
1069
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1024
1070
  nk_size_t row_start, nk_size_t row_count);
1025
1071
  /** @copydoc nk_euclideans_packed_e2m3 */
@@ -1027,7 +1073,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_haswell(nk_e2m3_t const *a, void const
1027
1073
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1028
1074
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1029
1075
  /** @copydoc nk_euclideans_symmetric_e2m3 */
1030
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1076
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_haswell(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1031
1077
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1032
1078
  nk_size_t row_start, nk_size_t row_count);
1033
1079
 
@@ -1036,7 +1082,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_haswell(nk_e3m2_t const *a, void const *b
1036
1082
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1037
1083
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1038
1084
  /** @copydoc nk_angulars_symmetric_e3m2 */
1039
- NK_PUBLIC void nk_angulars_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1085
+ NK_PUBLIC void nk_angulars_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1040
1086
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1041
1087
  nk_size_t row_start, nk_size_t row_count);
1042
1088
  /** @copydoc nk_euclideans_packed_e3m2 */
@@ -1044,7 +1090,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_haswell(nk_e3m2_t const *a, void const
1044
1090
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1045
1091
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1046
1092
  /** @copydoc nk_euclideans_symmetric_e3m2 */
1047
- NK_PUBLIC void nk_euclideans_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1093
+ NK_PUBLIC void nk_euclideans_symmetric_e3m2_haswell(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1048
1094
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1049
1095
  nk_size_t row_start, nk_size_t row_count);
1050
1096
  /** @copydoc nk_angulars_packed_i8 */
@@ -1052,7 +1098,7 @@ NK_PUBLIC void nk_angulars_packed_i8_haswell(nk_i8_t const *a, void const *b_pac
1052
1098
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1053
1099
  nk_size_t r_stride_in_bytes);
1054
1100
  /** @copydoc nk_angulars_symmetric_i8 */
1055
- NK_PUBLIC void nk_angulars_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1101
+ NK_PUBLIC void nk_angulars_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1056
1102
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1057
1103
  nk_size_t row_start, nk_size_t row_count);
1058
1104
  /** @copydoc nk_euclideans_packed_i8 */
@@ -1060,7 +1106,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_haswell(nk_i8_t const *a, void const *b_p
1060
1106
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1061
1107
  nk_size_t r_stride_in_bytes);
1062
1108
  /** @copydoc nk_euclideans_symmetric_i8 */
1063
- NK_PUBLIC void nk_euclideans_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1109
+ NK_PUBLIC void nk_euclideans_symmetric_i8_haswell(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1064
1110
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1065
1111
  nk_size_t row_start, nk_size_t row_count);
1066
1112
  /** @copydoc nk_angulars_packed_u8 */
@@ -1068,7 +1114,7 @@ NK_PUBLIC void nk_angulars_packed_u8_haswell(nk_u8_t const *a, void const *b_pac
1068
1114
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1069
1115
  nk_size_t r_stride_in_bytes);
1070
1116
  /** @copydoc nk_angulars_symmetric_u8 */
1071
- NK_PUBLIC void nk_angulars_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1117
+ NK_PUBLIC void nk_angulars_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1072
1118
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1073
1119
  nk_size_t row_start, nk_size_t row_count);
1074
1120
  /** @copydoc nk_euclideans_packed_u8 */
@@ -1076,7 +1122,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_haswell(nk_u8_t const *a, void const *b_p
1076
1122
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1077
1123
  nk_size_t r_stride_in_bytes);
1078
1124
  /** @copydoc nk_euclideans_symmetric_u8 */
1079
- NK_PUBLIC void nk_euclideans_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1125
+ NK_PUBLIC void nk_euclideans_symmetric_u8_haswell(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1080
1126
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1081
1127
  nk_size_t row_start, nk_size_t row_count);
1082
1128
  #endif // NK_TARGET_HASWELL
@@ -1090,7 +1136,7 @@ NK_PUBLIC void nk_angulars_packed_f32_skylake(nk_f32_t const *a, void const *b_p
1090
1136
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1091
1137
  nk_size_t r_stride_in_bytes);
1092
1138
  /** @copydoc nk_angulars_symmetric_f32 */
1093
- NK_PUBLIC void nk_angulars_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1139
+ NK_PUBLIC void nk_angulars_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1094
1140
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1095
1141
  nk_size_t row_start, nk_size_t row_count);
1096
1142
  /** @copydoc nk_euclideans_packed_f32 */
@@ -1098,7 +1144,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_skylake(nk_f32_t const *a, void const *b
1098
1144
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1099
1145
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1100
1146
  /** @copydoc nk_euclideans_symmetric_f32 */
1101
- NK_PUBLIC void nk_euclideans_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1147
+ NK_PUBLIC void nk_euclideans_symmetric_f32_skylake(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1102
1148
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1103
1149
  nk_size_t row_start, nk_size_t row_count);
1104
1150
 
@@ -1107,7 +1153,7 @@ NK_PUBLIC void nk_angulars_packed_f64_skylake(nk_f64_t const *a, void const *b_p
1107
1153
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1108
1154
  nk_size_t r_stride_in_bytes);
1109
1155
  /** @copydoc nk_angulars_symmetric_f64 */
1110
- NK_PUBLIC void nk_angulars_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1156
+ NK_PUBLIC void nk_angulars_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1111
1157
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1112
1158
  nk_size_t row_start, nk_size_t row_count);
1113
1159
  /** @copydoc nk_euclideans_packed_f64 */
@@ -1115,7 +1161,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_skylake(nk_f64_t const *a, void const *b
1115
1161
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1116
1162
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1117
1163
  /** @copydoc nk_euclideans_symmetric_f64 */
1118
- NK_PUBLIC void nk_euclideans_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1164
+ NK_PUBLIC void nk_euclideans_symmetric_f64_skylake(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1119
1165
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1120
1166
  nk_size_t row_start, nk_size_t row_count);
1121
1167
 
@@ -1124,7 +1170,7 @@ NK_PUBLIC void nk_angulars_packed_f16_skylake(nk_f16_t const *a, void const *b_p
1124
1170
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1125
1171
  nk_size_t r_stride_in_bytes);
1126
1172
  /** @copydoc nk_angulars_symmetric_f16 */
1127
- NK_PUBLIC void nk_angulars_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1173
+ NK_PUBLIC void nk_angulars_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1128
1174
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1129
1175
  nk_size_t row_start, nk_size_t row_count);
1130
1176
  /** @copydoc nk_euclideans_packed_f16 */
@@ -1132,7 +1178,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_skylake(nk_f16_t const *a, void const *b
1132
1178
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1133
1179
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1134
1180
  /** @copydoc nk_euclideans_symmetric_f16 */
1135
- NK_PUBLIC void nk_euclideans_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1181
+ NK_PUBLIC void nk_euclideans_symmetric_f16_skylake(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1136
1182
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1137
1183
  nk_size_t row_start, nk_size_t row_count);
1138
1184
 
@@ -1141,7 +1187,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_skylake(nk_bf16_t const *a, void const *b
1141
1187
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1142
1188
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1143
1189
  /** @copydoc nk_angulars_symmetric_bf16 */
1144
- NK_PUBLIC void nk_angulars_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1190
+ NK_PUBLIC void nk_angulars_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1145
1191
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1146
1192
  nk_size_t row_start, nk_size_t row_count);
1147
1193
  /** @copydoc nk_euclideans_packed_bf16 */
@@ -1149,7 +1195,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_skylake(nk_bf16_t const *a, void const
1149
1195
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1150
1196
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1151
1197
  /** @copydoc nk_euclideans_symmetric_bf16 */
1152
- NK_PUBLIC void nk_euclideans_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1198
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_skylake(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1153
1199
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1154
1200
  nk_size_t row_start, nk_size_t row_count);
1155
1201
 
@@ -1158,7 +1204,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_skylake(nk_e4m3_t const *a, void const *b
1158
1204
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1159
1205
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1160
1206
  /** @copydoc nk_angulars_symmetric_e4m3 */
1161
- NK_PUBLIC void nk_angulars_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1207
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1162
1208
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1163
1209
  nk_size_t row_start, nk_size_t row_count);
1164
1210
  /** @copydoc nk_euclideans_packed_e4m3 */
@@ -1166,7 +1212,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_skylake(nk_e4m3_t const *a, void const
1166
1212
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1167
1213
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1168
1214
  /** @copydoc nk_euclideans_symmetric_e4m3 */
1169
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1215
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_skylake(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1170
1216
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1171
1217
  nk_size_t row_start, nk_size_t row_count);
1172
1218
 
@@ -1175,7 +1221,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_skylake(nk_e5m2_t const *a, void const *b
1175
1221
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1176
1222
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1177
1223
  /** @copydoc nk_angulars_symmetric_e5m2 */
1178
- NK_PUBLIC void nk_angulars_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1224
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1179
1225
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1180
1226
  nk_size_t row_start, nk_size_t row_count);
1181
1227
  /** @copydoc nk_euclideans_packed_e5m2 */
@@ -1183,7 +1229,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_skylake(nk_e5m2_t const *a, void const
1183
1229
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1184
1230
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1185
1231
  /** @copydoc nk_euclideans_symmetric_e5m2 */
1186
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1232
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_skylake(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1187
1233
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1188
1234
  nk_size_t row_start, nk_size_t row_count);
1189
1235
 
@@ -1192,7 +1238,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_skylake(nk_e2m3_t const *a, void const *b
1192
1238
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1193
1239
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1194
1240
  /** @copydoc nk_angulars_symmetric_e2m3 */
1195
- NK_PUBLIC void nk_angulars_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1241
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1196
1242
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1197
1243
  nk_size_t row_start, nk_size_t row_count);
1198
1244
  /** @copydoc nk_euclideans_packed_e2m3 */
@@ -1200,7 +1246,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_skylake(nk_e2m3_t const *a, void const
1200
1246
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1201
1247
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1202
1248
  /** @copydoc nk_euclideans_symmetric_e2m3 */
1203
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1249
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_skylake(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1204
1250
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1205
1251
  nk_size_t row_start, nk_size_t row_count);
1206
1252
 
@@ -1209,7 +1255,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_skylake(nk_e3m2_t const *a, void const *b
1209
1255
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1210
1256
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1211
1257
  /** @copydoc nk_angulars_symmetric_e3m2 */
1212
- NK_PUBLIC void nk_angulars_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1258
+ NK_PUBLIC void nk_angulars_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1213
1259
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1214
1260
  nk_size_t row_start, nk_size_t row_count);
1215
1261
  /** @copydoc nk_euclideans_packed_e3m2 */
@@ -1217,7 +1263,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_skylake(nk_e3m2_t const *a, void const
1217
1263
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1218
1264
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1219
1265
  /** @copydoc nk_euclideans_symmetric_e3m2 */
1220
- NK_PUBLIC void nk_euclideans_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1266
+ NK_PUBLIC void nk_euclideans_symmetric_e3m2_skylake(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1221
1267
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1222
1268
  nk_size_t row_start, nk_size_t row_count);
1223
1269
  #endif // NK_TARGET_SKYLAKE
@@ -1231,7 +1277,7 @@ NK_PUBLIC void nk_angulars_packed_i8_icelake(nk_i8_t const *a, void const *b_pac
1231
1277
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1232
1278
  nk_size_t r_stride_in_bytes);
1233
1279
  /** @copydoc nk_angulars_symmetric_i8 */
1234
- NK_PUBLIC void nk_angulars_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1280
+ NK_PUBLIC void nk_angulars_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1235
1281
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1236
1282
  nk_size_t row_start, nk_size_t row_count);
1237
1283
  /** @copydoc nk_euclideans_packed_i8 */
@@ -1239,7 +1285,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_icelake(nk_i8_t const *a, void const *b_p
1239
1285
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1240
1286
  nk_size_t r_stride_in_bytes);
1241
1287
  /** @copydoc nk_euclideans_symmetric_i8 */
1242
- NK_PUBLIC void nk_euclideans_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1288
+ NK_PUBLIC void nk_euclideans_symmetric_i8_icelake(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1243
1289
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1244
1290
  nk_size_t row_start, nk_size_t row_count);
1245
1291
 
@@ -1248,7 +1294,7 @@ NK_PUBLIC void nk_angulars_packed_u8_icelake(nk_u8_t const *a, void const *b_pac
1248
1294
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1249
1295
  nk_size_t r_stride_in_bytes);
1250
1296
  /** @copydoc nk_angulars_symmetric_u8 */
1251
- NK_PUBLIC void nk_angulars_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1297
+ NK_PUBLIC void nk_angulars_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1252
1298
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1253
1299
  nk_size_t row_start, nk_size_t row_count);
1254
1300
  /** @copydoc nk_euclideans_packed_u8 */
@@ -1256,7 +1302,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_icelake(nk_u8_t const *a, void const *b_p
1256
1302
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1257
1303
  nk_size_t r_stride_in_bytes);
1258
1304
  /** @copydoc nk_euclideans_symmetric_u8 */
1259
- NK_PUBLIC void nk_euclideans_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1305
+ NK_PUBLIC void nk_euclideans_symmetric_u8_icelake(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1260
1306
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1261
1307
  nk_size_t row_start, nk_size_t row_count);
1262
1308
 
@@ -1265,7 +1311,7 @@ NK_PUBLIC void nk_angulars_packed_i4_icelake(nk_i4x2_t const *a, void const *b_p
1265
1311
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1266
1312
  nk_size_t r_stride_in_bytes);
1267
1313
  /** @copydoc nk_angulars_symmetric_i4 */
1268
- NK_PUBLIC void nk_angulars_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1314
+ NK_PUBLIC void nk_angulars_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1269
1315
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1270
1316
  nk_size_t row_start, nk_size_t row_count);
1271
1317
  /** @copydoc nk_euclideans_packed_i4 */
@@ -1273,7 +1319,7 @@ NK_PUBLIC void nk_euclideans_packed_i4_icelake(nk_i4x2_t const *a, void const *b
1273
1319
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1274
1320
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1275
1321
  /** @copydoc nk_euclideans_symmetric_i4 */
1276
- NK_PUBLIC void nk_euclideans_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1322
+ NK_PUBLIC void nk_euclideans_symmetric_i4_icelake(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1277
1323
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1278
1324
  nk_size_t row_start, nk_size_t row_count);
1279
1325
 
@@ -1282,7 +1328,7 @@ NK_PUBLIC void nk_angulars_packed_u4_icelake(nk_u4x2_t const *a, void const *b_p
1282
1328
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1283
1329
  nk_size_t r_stride_in_bytes);
1284
1330
  /** @copydoc nk_angulars_symmetric_u4 */
1285
- NK_PUBLIC void nk_angulars_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1331
+ NK_PUBLIC void nk_angulars_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1286
1332
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1287
1333
  nk_size_t row_start, nk_size_t row_count);
1288
1334
  /** @copydoc nk_euclideans_packed_u4 */
@@ -1290,7 +1336,7 @@ NK_PUBLIC void nk_euclideans_packed_u4_icelake(nk_u4x2_t const *a, void const *b
1290
1336
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1291
1337
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1292
1338
  /** @copydoc nk_euclideans_symmetric_u4 */
1293
- NK_PUBLIC void nk_euclideans_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1339
+ NK_PUBLIC void nk_euclideans_symmetric_u4_icelake(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1294
1340
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1295
1341
  nk_size_t row_start, nk_size_t row_count);
1296
1342
  #endif // NK_TARGET_ICELAKE
@@ -1301,7 +1347,7 @@ NK_PUBLIC void nk_angulars_packed_i8_alder(nk_i8_t const *a, void const *b_packe
1301
1347
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1302
1348
  nk_size_t r_stride_in_bytes);
1303
1349
  /** @copydoc nk_angulars_symmetric_i8 */
1304
- NK_PUBLIC void nk_angulars_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1350
+ NK_PUBLIC void nk_angulars_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1305
1351
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1306
1352
  nk_size_t row_start, nk_size_t row_count);
1307
1353
  /** @copydoc nk_euclideans_packed_i8 */
@@ -1309,7 +1355,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_alder(nk_i8_t const *a, void const *b_pac
1309
1355
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1310
1356
  nk_size_t r_stride_in_bytes);
1311
1357
  /** @copydoc nk_euclideans_symmetric_i8 */
1312
- NK_PUBLIC void nk_euclideans_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1358
+ NK_PUBLIC void nk_euclideans_symmetric_i8_alder(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1313
1359
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1314
1360
  nk_size_t row_start, nk_size_t row_count);
1315
1361
  /** @copydoc nk_angulars_packed_u8 */
@@ -1317,7 +1363,7 @@ NK_PUBLIC void nk_angulars_packed_u8_alder(nk_u8_t const *a, void const *b_packe
1317
1363
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1318
1364
  nk_size_t r_stride_in_bytes);
1319
1365
  /** @copydoc nk_angulars_symmetric_u8 */
1320
- NK_PUBLIC void nk_angulars_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1366
+ NK_PUBLIC void nk_angulars_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1321
1367
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1322
1368
  nk_size_t row_start, nk_size_t row_count);
1323
1369
  /** @copydoc nk_euclideans_packed_u8 */
@@ -1325,7 +1371,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_alder(nk_u8_t const *a, void const *b_pac
1325
1371
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1326
1372
  nk_size_t r_stride_in_bytes);
1327
1373
  /** @copydoc nk_euclideans_symmetric_u8 */
1328
- NK_PUBLIC void nk_euclideans_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1374
+ NK_PUBLIC void nk_euclideans_symmetric_u8_alder(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1329
1375
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1330
1376
  nk_size_t row_start, nk_size_t row_count);
1331
1377
  /** @copydoc nk_angulars_packed_e2m3 */
@@ -1333,7 +1379,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_alder(nk_e2m3_t const *a, void const *b_p
1333
1379
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1334
1380
  nk_size_t r_stride_in_bytes);
1335
1381
  /** @copydoc nk_angulars_symmetric_e2m3 */
1336
- NK_PUBLIC void nk_angulars_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1382
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1337
1383
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1338
1384
  nk_size_t row_start, nk_size_t row_count);
1339
1385
  /** @copydoc nk_euclideans_packed_e2m3 */
@@ -1341,7 +1387,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_alder(nk_e2m3_t const *a, void const *b
1341
1387
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1342
1388
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1343
1389
  /** @copydoc nk_euclideans_symmetric_e2m3 */
1344
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1390
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_alder(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1345
1391
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1346
1392
  nk_size_t row_start, nk_size_t row_count);
1347
1393
  #endif // NK_TARGET_ALDER
@@ -1355,7 +1401,7 @@ NK_PUBLIC void nk_angulars_packed_i8_sierra(nk_i8_t const *a, void const *b_pack
1355
1401
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1356
1402
  nk_size_t r_stride_in_bytes);
1357
1403
  /** @copydoc nk_angulars_symmetric_i8 */
1358
- NK_PUBLIC void nk_angulars_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1404
+ NK_PUBLIC void nk_angulars_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1359
1405
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1360
1406
  nk_size_t row_start, nk_size_t row_count);
1361
1407
  /** @copydoc nk_euclideans_packed_i8 */
@@ -1363,7 +1409,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_sierra(nk_i8_t const *a, void const *b_pa
1363
1409
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1364
1410
  nk_size_t r_stride_in_bytes);
1365
1411
  /** @copydoc nk_euclideans_symmetric_i8 */
1366
- NK_PUBLIC void nk_euclideans_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1412
+ NK_PUBLIC void nk_euclideans_symmetric_i8_sierra(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1367
1413
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1368
1414
  nk_size_t row_start, nk_size_t row_count);
1369
1415
  /** @copydoc nk_angulars_packed_u8 */
@@ -1371,7 +1417,7 @@ NK_PUBLIC void nk_angulars_packed_u8_sierra(nk_u8_t const *a, void const *b_pack
1371
1417
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1372
1418
  nk_size_t r_stride_in_bytes);
1373
1419
  /** @copydoc nk_angulars_symmetric_u8 */
1374
- NK_PUBLIC void nk_angulars_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1420
+ NK_PUBLIC void nk_angulars_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1375
1421
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1376
1422
  nk_size_t row_start, nk_size_t row_count);
1377
1423
  /** @copydoc nk_euclideans_packed_u8 */
@@ -1379,7 +1425,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_sierra(nk_u8_t const *a, void const *b_pa
1379
1425
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1380
1426
  nk_size_t r_stride_in_bytes);
1381
1427
  /** @copydoc nk_euclideans_symmetric_u8 */
1382
- NK_PUBLIC void nk_euclideans_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1428
+ NK_PUBLIC void nk_euclideans_symmetric_u8_sierra(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1383
1429
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1384
1430
  nk_size_t row_start, nk_size_t row_count);
1385
1431
  /** @copydoc nk_angulars_packed_e2m3 */
@@ -1387,7 +1433,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_sierra(nk_e2m3_t const *a, void const *b_
1387
1433
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1388
1434
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1389
1435
  /** @copydoc nk_angulars_symmetric_e2m3 */
1390
- NK_PUBLIC void nk_angulars_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1436
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1391
1437
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1392
1438
  nk_size_t row_start, nk_size_t row_count);
1393
1439
  /** @copydoc nk_euclideans_packed_e2m3 */
@@ -1395,7 +1441,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_sierra(nk_e2m3_t const *a, void const *
1395
1441
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1396
1442
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1397
1443
  /** @copydoc nk_euclideans_symmetric_e2m3 */
1398
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1444
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_sierra(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1399
1445
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1400
1446
  nk_size_t row_start, nk_size_t row_count);
1401
1447
  #endif // NK_TARGET_SIERRA
@@ -1409,7 +1455,7 @@ NK_PUBLIC void nk_angulars_packed_i8_v128relaxed(nk_i8_t const *a, void const *b
1409
1455
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1410
1456
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1411
1457
  /** @copydoc nk_angulars_symmetric_i8 */
1412
- NK_PUBLIC void nk_angulars_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1458
+ NK_PUBLIC void nk_angulars_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1413
1459
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1414
1460
  nk_size_t row_start, nk_size_t row_count);
1415
1461
  /** @copydoc nk_euclideans_packed_i8 */
@@ -1417,7 +1463,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_v128relaxed(nk_i8_t const *a, void const
1417
1463
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1418
1464
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1419
1465
  /** @copydoc nk_euclideans_symmetric_i8 */
1420
- NK_PUBLIC void nk_euclideans_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1466
+ NK_PUBLIC void nk_euclideans_symmetric_i8_v128relaxed(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1421
1467
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1422
1468
  nk_size_t row_start, nk_size_t row_count);
1423
1469
  /** @copydoc nk_angulars_packed_u8 */
@@ -1425,7 +1471,7 @@ NK_PUBLIC void nk_angulars_packed_u8_v128relaxed(nk_u8_t const *a, void const *b
1425
1471
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1426
1472
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1427
1473
  /** @copydoc nk_angulars_symmetric_u8 */
1428
- NK_PUBLIC void nk_angulars_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1474
+ NK_PUBLIC void nk_angulars_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1429
1475
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1430
1476
  nk_size_t row_start, nk_size_t row_count);
1431
1477
  /** @copydoc nk_euclideans_packed_u8 */
@@ -1433,7 +1479,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_v128relaxed(nk_u8_t const *a, void const
1433
1479
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1434
1480
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1435
1481
  /** @copydoc nk_euclideans_symmetric_u8 */
1436
- NK_PUBLIC void nk_euclideans_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1482
+ NK_PUBLIC void nk_euclideans_symmetric_u8_v128relaxed(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1437
1483
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1438
1484
  nk_size_t row_start, nk_size_t row_count);
1439
1485
  /** @copydoc nk_angulars_packed_e2m3 */
@@ -1441,71 +1487,79 @@ NK_PUBLIC void nk_angulars_packed_e2m3_v128relaxed(nk_e2m3_t const *a, void cons
1441
1487
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1442
1488
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1443
1489
  /** @copydoc nk_angulars_symmetric_e2m3 */
1444
- NK_PUBLIC void nk_angulars_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1445
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1446
- nk_size_t row_start, nk_size_t row_count);
1490
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t vectors_count,
1491
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1492
+ nk_size_t result_stride, nk_size_t row_start,
1493
+ nk_size_t row_count);
1447
1494
  /** @copydoc nk_euclideans_packed_e2m3 */
1448
1495
  NK_PUBLIC void nk_euclideans_packed_e2m3_v128relaxed(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
1449
1496
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1450
1497
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1451
1498
  /** @copydoc nk_euclideans_symmetric_e2m3 */
1452
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1453
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1454
- nk_size_t row_start, nk_size_t row_count);
1499
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_v128relaxed(nk_e2m3_t const *vectors, nk_size_t vectors_count,
1500
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1501
+ nk_size_t result_stride, nk_size_t row_start,
1502
+ nk_size_t row_count);
1455
1503
  /** @copydoc nk_angulars_packed_e4m3 */
1456
1504
  NK_PUBLIC void nk_angulars_packed_e4m3_v128relaxed(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
1457
1505
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1458
1506
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1459
1507
  /** @copydoc nk_angulars_symmetric_e4m3 */
1460
- NK_PUBLIC void nk_angulars_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1461
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1462
- nk_size_t row_start, nk_size_t row_count);
1508
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t vectors_count,
1509
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1510
+ nk_size_t result_stride, nk_size_t row_start,
1511
+ nk_size_t row_count);
1463
1512
  /** @copydoc nk_euclideans_packed_e4m3 */
1464
1513
  NK_PUBLIC void nk_euclideans_packed_e4m3_v128relaxed(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
1465
1514
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1466
1515
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1467
1516
  /** @copydoc nk_euclideans_symmetric_e4m3 */
1468
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1469
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1470
- nk_size_t row_start, nk_size_t row_count);
1517
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_v128relaxed(nk_e4m3_t const *vectors, nk_size_t vectors_count,
1518
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1519
+ nk_size_t result_stride, nk_size_t row_start,
1520
+ nk_size_t row_count);
1471
1521
  /** @copydoc nk_angulars_packed_e5m2 */
1472
1522
  NK_PUBLIC void nk_angulars_packed_e5m2_v128relaxed(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
1473
1523
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1474
1524
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1475
1525
  /** @copydoc nk_angulars_symmetric_e5m2 */
1476
- NK_PUBLIC void nk_angulars_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1477
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1478
- nk_size_t row_start, nk_size_t row_count);
1526
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t vectors_count,
1527
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1528
+ nk_size_t result_stride, nk_size_t row_start,
1529
+ nk_size_t row_count);
1479
1530
  /** @copydoc nk_euclideans_packed_e5m2 */
1480
1531
  NK_PUBLIC void nk_euclideans_packed_e5m2_v128relaxed(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
1481
1532
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1482
1533
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1483
1534
  /** @copydoc nk_euclideans_symmetric_e5m2 */
1484
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1485
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1486
- nk_size_t row_start, nk_size_t row_count);
1535
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_v128relaxed(nk_e5m2_t const *vectors, nk_size_t vectors_count,
1536
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1537
+ nk_size_t result_stride, nk_size_t row_start,
1538
+ nk_size_t row_count);
1487
1539
  /** @copydoc nk_angulars_packed_bf16 */
1488
1540
  NK_PUBLIC void nk_angulars_packed_bf16_v128relaxed(nk_bf16_t const *a, void const *b_packed, nk_f32_t *result,
1489
1541
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1490
1542
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1491
1543
  /** @copydoc nk_angulars_symmetric_bf16 */
1492
- NK_PUBLIC void nk_angulars_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1493
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1494
- nk_size_t row_start, nk_size_t row_count);
1544
+ NK_PUBLIC void nk_angulars_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t vectors_count,
1545
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1546
+ nk_size_t result_stride, nk_size_t row_start,
1547
+ nk_size_t row_count);
1495
1548
  /** @copydoc nk_euclideans_packed_bf16 */
1496
1549
  NK_PUBLIC void nk_euclideans_packed_bf16_v128relaxed(nk_bf16_t const *a, void const *b_packed, nk_f32_t *result,
1497
1550
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1498
1551
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1499
1552
  /** @copydoc nk_euclideans_symmetric_bf16 */
1500
- NK_PUBLIC void nk_euclideans_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1501
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1502
- nk_size_t row_start, nk_size_t row_count);
1553
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_v128relaxed(nk_bf16_t const *vectors, nk_size_t vectors_count,
1554
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1555
+ nk_size_t result_stride, nk_size_t row_start,
1556
+ nk_size_t row_count);
1503
1557
  /** @copydoc nk_angulars_packed_f32 */
1504
1558
  NK_PUBLIC void nk_angulars_packed_f32_v128relaxed(nk_f32_t const *a, void const *b_packed, nk_f64_t *result,
1505
1559
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1506
1560
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1507
1561
  /** @copydoc nk_angulars_symmetric_f32 */
1508
- NK_PUBLIC void nk_angulars_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1562
+ NK_PUBLIC void nk_angulars_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1509
1563
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1510
1564
  nk_size_t row_start, nk_size_t row_count);
1511
1565
  /** @copydoc nk_euclideans_packed_f32 */
@@ -1513,15 +1567,16 @@ NK_PUBLIC void nk_euclideans_packed_f32_v128relaxed(nk_f32_t const *a, void cons
1513
1567
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1514
1568
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1515
1569
  /** @copydoc nk_euclideans_symmetric_f32 */
1516
- NK_PUBLIC void nk_euclideans_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1517
- nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1518
- nk_size_t row_start, nk_size_t row_count);
1570
+ NK_PUBLIC void nk_euclideans_symmetric_f32_v128relaxed(nk_f32_t const *vectors, nk_size_t vectors_count,
1571
+ nk_size_t depth, nk_size_t stride, nk_f64_t *result,
1572
+ nk_size_t result_stride, nk_size_t row_start,
1573
+ nk_size_t row_count);
1519
1574
  /** @copydoc nk_angulars_packed_f64 */
1520
1575
  NK_PUBLIC void nk_angulars_packed_f64_v128relaxed(nk_f64_t const *a, void const *b_packed, nk_f64_t *result,
1521
1576
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1522
1577
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1523
1578
  /** @copydoc nk_angulars_symmetric_f64 */
1524
- NK_PUBLIC void nk_angulars_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1579
+ NK_PUBLIC void nk_angulars_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1525
1580
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1526
1581
  nk_size_t row_start, nk_size_t row_count);
1527
1582
  /** @copydoc nk_euclideans_packed_f64 */
@@ -1529,9 +1584,10 @@ NK_PUBLIC void nk_euclideans_packed_f64_v128relaxed(nk_f64_t const *a, void cons
1529
1584
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1530
1585
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1531
1586
  /** @copydoc nk_euclideans_symmetric_f64 */
1532
- NK_PUBLIC void nk_euclideans_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1533
- nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1534
- nk_size_t row_start, nk_size_t row_count);
1587
+ NK_PUBLIC void nk_euclideans_symmetric_f64_v128relaxed(nk_f64_t const *vectors, nk_size_t vectors_count,
1588
+ nk_size_t depth, nk_size_t stride, nk_f64_t *result,
1589
+ nk_size_t result_stride, nk_size_t row_start,
1590
+ nk_size_t row_count);
1535
1591
  #endif // NK_TARGET_V128RELAXED
1536
1592
 
1537
1593
  /* ARM NEON backends (base NEON with F32/F64 support).
@@ -1543,7 +1599,7 @@ NK_PUBLIC void nk_angulars_packed_f32_neon(nk_f32_t const *a, void const *b_pack
1543
1599
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1544
1600
  nk_size_t r_stride_in_bytes);
1545
1601
  /** @copydoc nk_angulars_symmetric_f32 */
1546
- NK_PUBLIC void nk_angulars_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1602
+ NK_PUBLIC void nk_angulars_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1547
1603
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1548
1604
  nk_size_t row_start, nk_size_t row_count);
1549
1605
  /** @copydoc nk_euclideans_packed_f32 */
@@ -1551,7 +1607,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_neon(nk_f32_t const *a, void const *b_pa
1551
1607
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1552
1608
  nk_size_t r_stride_in_bytes);
1553
1609
  /** @copydoc nk_euclideans_symmetric_f32 */
1554
- NK_PUBLIC void nk_euclideans_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1610
+ NK_PUBLIC void nk_euclideans_symmetric_f32_neon(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1555
1611
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1556
1612
  nk_size_t row_start, nk_size_t row_count);
1557
1613
 
@@ -1560,7 +1616,7 @@ NK_PUBLIC void nk_angulars_packed_f64_neon(nk_f64_t const *a, void const *b_pack
1560
1616
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1561
1617
  nk_size_t r_stride_in_bytes);
1562
1618
  /** @copydoc nk_angulars_symmetric_f64 */
1563
- NK_PUBLIC void nk_angulars_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1619
+ NK_PUBLIC void nk_angulars_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1564
1620
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1565
1621
  nk_size_t row_start, nk_size_t row_count);
1566
1622
  /** @copydoc nk_euclideans_packed_f64 */
@@ -1568,7 +1624,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_neon(nk_f64_t const *a, void const *b_pa
1568
1624
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1569
1625
  nk_size_t r_stride_in_bytes);
1570
1626
  /** @copydoc nk_euclideans_symmetric_f64 */
1571
- NK_PUBLIC void nk_euclideans_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1627
+ NK_PUBLIC void nk_euclideans_symmetric_f64_neon(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1572
1628
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1573
1629
  nk_size_t row_start, nk_size_t row_count);
1574
1630
  /** @copydoc nk_angulars_packed_bf16 */
@@ -1576,7 +1632,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_neon(nk_bf16_t const *a, void const *b_pa
1576
1632
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1577
1633
  nk_size_t r_stride_in_bytes);
1578
1634
  /** @copydoc nk_angulars_symmetric_bf16 */
1579
- NK_PUBLIC void nk_angulars_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1635
+ NK_PUBLIC void nk_angulars_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1580
1636
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1581
1637
  nk_size_t row_start, nk_size_t row_count);
1582
1638
  /** @copydoc nk_euclideans_packed_bf16 */
@@ -1584,7 +1640,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_neon(nk_bf16_t const *a, void const *b_
1584
1640
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1585
1641
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1586
1642
  /** @copydoc nk_euclideans_symmetric_bf16 */
1587
- NK_PUBLIC void nk_euclideans_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1643
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_neon(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1588
1644
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1589
1645
  nk_size_t row_start, nk_size_t row_count);
1590
1646
  /** @copydoc nk_angulars_packed_f16 */
@@ -1592,7 +1648,7 @@ NK_PUBLIC void nk_angulars_packed_f16_neon(nk_f16_t const *a, void const *b_pack
1592
1648
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1593
1649
  nk_size_t r_stride_in_bytes);
1594
1650
  /** @copydoc nk_angulars_symmetric_f16 */
1595
- NK_PUBLIC void nk_angulars_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1651
+ NK_PUBLIC void nk_angulars_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1596
1652
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1597
1653
  nk_size_t row_start, nk_size_t row_count);
1598
1654
  /** @copydoc nk_euclideans_packed_f16 */
@@ -1600,33 +1656,11 @@ NK_PUBLIC void nk_euclideans_packed_f16_neon(nk_f16_t const *a, void const *b_pa
1600
1656
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1601
1657
  nk_size_t r_stride_in_bytes);
1602
1658
  /** @copydoc nk_euclideans_symmetric_f16 */
1603
- NK_PUBLIC void nk_euclideans_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1659
+ NK_PUBLIC void nk_euclideans_symmetric_f16_neon(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1604
1660
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1605
1661
  nk_size_t row_start, nk_size_t row_count);
1606
1662
  #endif // NK_TARGET_NEON
1607
1663
 
1608
- /* ARM NEON with F16 arithmetic (ARMv8.2-A FP16).
1609
- * Provides native F16 FMLA for half-precision dot products.
1610
- */
1611
- #if NK_TARGET_NEONHALF
1612
- /** @copydoc nk_angulars_packed_f16 */
1613
- NK_PUBLIC void nk_angulars_packed_f16_neonhalf(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
1614
- nk_size_t rows, nk_size_t cols, nk_size_t depth,
1615
- nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1616
- /** @copydoc nk_angulars_symmetric_f16 */
1617
- NK_PUBLIC void nk_angulars_symmetric_f16_neonhalf(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1618
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1619
- nk_size_t row_start, nk_size_t row_count);
1620
- /** @copydoc nk_euclideans_packed_f16 */
1621
- NK_PUBLIC void nk_euclideans_packed_f16_neonhalf(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
1622
- nk_size_t rows, nk_size_t cols, nk_size_t depth,
1623
- nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1624
- /** @copydoc nk_euclideans_symmetric_f16 */
1625
- NK_PUBLIC void nk_euclideans_symmetric_f16_neonhalf(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1626
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1627
- nk_size_t row_start, nk_size_t row_count);
1628
- #endif // NK_TARGET_NEONHALF
1629
-
1630
1664
  /* ARM NEON with BF16 dot product (ARMv8.6-A BF16).
1631
1665
  * Uses BFDOT/BFMMLA for efficient BF16 matrix operations.
1632
1666
  */
@@ -1636,7 +1670,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_neonbfdot(nk_bf16_t const *a, void const
1636
1670
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1637
1671
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1638
1672
  /** @copydoc nk_angulars_symmetric_bf16 */
1639
- NK_PUBLIC void nk_angulars_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1673
+ NK_PUBLIC void nk_angulars_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1640
1674
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1641
1675
  nk_size_t row_start, nk_size_t row_count);
1642
1676
  /** @copydoc nk_euclideans_packed_bf16 */
@@ -1644,9 +1678,10 @@ NK_PUBLIC void nk_euclideans_packed_bf16_neonbfdot(nk_bf16_t const *a, void cons
1644
1678
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1645
1679
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1646
1680
  /** @copydoc nk_euclideans_symmetric_bf16 */
1647
- NK_PUBLIC void nk_euclideans_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1648
- nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1649
- nk_size_t row_start, nk_size_t row_count);
1681
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_neonbfdot(nk_bf16_t const *vectors, nk_size_t vectors_count,
1682
+ nk_size_t depth, nk_size_t stride, nk_f32_t *result,
1683
+ nk_size_t result_stride, nk_size_t row_start,
1684
+ nk_size_t row_count);
1650
1685
  #endif // NK_TARGET_NEONBFDOT
1651
1686
 
1652
1687
  /* ARM NEON with signed/unsigned dot product (ARMv8.2-A DotProd).
@@ -1658,7 +1693,7 @@ NK_PUBLIC void nk_angulars_packed_i8_neonsdot(nk_i8_t const *a, void const *b_pa
1658
1693
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1659
1694
  nk_size_t r_stride_in_bytes);
1660
1695
  /** @copydoc nk_angulars_symmetric_i8 */
1661
- NK_PUBLIC void nk_angulars_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1696
+ NK_PUBLIC void nk_angulars_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1662
1697
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1663
1698
  nk_size_t row_start, nk_size_t row_count);
1664
1699
  /** @copydoc nk_euclideans_packed_i8 */
@@ -1666,7 +1701,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_neonsdot(nk_i8_t const *a, void const *b_
1666
1701
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1667
1702
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1668
1703
  /** @copydoc nk_euclideans_symmetric_i8 */
1669
- NK_PUBLIC void nk_euclideans_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1704
+ NK_PUBLIC void nk_euclideans_symmetric_i8_neonsdot(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1670
1705
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1671
1706
  nk_size_t row_start, nk_size_t row_count);
1672
1707
 
@@ -1675,7 +1710,7 @@ NK_PUBLIC void nk_angulars_packed_u8_neonsdot(nk_u8_t const *a, void const *b_pa
1675
1710
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1676
1711
  nk_size_t r_stride_in_bytes);
1677
1712
  /** @copydoc nk_angulars_symmetric_u8 */
1678
- NK_PUBLIC void nk_angulars_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1713
+ NK_PUBLIC void nk_angulars_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1679
1714
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1680
1715
  nk_size_t row_start, nk_size_t row_count);
1681
1716
  /** @copydoc nk_euclideans_packed_u8 */
@@ -1683,7 +1718,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_neonsdot(nk_u8_t const *a, void const *b_
1683
1718
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1684
1719
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1685
1720
  /** @copydoc nk_euclideans_symmetric_u8 */
1686
- NK_PUBLIC void nk_euclideans_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1721
+ NK_PUBLIC void nk_euclideans_symmetric_u8_neonsdot(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1687
1722
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1688
1723
  nk_size_t row_start, nk_size_t row_count);
1689
1724
 
@@ -1692,7 +1727,7 @@ NK_PUBLIC void nk_angulars_packed_i4_neonsdot(nk_i4x2_t const *a, void const *b_
1692
1727
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1693
1728
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1694
1729
  /** @copydoc nk_angulars_symmetric_i4 */
1695
- NK_PUBLIC void nk_angulars_symmetric_i4_neonsdot(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1730
+ NK_PUBLIC void nk_angulars_symmetric_i4_neonsdot(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1696
1731
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1697
1732
  nk_size_t row_start, nk_size_t row_count);
1698
1733
  /** @copydoc nk_euclideans_packed_i4 */
@@ -1700,7 +1735,7 @@ NK_PUBLIC void nk_euclideans_packed_i4_neonsdot(nk_i4x2_t const *a, void const *
1700
1735
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1701
1736
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1702
1737
  /** @copydoc nk_euclideans_symmetric_i4 */
1703
- NK_PUBLIC void nk_euclideans_symmetric_i4_neonsdot(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1738
+ NK_PUBLIC void nk_euclideans_symmetric_i4_neonsdot(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1704
1739
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1705
1740
  nk_size_t row_start, nk_size_t row_count);
1706
1741
 
@@ -1709,7 +1744,7 @@ NK_PUBLIC void nk_angulars_packed_u4_neonsdot(nk_u4x2_t const *a, void const *b_
1709
1744
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1710
1745
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1711
1746
  /** @copydoc nk_angulars_symmetric_u4 */
1712
- NK_PUBLIC void nk_angulars_symmetric_u4_neonsdot(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1747
+ NK_PUBLIC void nk_angulars_symmetric_u4_neonsdot(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1713
1748
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1714
1749
  nk_size_t row_start, nk_size_t row_count);
1715
1750
  /** @copydoc nk_euclideans_packed_u4 */
@@ -1717,7 +1752,7 @@ NK_PUBLIC void nk_euclideans_packed_u4_neonsdot(nk_u4x2_t const *a, void const *
1717
1752
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1718
1753
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1719
1754
  /** @copydoc nk_euclideans_symmetric_u4 */
1720
- NK_PUBLIC void nk_euclideans_symmetric_u4_neonsdot(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1755
+ NK_PUBLIC void nk_euclideans_symmetric_u4_neonsdot(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1721
1756
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1722
1757
  nk_size_t row_start, nk_size_t row_count);
1723
1758
  #endif // NK_TARGET_NEONSDOT
@@ -1731,7 +1766,7 @@ NK_PUBLIC void nk_angulars_packed_f16_neonfhm(nk_f16_t const *a, void const *b_p
1731
1766
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1732
1767
  nk_size_t r_stride_in_bytes);
1733
1768
  /** @copydoc nk_angulars_symmetric_f16 */
1734
- NK_PUBLIC void nk_angulars_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1769
+ NK_PUBLIC void nk_angulars_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1735
1770
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1736
1771
  nk_size_t row_start, nk_size_t row_count);
1737
1772
  /** @copydoc nk_euclideans_packed_f16 */
@@ -1739,7 +1774,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_neonfhm(nk_f16_t const *a, void const *b
1739
1774
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1740
1775
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1741
1776
  /** @copydoc nk_euclideans_symmetric_f16 */
1742
- NK_PUBLIC void nk_euclideans_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1777
+ NK_PUBLIC void nk_euclideans_symmetric_f16_neonfhm(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1743
1778
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1744
1779
  nk_size_t row_start, nk_size_t row_count);
1745
1780
 
@@ -1748,7 +1783,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_neonfhm(nk_e4m3_t const *a, void const *b
1748
1783
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1749
1784
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1750
1785
  /** @copydoc nk_angulars_symmetric_e4m3 */
1751
- NK_PUBLIC void nk_angulars_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1786
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1752
1787
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1753
1788
  nk_size_t row_start, nk_size_t row_count);
1754
1789
  /** @copydoc nk_euclideans_packed_e4m3 */
@@ -1756,7 +1791,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_neonfhm(nk_e4m3_t const *a, void const
1756
1791
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1757
1792
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1758
1793
  /** @copydoc nk_euclideans_symmetric_e4m3 */
1759
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1794
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_neonfhm(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1760
1795
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1761
1796
  nk_size_t row_start, nk_size_t row_count);
1762
1797
 
@@ -1765,7 +1800,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_neonfhm(nk_e5m2_t const *a, void const *b
1765
1800
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1766
1801
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1767
1802
  /** @copydoc nk_angulars_symmetric_e5m2 */
1768
- NK_PUBLIC void nk_angulars_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1803
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1769
1804
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1770
1805
  nk_size_t row_start, nk_size_t row_count);
1771
1806
  /** @copydoc nk_euclideans_packed_e5m2 */
@@ -1773,19 +1808,93 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_neonfhm(nk_e5m2_t const *a, void const
1773
1808
  nk_size_t rows, nk_size_t cols, nk_size_t depth,
1774
1809
  nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1775
1810
  /** @copydoc nk_euclideans_symmetric_e5m2 */
1776
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1811
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_neonfhm(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1777
1812
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1778
1813
  nk_size_t row_start, nk_size_t row_count);
1779
1814
 
1780
1815
  #endif // NK_TARGET_NEONFHM
1781
1816
 
1817
+ /* ARM NEON with FP8 (ARMv9.2-A FP8).
1818
+ * Uses native FP8 dot-product instructions for E4M3/E5M2/E2M3/E3M2 operations.
1819
+ */
1820
+ #if NK_TARGET_NEONFP8
1821
+ /** @copydoc nk_angulars_packed_e4m3 */
1822
+ NK_PUBLIC void nk_angulars_packed_e4m3_neonfp8(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
1823
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
1824
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1825
+ /** @copydoc nk_angulars_symmetric_e4m3 */
1826
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_neonfp8(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1827
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1828
+ nk_size_t row_start, nk_size_t row_count);
1829
+ /** @copydoc nk_euclideans_packed_e4m3 */
1830
+ NK_PUBLIC void nk_euclideans_packed_e4m3_neonfp8(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result,
1831
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
1832
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1833
+ /** @copydoc nk_euclideans_symmetric_e4m3 */
1834
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_neonfp8(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1835
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1836
+ nk_size_t row_start, nk_size_t row_count);
1837
+
1838
+ /** @copydoc nk_angulars_packed_e5m2 */
1839
+ NK_PUBLIC void nk_angulars_packed_e5m2_neonfp8(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
1840
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
1841
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1842
+ /** @copydoc nk_angulars_symmetric_e5m2 */
1843
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_neonfp8(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1844
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1845
+ nk_size_t row_start, nk_size_t row_count);
1846
+ /** @copydoc nk_euclideans_packed_e5m2 */
1847
+ NK_PUBLIC void nk_euclideans_packed_e5m2_neonfp8(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result,
1848
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
1849
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1850
+ /** @copydoc nk_euclideans_symmetric_e5m2 */
1851
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_neonfp8(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1852
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1853
+ nk_size_t row_start, nk_size_t row_count);
1854
+
1855
+ /** @copydoc nk_angulars_packed_e2m3 */
1856
+ NK_PUBLIC void nk_angulars_packed_e2m3_neonfp8(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
1857
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
1858
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1859
+ /** @copydoc nk_angulars_symmetric_e2m3 */
1860
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_neonfp8(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1861
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1862
+ nk_size_t row_start, nk_size_t row_count);
1863
+ /** @copydoc nk_euclideans_packed_e2m3 */
1864
+ NK_PUBLIC void nk_euclideans_packed_e2m3_neonfp8(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result,
1865
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
1866
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1867
+ /** @copydoc nk_euclideans_symmetric_e2m3 */
1868
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_neonfp8(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1869
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1870
+ nk_size_t row_start, nk_size_t row_count);
1871
+
1872
+ /** @copydoc nk_angulars_packed_e3m2 */
1873
+ NK_PUBLIC void nk_angulars_packed_e3m2_neonfp8(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result,
1874
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
1875
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1876
+ /** @copydoc nk_angulars_symmetric_e3m2 */
1877
+ NK_PUBLIC void nk_angulars_symmetric_e3m2_neonfp8(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1878
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1879
+ nk_size_t row_start, nk_size_t row_count);
1880
+ /** @copydoc nk_euclideans_packed_e3m2 */
1881
+ NK_PUBLIC void nk_euclideans_packed_e3m2_neonfp8(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result,
1882
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
1883
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
1884
+ /** @copydoc nk_euclideans_symmetric_e3m2 */
1885
+ NK_PUBLIC void nk_euclideans_symmetric_e3m2_neonfp8(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1886
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1887
+ nk_size_t row_start, nk_size_t row_count);
1888
+
1889
+ #endif // NK_TARGET_NEONFP8
1890
+
1782
1891
  #if NK_TARGET_RVV
1783
1892
  /** @copydoc nk_angulars_packed_f32 */
1784
1893
  NK_PUBLIC void nk_angulars_packed_f32_rvv(nk_f32_t const *a, void const *b_packed, nk_f64_t *result, nk_size_t rows,
1785
1894
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1786
1895
  nk_size_t r_stride_in_bytes);
1787
1896
  /** @copydoc nk_angulars_symmetric_f32 */
1788
- NK_PUBLIC void nk_angulars_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1897
+ NK_PUBLIC void nk_angulars_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1789
1898
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1790
1899
  nk_size_t row_start, nk_size_t row_count);
1791
1900
  /** @copydoc nk_euclideans_packed_f32 */
@@ -1793,7 +1902,7 @@ NK_PUBLIC void nk_euclideans_packed_f32_rvv(nk_f32_t const *a, void const *b_pac
1793
1902
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1794
1903
  nk_size_t r_stride_in_bytes);
1795
1904
  /** @copydoc nk_euclideans_symmetric_f32 */
1796
- NK_PUBLIC void nk_euclideans_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1905
+ NK_PUBLIC void nk_euclideans_symmetric_f32_rvv(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1797
1906
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1798
1907
  nk_size_t row_start, nk_size_t row_count);
1799
1908
 
@@ -1802,7 +1911,7 @@ NK_PUBLIC void nk_angulars_packed_f64_rvv(nk_f64_t const *a, void const *b_packe
1802
1911
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1803
1912
  nk_size_t r_stride_in_bytes);
1804
1913
  /** @copydoc nk_angulars_symmetric_f64 */
1805
- NK_PUBLIC void nk_angulars_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1914
+ NK_PUBLIC void nk_angulars_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1806
1915
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1807
1916
  nk_size_t row_start, nk_size_t row_count);
1808
1917
  /** @copydoc nk_euclideans_packed_f64 */
@@ -1810,7 +1919,7 @@ NK_PUBLIC void nk_euclideans_packed_f64_rvv(nk_f64_t const *a, void const *b_pac
1810
1919
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1811
1920
  nk_size_t r_stride_in_bytes);
1812
1921
  /** @copydoc nk_euclideans_symmetric_f64 */
1813
- NK_PUBLIC void nk_euclideans_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1922
+ NK_PUBLIC void nk_euclideans_symmetric_f64_rvv(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1814
1923
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
1815
1924
  nk_size_t row_start, nk_size_t row_count);
1816
1925
 
@@ -1819,7 +1928,7 @@ NK_PUBLIC void nk_angulars_packed_f16_rvv(nk_f16_t const *a, void const *b_packe
1819
1928
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1820
1929
  nk_size_t r_stride_in_bytes);
1821
1930
  /** @copydoc nk_angulars_symmetric_f16 */
1822
- NK_PUBLIC void nk_angulars_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1931
+ NK_PUBLIC void nk_angulars_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1823
1932
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1824
1933
  nk_size_t row_start, nk_size_t row_count);
1825
1934
  /** @copydoc nk_euclideans_packed_f16 */
@@ -1827,7 +1936,7 @@ NK_PUBLIC void nk_euclideans_packed_f16_rvv(nk_f16_t const *a, void const *b_pac
1827
1936
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1828
1937
  nk_size_t r_stride_in_bytes);
1829
1938
  /** @copydoc nk_euclideans_symmetric_f16 */
1830
- NK_PUBLIC void nk_euclideans_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1939
+ NK_PUBLIC void nk_euclideans_symmetric_f16_rvv(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1831
1940
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1832
1941
  nk_size_t row_start, nk_size_t row_count);
1833
1942
 
@@ -1836,7 +1945,7 @@ NK_PUBLIC void nk_angulars_packed_bf16_rvv(nk_bf16_t const *a, void const *b_pac
1836
1945
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1837
1946
  nk_size_t r_stride_in_bytes);
1838
1947
  /** @copydoc nk_angulars_symmetric_bf16 */
1839
- NK_PUBLIC void nk_angulars_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1948
+ NK_PUBLIC void nk_angulars_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1840
1949
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1841
1950
  nk_size_t row_start, nk_size_t row_count);
1842
1951
  /** @copydoc nk_euclideans_packed_bf16 */
@@ -1844,7 +1953,7 @@ NK_PUBLIC void nk_euclideans_packed_bf16_rvv(nk_bf16_t const *a, void const *b_p
1844
1953
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1845
1954
  nk_size_t r_stride_in_bytes);
1846
1955
  /** @copydoc nk_euclideans_symmetric_bf16 */
1847
- NK_PUBLIC void nk_euclideans_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1956
+ NK_PUBLIC void nk_euclideans_symmetric_bf16_rvv(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1848
1957
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1849
1958
  nk_size_t row_start, nk_size_t row_count);
1850
1959
 
@@ -1853,7 +1962,7 @@ NK_PUBLIC void nk_angulars_packed_e4m3_rvv(nk_e4m3_t const *a, void const *b_pac
1853
1962
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1854
1963
  nk_size_t r_stride_in_bytes);
1855
1964
  /** @copydoc nk_angulars_symmetric_e4m3 */
1856
- NK_PUBLIC void nk_angulars_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1965
+ NK_PUBLIC void nk_angulars_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1857
1966
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1858
1967
  nk_size_t row_start, nk_size_t row_count);
1859
1968
  /** @copydoc nk_euclideans_packed_e4m3 */
@@ -1861,7 +1970,7 @@ NK_PUBLIC void nk_euclideans_packed_e4m3_rvv(nk_e4m3_t const *a, void const *b_p
1861
1970
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1862
1971
  nk_size_t r_stride_in_bytes);
1863
1972
  /** @copydoc nk_euclideans_symmetric_e4m3 */
1864
- NK_PUBLIC void nk_euclideans_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1973
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3_rvv(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1865
1974
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1866
1975
  nk_size_t row_start, nk_size_t row_count);
1867
1976
 
@@ -1870,7 +1979,7 @@ NK_PUBLIC void nk_angulars_packed_e5m2_rvv(nk_e5m2_t const *a, void const *b_pac
1870
1979
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1871
1980
  nk_size_t r_stride_in_bytes);
1872
1981
  /** @copydoc nk_angulars_symmetric_e5m2 */
1873
- NK_PUBLIC void nk_angulars_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1982
+ NK_PUBLIC void nk_angulars_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1874
1983
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1875
1984
  nk_size_t row_start, nk_size_t row_count);
1876
1985
  /** @copydoc nk_euclideans_packed_e5m2 */
@@ -1878,7 +1987,7 @@ NK_PUBLIC void nk_euclideans_packed_e5m2_rvv(nk_e5m2_t const *a, void const *b_p
1878
1987
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1879
1988
  nk_size_t r_stride_in_bytes);
1880
1989
  /** @copydoc nk_euclideans_symmetric_e5m2 */
1881
- NK_PUBLIC void nk_euclideans_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1990
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2_rvv(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1882
1991
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1883
1992
  nk_size_t row_start, nk_size_t row_count);
1884
1993
 
@@ -1887,7 +1996,7 @@ NK_PUBLIC void nk_angulars_packed_e2m3_rvv(nk_e2m3_t const *a, void const *b_pac
1887
1996
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1888
1997
  nk_size_t r_stride_in_bytes);
1889
1998
  /** @copydoc nk_angulars_symmetric_e2m3 */
1890
- NK_PUBLIC void nk_angulars_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
1999
+ NK_PUBLIC void nk_angulars_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1891
2000
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1892
2001
  nk_size_t row_start, nk_size_t row_count);
1893
2002
  /** @copydoc nk_euclideans_packed_e2m3 */
@@ -1895,7 +2004,7 @@ NK_PUBLIC void nk_euclideans_packed_e2m3_rvv(nk_e2m3_t const *a, void const *b_p
1895
2004
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1896
2005
  nk_size_t r_stride_in_bytes);
1897
2006
  /** @copydoc nk_euclideans_symmetric_e2m3 */
1898
- NK_PUBLIC void nk_euclideans_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2007
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3_rvv(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1899
2008
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1900
2009
  nk_size_t row_start, nk_size_t row_count);
1901
2010
 
@@ -1904,7 +2013,7 @@ NK_PUBLIC void nk_angulars_packed_e3m2_rvv(nk_e3m2_t const *a, void const *b_pac
1904
2013
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1905
2014
  nk_size_t r_stride_in_bytes);
1906
2015
  /** @copydoc nk_angulars_symmetric_e3m2 */
1907
- NK_PUBLIC void nk_angulars_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2016
+ NK_PUBLIC void nk_angulars_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1908
2017
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1909
2018
  nk_size_t row_start, nk_size_t row_count);
1910
2019
  /** @copydoc nk_euclideans_packed_e3m2 */
@@ -1912,7 +2021,7 @@ NK_PUBLIC void nk_euclideans_packed_e3m2_rvv(nk_e3m2_t const *a, void const *b_p
1912
2021
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1913
2022
  nk_size_t r_stride_in_bytes);
1914
2023
  /** @copydoc nk_euclideans_symmetric_e3m2 */
1915
- NK_PUBLIC void nk_euclideans_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2024
+ NK_PUBLIC void nk_euclideans_symmetric_e3m2_rvv(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1916
2025
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1917
2026
  nk_size_t row_start, nk_size_t row_count);
1918
2027
 
@@ -1921,7 +2030,7 @@ NK_PUBLIC void nk_angulars_packed_i8_rvv(nk_i8_t const *a, void const *b_packed,
1921
2030
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1922
2031
  nk_size_t r_stride_in_bytes);
1923
2032
  /** @copydoc nk_angulars_symmetric_i8 */
1924
- NK_PUBLIC void nk_angulars_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2033
+ NK_PUBLIC void nk_angulars_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1925
2034
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1926
2035
  nk_size_t row_start, nk_size_t row_count);
1927
2036
  /** @copydoc nk_euclideans_packed_i8 */
@@ -1929,7 +2038,7 @@ NK_PUBLIC void nk_euclideans_packed_i8_rvv(nk_i8_t const *a, void const *b_packe
1929
2038
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1930
2039
  nk_size_t r_stride_in_bytes);
1931
2040
  /** @copydoc nk_euclideans_symmetric_i8 */
1932
- NK_PUBLIC void nk_euclideans_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2041
+ NK_PUBLIC void nk_euclideans_symmetric_i8_rvv(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1933
2042
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1934
2043
  nk_size_t row_start, nk_size_t row_count);
1935
2044
 
@@ -1938,7 +2047,7 @@ NK_PUBLIC void nk_angulars_packed_u8_rvv(nk_u8_t const *a, void const *b_packed,
1938
2047
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1939
2048
  nk_size_t r_stride_in_bytes);
1940
2049
  /** @copydoc nk_angulars_symmetric_u8 */
1941
- NK_PUBLIC void nk_angulars_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2050
+ NK_PUBLIC void nk_angulars_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1942
2051
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1943
2052
  nk_size_t row_start, nk_size_t row_count);
1944
2053
  /** @copydoc nk_euclideans_packed_u8 */
@@ -1946,7 +2055,7 @@ NK_PUBLIC void nk_euclideans_packed_u8_rvv(nk_u8_t const *a, void const *b_packe
1946
2055
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
1947
2056
  nk_size_t r_stride_in_bytes);
1948
2057
  /** @copydoc nk_euclideans_symmetric_u8 */
1949
- NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2058
+ NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
1950
2059
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
1951
2060
  nk_size_t row_start, nk_size_t row_count);
1952
2061
  #endif // NK_TARGET_RVV
@@ -1957,13 +2066,14 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
1957
2066
 
1958
2067
  #include "numkong/spatials/serial.h"
1959
2068
  #include "numkong/spatials/neon.h"
1960
- #include "numkong/spatials/neonhalf.h"
1961
2069
  #include "numkong/spatials/neonfhm.h"
2070
+ #include "numkong/spatials/neonfp8.h"
1962
2071
  #include "numkong/spatials/neonbfdot.h"
1963
2072
  #include "numkong/spatials/neonsdot.h"
1964
2073
  #include "numkong/spatials/haswell.h"
1965
2074
  #include "numkong/spatials/skylake.h"
1966
2075
  #include "numkong/spatials/genoa.h"
2076
+ #include "numkong/spatials/diamond.h"
1967
2077
  #include "numkong/spatials/icelake.h"
1968
2078
  #include "numkong/spatials/alder.h"
1969
2079
  #include "numkong/spatials/sierra.h"
@@ -1972,6 +2082,8 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
1972
2082
  #include "numkong/spatials/v128relaxed.h"
1973
2083
  #include "numkong/spatials/sme.h"
1974
2084
  #include "numkong/spatials/smef64.h"
2085
+ #include "numkong/spatials/powervsx.h"
2086
+ #include "numkong/spatials/loongsonasx.h"
1975
2087
 
1976
2088
  #if defined(__cplusplus)
1977
2089
  extern "C" {
@@ -1990,6 +2102,8 @@ NK_PUBLIC void nk_angulars_packed_f64(nk_f64_t const *a, void const *b_packed, n
1990
2102
  nk_angulars_packed_f64_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
1991
2103
  #elif NK_TARGET_HASWELL
1992
2104
  nk_angulars_packed_f64_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2105
+ #elif NK_TARGET_POWERVSX
2106
+ nk_angulars_packed_f64_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
1993
2107
  #elif NK_TARGET_RVV
1994
2108
  nk_angulars_packed_f64_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
1995
2109
  #elif NK_TARGET_V128RELAXED
@@ -1998,24 +2112,31 @@ NK_PUBLIC void nk_angulars_packed_f64(nk_f64_t const *a, void const *b_packed, n
1998
2112
  nk_angulars_packed_f64_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
1999
2113
  #endif
2000
2114
  }
2001
- NK_PUBLIC void nk_angulars_symmetric_f64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2115
+ NK_PUBLIC void nk_angulars_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2002
2116
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
2003
2117
  nk_size_t row_start, nk_size_t row_count) {
2004
2118
  #if NK_TARGET_SMEF64
2005
- nk_angulars_symmetric_f64_smef64(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2119
+ nk_angulars_symmetric_f64_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2120
+ row_count);
2006
2121
  #elif NK_TARGET_NEON
2007
- nk_angulars_symmetric_f64_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2122
+ nk_angulars_symmetric_f64_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2008
2123
  #elif NK_TARGET_SKYLAKE
2009
- nk_angulars_symmetric_f64_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2124
+ nk_angulars_symmetric_f64_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2125
+ row_count);
2010
2126
  #elif NK_TARGET_HASWELL
2011
- nk_angulars_symmetric_f64_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2127
+ nk_angulars_symmetric_f64_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2128
+ row_count);
2129
+ #elif NK_TARGET_POWERVSX
2130
+ nk_angulars_symmetric_f64_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2131
+ row_count);
2012
2132
  #elif NK_TARGET_RVV
2013
- nk_angulars_symmetric_f64_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2133
+ nk_angulars_symmetric_f64_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2014
2134
  #elif NK_TARGET_V128RELAXED
2015
- nk_angulars_symmetric_f64_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2135
+ nk_angulars_symmetric_f64_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2016
2136
  row_count);
2017
2137
  #else
2018
- nk_angulars_symmetric_f64_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2138
+ nk_angulars_symmetric_f64_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2139
+ row_count);
2019
2140
  #endif
2020
2141
  }
2021
2142
  NK_PUBLIC void nk_euclideans_packed_f64(nk_f64_t const *a, void const *b_packed, nk_f64_t *result, nk_size_t rows,
@@ -2029,6 +2150,8 @@ NK_PUBLIC void nk_euclideans_packed_f64(nk_f64_t const *a, void const *b_packed,
2029
2150
  nk_euclideans_packed_f64_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2030
2151
  #elif NK_TARGET_HASWELL
2031
2152
  nk_euclideans_packed_f64_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2153
+ #elif NK_TARGET_POWERVSX
2154
+ nk_euclideans_packed_f64_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2032
2155
  #elif NK_TARGET_RVV
2033
2156
  nk_euclideans_packed_f64_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2034
2157
  #elif NK_TARGET_V128RELAXED
@@ -2037,24 +2160,32 @@ NK_PUBLIC void nk_euclideans_packed_f64(nk_f64_t const *a, void const *b_packed,
2037
2160
  nk_euclideans_packed_f64_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2038
2161
  #endif
2039
2162
  }
2040
- NK_PUBLIC void nk_euclideans_symmetric_f64(nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2163
+ NK_PUBLIC void nk_euclideans_symmetric_f64(nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2041
2164
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
2042
2165
  nk_size_t row_start, nk_size_t row_count) {
2043
2166
  #if NK_TARGET_SMEF64
2044
- nk_euclideans_symmetric_f64_smef64(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2167
+ nk_euclideans_symmetric_f64_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2168
+ row_count);
2045
2169
  #elif NK_TARGET_NEON
2046
- nk_euclideans_symmetric_f64_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2170
+ nk_euclideans_symmetric_f64_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2171
+ row_count);
2047
2172
  #elif NK_TARGET_SKYLAKE
2048
- nk_euclideans_symmetric_f64_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2173
+ nk_euclideans_symmetric_f64_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2174
+ row_count);
2049
2175
  #elif NK_TARGET_HASWELL
2050
- nk_euclideans_symmetric_f64_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2176
+ nk_euclideans_symmetric_f64_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2177
+ row_count);
2178
+ #elif NK_TARGET_POWERVSX
2179
+ nk_euclideans_symmetric_f64_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2180
+ row_count);
2051
2181
  #elif NK_TARGET_RVV
2052
- nk_euclideans_symmetric_f64_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2182
+ nk_euclideans_symmetric_f64_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2053
2183
  #elif NK_TARGET_V128RELAXED
2054
- nk_euclideans_symmetric_f64_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2184
+ nk_euclideans_symmetric_f64_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2055
2185
  row_count);
2056
2186
  #else
2057
- nk_euclideans_symmetric_f64_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2187
+ nk_euclideans_symmetric_f64_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2188
+ row_count);
2058
2189
  #endif
2059
2190
  }
2060
2191
 
@@ -2069,6 +2200,8 @@ NK_PUBLIC void nk_angulars_packed_f32(nk_f32_t const *a, void const *b_packed, n
2069
2200
  nk_angulars_packed_f32_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2070
2201
  #elif NK_TARGET_HASWELL
2071
2202
  nk_angulars_packed_f32_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2203
+ #elif NK_TARGET_POWERVSX
2204
+ nk_angulars_packed_f32_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2072
2205
  #elif NK_TARGET_RVV
2073
2206
  nk_angulars_packed_f32_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2074
2207
  #elif NK_TARGET_V128RELAXED
@@ -2077,24 +2210,31 @@ NK_PUBLIC void nk_angulars_packed_f32(nk_f32_t const *a, void const *b_packed, n
2077
2210
  nk_angulars_packed_f32_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2078
2211
  #endif
2079
2212
  }
2080
- NK_PUBLIC void nk_angulars_symmetric_f32(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2213
+ NK_PUBLIC void nk_angulars_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2081
2214
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
2082
2215
  nk_size_t row_start, nk_size_t row_count) {
2083
2216
  #if NK_TARGET_SMEF64
2084
- nk_angulars_symmetric_f32_smef64(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2217
+ nk_angulars_symmetric_f32_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2218
+ row_count);
2085
2219
  #elif NK_TARGET_NEON
2086
- nk_angulars_symmetric_f32_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2220
+ nk_angulars_symmetric_f32_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2087
2221
  #elif NK_TARGET_SKYLAKE
2088
- nk_angulars_symmetric_f32_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2222
+ nk_angulars_symmetric_f32_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2223
+ row_count);
2089
2224
  #elif NK_TARGET_HASWELL
2090
- nk_angulars_symmetric_f32_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2225
+ nk_angulars_symmetric_f32_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2226
+ row_count);
2227
+ #elif NK_TARGET_POWERVSX
2228
+ nk_angulars_symmetric_f32_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2229
+ row_count);
2091
2230
  #elif NK_TARGET_RVV
2092
- nk_angulars_symmetric_f32_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2231
+ nk_angulars_symmetric_f32_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2093
2232
  #elif NK_TARGET_V128RELAXED
2094
- nk_angulars_symmetric_f32_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2233
+ nk_angulars_symmetric_f32_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2095
2234
  row_count);
2096
2235
  #else
2097
- nk_angulars_symmetric_f32_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2236
+ nk_angulars_symmetric_f32_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2237
+ row_count);
2098
2238
  #endif
2099
2239
  }
2100
2240
  NK_PUBLIC void nk_euclideans_packed_f32(nk_f32_t const *a, void const *b_packed, nk_f64_t *result, nk_size_t rows,
@@ -2108,6 +2248,8 @@ NK_PUBLIC void nk_euclideans_packed_f32(nk_f32_t const *a, void const *b_packed,
2108
2248
  nk_euclideans_packed_f32_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2109
2249
  #elif NK_TARGET_HASWELL
2110
2250
  nk_euclideans_packed_f32_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2251
+ #elif NK_TARGET_POWERVSX
2252
+ nk_euclideans_packed_f32_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2111
2253
  #elif NK_TARGET_RVV
2112
2254
  nk_euclideans_packed_f32_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2113
2255
  #elif NK_TARGET_V128RELAXED
@@ -2116,24 +2258,32 @@ NK_PUBLIC void nk_euclideans_packed_f32(nk_f32_t const *a, void const *b_packed,
2116
2258
  nk_euclideans_packed_f32_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2117
2259
  #endif
2118
2260
  }
2119
- NK_PUBLIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2261
+ NK_PUBLIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2120
2262
  nk_size_t stride, nk_f64_t *result, nk_size_t result_stride,
2121
2263
  nk_size_t row_start, nk_size_t row_count) {
2122
2264
  #if NK_TARGET_SMEF64
2123
- nk_euclideans_symmetric_f32_smef64(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2265
+ nk_euclideans_symmetric_f32_smef64(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2266
+ row_count);
2124
2267
  #elif NK_TARGET_NEON
2125
- nk_euclideans_symmetric_f32_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2268
+ nk_euclideans_symmetric_f32_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2269
+ row_count);
2126
2270
  #elif NK_TARGET_SKYLAKE
2127
- nk_euclideans_symmetric_f32_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2271
+ nk_euclideans_symmetric_f32_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2272
+ row_count);
2128
2273
  #elif NK_TARGET_HASWELL
2129
- nk_euclideans_symmetric_f32_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2274
+ nk_euclideans_symmetric_f32_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2275
+ row_count);
2276
+ #elif NK_TARGET_POWERVSX
2277
+ nk_euclideans_symmetric_f32_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2278
+ row_count);
2130
2279
  #elif NK_TARGET_RVV
2131
- nk_euclideans_symmetric_f32_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2280
+ nk_euclideans_symmetric_f32_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2132
2281
  #elif NK_TARGET_V128RELAXED
2133
- nk_euclideans_symmetric_f32_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2282
+ nk_euclideans_symmetric_f32_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2134
2283
  row_count);
2135
2284
  #else
2136
- nk_euclideans_symmetric_f32_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2285
+ nk_euclideans_symmetric_f32_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2286
+ row_count);
2137
2287
  #endif
2138
2288
  }
2139
2289
 
@@ -2144,39 +2294,44 @@ NK_PUBLIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed, n
2144
2294
  nk_angulars_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2145
2295
  #elif NK_TARGET_NEONFHM
2146
2296
  nk_angulars_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2147
- #elif NK_TARGET_NEONHALF
2148
- nk_angulars_packed_f16_neonhalf(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2149
2297
  #elif NK_TARGET_NEON
2150
2298
  nk_angulars_packed_f16_neon(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2151
2299
  #elif NK_TARGET_SKYLAKE
2152
2300
  nk_angulars_packed_f16_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2153
2301
  #elif NK_TARGET_HASWELL
2154
2302
  nk_angulars_packed_f16_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2303
+ #elif NK_TARGET_POWERVSX
2304
+ nk_angulars_packed_f16_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2155
2305
  #elif NK_TARGET_RVV
2156
2306
  nk_angulars_packed_f16_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2157
2307
  #else
2158
2308
  nk_angulars_packed_f16_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2159
2309
  #endif
2160
2310
  }
2161
- NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2311
+ NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2162
2312
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2163
2313
  nk_size_t row_start, nk_size_t row_count) {
2164
2314
  #if NK_TARGET_SME
2165
- nk_angulars_symmetric_f16_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2315
+ nk_angulars_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2166
2316
  #elif NK_TARGET_NEONFHM
2167
- nk_angulars_symmetric_f16_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2168
- #elif NK_TARGET_NEONHALF
2169
- nk_angulars_symmetric_f16_neonhalf(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2317
+ nk_angulars_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2318
+ row_count);
2170
2319
  #elif NK_TARGET_NEON
2171
- nk_angulars_symmetric_f16_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2320
+ nk_angulars_symmetric_f16_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2172
2321
  #elif NK_TARGET_SKYLAKE
2173
- nk_angulars_symmetric_f16_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2322
+ nk_angulars_symmetric_f16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2323
+ row_count);
2174
2324
  #elif NK_TARGET_HASWELL
2175
- nk_angulars_symmetric_f16_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2325
+ nk_angulars_symmetric_f16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2326
+ row_count);
2327
+ #elif NK_TARGET_POWERVSX
2328
+ nk_angulars_symmetric_f16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2329
+ row_count);
2176
2330
  #elif NK_TARGET_RVV
2177
- nk_angulars_symmetric_f16_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2331
+ nk_angulars_symmetric_f16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2178
2332
  #else
2179
- nk_angulars_symmetric_f16_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2333
+ nk_angulars_symmetric_f16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2334
+ row_count);
2180
2335
  #endif
2181
2336
  }
2182
2337
  NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2186,40 +2341,45 @@ NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed,
2186
2341
  nk_euclideans_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2187
2342
  #elif NK_TARGET_NEONFHM
2188
2343
  nk_euclideans_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2189
- #elif NK_TARGET_NEONHALF
2190
- nk_euclideans_packed_f16_neonhalf(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2191
2344
  #elif NK_TARGET_NEON
2192
2345
  nk_euclideans_packed_f16_neon(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2193
2346
  #elif NK_TARGET_SKYLAKE
2194
2347
  nk_euclideans_packed_f16_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2195
2348
  #elif NK_TARGET_HASWELL
2196
2349
  nk_euclideans_packed_f16_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2350
+ #elif NK_TARGET_POWERVSX
2351
+ nk_euclideans_packed_f16_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2197
2352
  #elif NK_TARGET_RVV
2198
2353
  nk_euclideans_packed_f16_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2199
2354
  #else
2200
2355
  nk_euclideans_packed_f16_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2201
2356
  #endif
2202
2357
  }
2203
- NK_PUBLIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2358
+ NK_PUBLIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2204
2359
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2205
2360
  nk_size_t row_start, nk_size_t row_count) {
2206
2361
  #if NK_TARGET_SME
2207
- nk_euclideans_symmetric_f16_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2362
+ nk_euclideans_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2208
2363
  #elif NK_TARGET_NEONFHM
2209
- nk_euclideans_symmetric_f16_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2210
- #elif NK_TARGET_NEONHALF
2211
- nk_euclideans_symmetric_f16_neonhalf(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2212
- row_count);
2364
+ nk_euclideans_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2365
+ row_count);
2213
2366
  #elif NK_TARGET_NEON
2214
- nk_euclideans_symmetric_f16_neon(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2367
+ nk_euclideans_symmetric_f16_neon(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2368
+ row_count);
2215
2369
  #elif NK_TARGET_SKYLAKE
2216
- nk_euclideans_symmetric_f16_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2370
+ nk_euclideans_symmetric_f16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2371
+ row_count);
2217
2372
  #elif NK_TARGET_HASWELL
2218
- nk_euclideans_symmetric_f16_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2373
+ nk_euclideans_symmetric_f16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2374
+ row_count);
2375
+ #elif NK_TARGET_POWERVSX
2376
+ nk_euclideans_symmetric_f16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2377
+ row_count);
2219
2378
  #elif NK_TARGET_RVV
2220
- nk_euclideans_symmetric_f16_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2379
+ nk_euclideans_symmetric_f16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2221
2380
  #else
2222
- nk_euclideans_symmetric_f16_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2381
+ nk_euclideans_symmetric_f16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2382
+ row_count);
2223
2383
  #endif
2224
2384
  }
2225
2385
 
@@ -2238,6 +2398,8 @@ NK_PUBLIC void nk_angulars_packed_bf16(nk_bf16_t const *a, void const *b_packed,
2238
2398
  nk_angulars_packed_bf16_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2239
2399
  #elif NK_TARGET_HASWELL
2240
2400
  nk_angulars_packed_bf16_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2401
+ #elif NK_TARGET_POWERVSX
2402
+ nk_angulars_packed_bf16_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2241
2403
  #elif NK_TARGET_RVV
2242
2404
  nk_angulars_packed_bf16_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2243
2405
  #elif NK_TARGET_V128RELAXED
@@ -2246,30 +2408,37 @@ NK_PUBLIC void nk_angulars_packed_bf16(nk_bf16_t const *a, void const *b_packed,
2246
2408
  nk_angulars_packed_bf16_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2247
2409
  #endif
2248
2410
  }
2249
- NK_PUBLIC void nk_angulars_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2411
+ NK_PUBLIC void nk_angulars_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2250
2412
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2251
2413
  nk_size_t row_start, nk_size_t row_count) {
2252
2414
  #if NK_TARGET_SME
2253
- nk_angulars_symmetric_bf16_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2415
+ nk_angulars_symmetric_bf16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2254
2416
  #elif NK_TARGET_NEONBFDOT
2255
- nk_angulars_symmetric_bf16_neonbfdot(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2417
+ nk_angulars_symmetric_bf16_neonbfdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2256
2418
  row_count);
2257
2419
  #elif NK_TARGET_SAPPHIREAMX
2258
- nk_angulars_symmetric_bf16_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2420
+ nk_angulars_symmetric_bf16_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2259
2421
  row_count);
2260
2422
  #elif NK_TARGET_GENOA
2261
- nk_angulars_symmetric_bf16_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2423
+ nk_angulars_symmetric_bf16_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2424
+ row_count);
2262
2425
  #elif NK_TARGET_SKYLAKE
2263
- nk_angulars_symmetric_bf16_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2426
+ nk_angulars_symmetric_bf16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2427
+ row_count);
2264
2428
  #elif NK_TARGET_HASWELL
2265
- nk_angulars_symmetric_bf16_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2429
+ nk_angulars_symmetric_bf16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2430
+ row_count);
2431
+ #elif NK_TARGET_POWERVSX
2432
+ nk_angulars_symmetric_bf16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2433
+ row_count);
2266
2434
  #elif NK_TARGET_RVV
2267
- nk_angulars_symmetric_bf16_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2435
+ nk_angulars_symmetric_bf16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2268
2436
  #elif NK_TARGET_V128RELAXED
2269
- nk_angulars_symmetric_bf16_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2437
+ nk_angulars_symmetric_bf16_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2270
2438
  row_count);
2271
2439
  #else
2272
- nk_angulars_symmetric_bf16_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2440
+ nk_angulars_symmetric_bf16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2441
+ row_count);
2273
2442
  #endif
2274
2443
  }
2275
2444
  NK_PUBLIC void nk_euclideans_packed_bf16(nk_bf16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2287,6 +2456,8 @@ NK_PUBLIC void nk_euclideans_packed_bf16(nk_bf16_t const *a, void const *b_packe
2287
2456
  nk_euclideans_packed_bf16_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2288
2457
  #elif NK_TARGET_HASWELL
2289
2458
  nk_euclideans_packed_bf16_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2459
+ #elif NK_TARGET_POWERVSX
2460
+ nk_euclideans_packed_bf16_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2290
2461
  #elif NK_TARGET_RVV
2291
2462
  nk_euclideans_packed_bf16_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2292
2463
  #elif NK_TARGET_V128RELAXED
@@ -2295,32 +2466,39 @@ NK_PUBLIC void nk_euclideans_packed_bf16(nk_bf16_t const *a, void const *b_packe
2295
2466
  nk_euclideans_packed_bf16_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2296
2467
  #endif
2297
2468
  }
2298
- NK_PUBLIC void nk_euclideans_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2469
+ NK_PUBLIC void nk_euclideans_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2299
2470
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2300
2471
  nk_size_t row_start, nk_size_t row_count) {
2301
2472
  #if NK_TARGET_SME
2302
- nk_euclideans_symmetric_bf16_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2473
+ nk_euclideans_symmetric_bf16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2474
+ row_count);
2303
2475
  #elif NK_TARGET_NEONBFDOT
2304
- nk_euclideans_symmetric_bf16_neonbfdot(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2476
+ nk_euclideans_symmetric_bf16_neonbfdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2305
2477
  row_count);
2306
2478
  #elif NK_TARGET_SAPPHIREAMX
2307
- nk_euclideans_symmetric_bf16_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2479
+ nk_euclideans_symmetric_bf16_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2308
2480
  row_count);
2309
2481
  #elif NK_TARGET_GENOA
2310
- nk_euclideans_symmetric_bf16_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2482
+ nk_euclideans_symmetric_bf16_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2483
+ row_count);
2311
2484
  #elif NK_TARGET_SKYLAKE
2312
- nk_euclideans_symmetric_bf16_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2485
+ nk_euclideans_symmetric_bf16_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2313
2486
  row_count);
2314
2487
  #elif NK_TARGET_HASWELL
2315
- nk_euclideans_symmetric_bf16_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2488
+ nk_euclideans_symmetric_bf16_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2316
2489
  row_count);
2490
+ #elif NK_TARGET_POWERVSX
2491
+ nk_euclideans_symmetric_bf16_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2492
+ row_count);
2317
2493
  #elif NK_TARGET_RVV
2318
- nk_euclideans_symmetric_bf16_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2494
+ nk_euclideans_symmetric_bf16_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2495
+ row_count);
2319
2496
  #elif NK_TARGET_V128RELAXED
2320
- nk_euclideans_symmetric_bf16_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2497
+ nk_euclideans_symmetric_bf16_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2321
2498
  row_count);
2322
2499
  #else
2323
- nk_euclideans_symmetric_bf16_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2500
+ nk_euclideans_symmetric_bf16_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2501
+ row_count);
2324
2502
  #endif
2325
2503
  }
2326
2504
 
@@ -2329,10 +2507,14 @@ NK_PUBLIC void nk_angulars_packed_e4m3(nk_e4m3_t const *a, void const *b_packed,
2329
2507
  nk_size_t r_stride_in_bytes) {
2330
2508
  #if NK_TARGET_SME
2331
2509
  nk_angulars_packed_e4m3_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2510
+ #elif NK_TARGET_NEONFP8
2511
+ nk_angulars_packed_e4m3_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2332
2512
  #elif NK_TARGET_NEONFHM
2333
2513
  nk_angulars_packed_e4m3_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2334
2514
  #elif NK_TARGET_SAPPHIREAMX
2335
2515
  nk_angulars_packed_e4m3_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2516
+ #elif NK_TARGET_DIAMOND
2517
+ nk_angulars_packed_e4m3_diamond(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2336
2518
  #elif NK_TARGET_GENOA
2337
2519
  nk_angulars_packed_e4m3_genoa(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2338
2520
  #elif NK_TARGET_SKYLAKE
@@ -2347,29 +2529,40 @@ NK_PUBLIC void nk_angulars_packed_e4m3(nk_e4m3_t const *a, void const *b_packed,
2347
2529
  nk_angulars_packed_e4m3_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2348
2530
  #endif
2349
2531
  }
2350
- NK_PUBLIC void nk_angulars_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2532
+ NK_PUBLIC void nk_angulars_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2351
2533
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2352
2534
  nk_size_t row_start, nk_size_t row_count) {
2353
2535
  #if NK_TARGET_SME
2354
- nk_angulars_symmetric_e4m3_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2536
+ nk_angulars_symmetric_e4m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2537
+ #elif NK_TARGET_NEONFP8
2538
+ nk_angulars_symmetric_e4m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2539
+ row_count);
2355
2540
  #elif NK_TARGET_NEONFHM
2356
- nk_angulars_symmetric_e4m3_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2541
+ nk_angulars_symmetric_e4m3_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2542
+ row_count);
2357
2543
  #elif NK_TARGET_SAPPHIREAMX
2358
- nk_angulars_symmetric_e4m3_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2544
+ nk_angulars_symmetric_e4m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2359
2545
  row_count);
2546
+ #elif NK_TARGET_DIAMOND
2547
+ nk_angulars_symmetric_e4m3_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2548
+ row_count);
2360
2549
  #elif NK_TARGET_GENOA
2361
- nk_angulars_symmetric_e4m3_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2550
+ nk_angulars_symmetric_e4m3_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2551
+ row_count);
2362
2552
  #elif NK_TARGET_SKYLAKE
2363
- nk_angulars_symmetric_e4m3_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2553
+ nk_angulars_symmetric_e4m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2554
+ row_count);
2364
2555
  #elif NK_TARGET_HASWELL
2365
- nk_angulars_symmetric_e4m3_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2556
+ nk_angulars_symmetric_e4m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2557
+ row_count);
2366
2558
  #elif NK_TARGET_RVV
2367
- nk_angulars_symmetric_e4m3_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2559
+ nk_angulars_symmetric_e4m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2368
2560
  #elif NK_TARGET_V128RELAXED
2369
- nk_angulars_symmetric_e4m3_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2561
+ nk_angulars_symmetric_e4m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2370
2562
  row_count);
2371
2563
  #else
2372
- nk_angulars_symmetric_e4m3_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2564
+ nk_angulars_symmetric_e4m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2565
+ row_count);
2373
2566
  #endif
2374
2567
  }
2375
2568
  NK_PUBLIC void nk_euclideans_packed_e4m3(nk_e4m3_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2377,10 +2570,14 @@ NK_PUBLIC void nk_euclideans_packed_e4m3(nk_e4m3_t const *a, void const *b_packe
2377
2570
  nk_size_t r_stride_in_bytes) {
2378
2571
  #if NK_TARGET_SME
2379
2572
  nk_euclideans_packed_e4m3_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2573
+ #elif NK_TARGET_NEONFP8
2574
+ nk_euclideans_packed_e4m3_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2380
2575
  #elif NK_TARGET_NEONFHM
2381
2576
  nk_euclideans_packed_e4m3_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2382
2577
  #elif NK_TARGET_SAPPHIREAMX
2383
2578
  nk_euclideans_packed_e4m3_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2579
+ #elif NK_TARGET_DIAMOND
2580
+ nk_euclideans_packed_e4m3_diamond(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2384
2581
  #elif NK_TARGET_GENOA
2385
2582
  nk_euclideans_packed_e4m3_genoa(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2386
2583
  #elif NK_TARGET_SKYLAKE
@@ -2395,32 +2592,42 @@ NK_PUBLIC void nk_euclideans_packed_e4m3(nk_e4m3_t const *a, void const *b_packe
2395
2592
  nk_euclideans_packed_e4m3_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2396
2593
  #endif
2397
2594
  }
2398
- NK_PUBLIC void nk_euclideans_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2595
+ NK_PUBLIC void nk_euclideans_symmetric_e4m3(nk_e4m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2399
2596
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2400
2597
  nk_size_t row_start, nk_size_t row_count) {
2401
2598
  #if NK_TARGET_SME
2402
- nk_euclideans_symmetric_e4m3_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2599
+ nk_euclideans_symmetric_e4m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2600
+ row_count);
2601
+ #elif NK_TARGET_NEONFP8
2602
+ nk_euclideans_symmetric_e4m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2603
+ row_count);
2403
2604
  #elif NK_TARGET_NEONFHM
2404
- nk_euclideans_symmetric_e4m3_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2605
+ nk_euclideans_symmetric_e4m3_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2405
2606
  row_count);
2406
2607
  #elif NK_TARGET_SAPPHIREAMX
2407
- nk_euclideans_symmetric_e4m3_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2608
+ nk_euclideans_symmetric_e4m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2408
2609
  row_count);
2610
+ #elif NK_TARGET_DIAMOND
2611
+ nk_euclideans_symmetric_e4m3_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2612
+ row_count);
2409
2613
  #elif NK_TARGET_GENOA
2410
- nk_euclideans_symmetric_e4m3_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2614
+ nk_euclideans_symmetric_e4m3_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2615
+ row_count);
2411
2616
  #elif NK_TARGET_SKYLAKE
2412
- nk_euclideans_symmetric_e4m3_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2617
+ nk_euclideans_symmetric_e4m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2413
2618
  row_count);
2414
2619
  #elif NK_TARGET_HASWELL
2415
- nk_euclideans_symmetric_e4m3_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2620
+ nk_euclideans_symmetric_e4m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2416
2621
  row_count);
2417
2622
  #elif NK_TARGET_RVV
2418
- nk_euclideans_symmetric_e4m3_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2623
+ nk_euclideans_symmetric_e4m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2624
+ row_count);
2419
2625
  #elif NK_TARGET_V128RELAXED
2420
- nk_euclideans_symmetric_e4m3_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2626
+ nk_euclideans_symmetric_e4m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2421
2627
  row_count);
2422
2628
  #else
2423
- nk_euclideans_symmetric_e4m3_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2629
+ nk_euclideans_symmetric_e4m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2630
+ row_count);
2424
2631
  #endif
2425
2632
  }
2426
2633
 
@@ -2429,10 +2636,14 @@ NK_PUBLIC void nk_angulars_packed_e5m2(nk_e5m2_t const *a, void const *b_packed,
2429
2636
  nk_size_t r_stride_in_bytes) {
2430
2637
  #if NK_TARGET_SME
2431
2638
  nk_angulars_packed_e5m2_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2639
+ #elif NK_TARGET_NEONFP8
2640
+ nk_angulars_packed_e5m2_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2432
2641
  #elif NK_TARGET_NEONFHM
2433
2642
  nk_angulars_packed_e5m2_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2434
2643
  #elif NK_TARGET_SAPPHIREAMX
2435
2644
  nk_angulars_packed_e5m2_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2645
+ #elif NK_TARGET_DIAMOND
2646
+ nk_angulars_packed_e5m2_diamond(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2436
2647
  #elif NK_TARGET_GENOA
2437
2648
  nk_angulars_packed_e5m2_genoa(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2438
2649
  #elif NK_TARGET_SKYLAKE
@@ -2447,29 +2658,40 @@ NK_PUBLIC void nk_angulars_packed_e5m2(nk_e5m2_t const *a, void const *b_packed,
2447
2658
  nk_angulars_packed_e5m2_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2448
2659
  #endif
2449
2660
  }
2450
- NK_PUBLIC void nk_angulars_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2661
+ NK_PUBLIC void nk_angulars_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2451
2662
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2452
2663
  nk_size_t row_start, nk_size_t row_count) {
2453
2664
  #if NK_TARGET_SME
2454
- nk_angulars_symmetric_e5m2_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2665
+ nk_angulars_symmetric_e5m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2666
+ #elif NK_TARGET_NEONFP8
2667
+ nk_angulars_symmetric_e5m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2668
+ row_count);
2455
2669
  #elif NK_TARGET_NEONFHM
2456
- nk_angulars_symmetric_e5m2_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2670
+ nk_angulars_symmetric_e5m2_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2671
+ row_count);
2457
2672
  #elif NK_TARGET_SAPPHIREAMX
2458
- nk_angulars_symmetric_e5m2_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2673
+ nk_angulars_symmetric_e5m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2459
2674
  row_count);
2675
+ #elif NK_TARGET_DIAMOND
2676
+ nk_angulars_symmetric_e5m2_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2677
+ row_count);
2460
2678
  #elif NK_TARGET_GENOA
2461
- nk_angulars_symmetric_e5m2_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2679
+ nk_angulars_symmetric_e5m2_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2680
+ row_count);
2462
2681
  #elif NK_TARGET_SKYLAKE
2463
- nk_angulars_symmetric_e5m2_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2682
+ nk_angulars_symmetric_e5m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2683
+ row_count);
2464
2684
  #elif NK_TARGET_HASWELL
2465
- nk_angulars_symmetric_e5m2_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2685
+ nk_angulars_symmetric_e5m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2686
+ row_count);
2466
2687
  #elif NK_TARGET_RVV
2467
- nk_angulars_symmetric_e5m2_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2688
+ nk_angulars_symmetric_e5m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2468
2689
  #elif NK_TARGET_V128RELAXED
2469
- nk_angulars_symmetric_e5m2_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2690
+ nk_angulars_symmetric_e5m2_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2470
2691
  row_count);
2471
2692
  #else
2472
- nk_angulars_symmetric_e5m2_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2693
+ nk_angulars_symmetric_e5m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2694
+ row_count);
2473
2695
  #endif
2474
2696
  }
2475
2697
  NK_PUBLIC void nk_euclideans_packed_e5m2(nk_e5m2_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2477,10 +2699,14 @@ NK_PUBLIC void nk_euclideans_packed_e5m2(nk_e5m2_t const *a, void const *b_packe
2477
2699
  nk_size_t r_stride_in_bytes) {
2478
2700
  #if NK_TARGET_SME
2479
2701
  nk_euclideans_packed_e5m2_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2702
+ #elif NK_TARGET_NEONFP8
2703
+ nk_euclideans_packed_e5m2_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2480
2704
  #elif NK_TARGET_NEONFHM
2481
2705
  nk_euclideans_packed_e5m2_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2482
2706
  #elif NK_TARGET_SAPPHIREAMX
2483
2707
  nk_euclideans_packed_e5m2_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2708
+ #elif NK_TARGET_DIAMOND
2709
+ nk_euclideans_packed_e5m2_diamond(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2484
2710
  #elif NK_TARGET_GENOA
2485
2711
  nk_euclideans_packed_e5m2_genoa(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2486
2712
  #elif NK_TARGET_SKYLAKE
@@ -2495,32 +2721,42 @@ NK_PUBLIC void nk_euclideans_packed_e5m2(nk_e5m2_t const *a, void const *b_packe
2495
2721
  nk_euclideans_packed_e5m2_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2496
2722
  #endif
2497
2723
  }
2498
- NK_PUBLIC void nk_euclideans_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2724
+ NK_PUBLIC void nk_euclideans_symmetric_e5m2(nk_e5m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2499
2725
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2500
2726
  nk_size_t row_start, nk_size_t row_count) {
2501
2727
  #if NK_TARGET_SME
2502
- nk_euclideans_symmetric_e5m2_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2728
+ nk_euclideans_symmetric_e5m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2729
+ row_count);
2730
+ #elif NK_TARGET_NEONFP8
2731
+ nk_euclideans_symmetric_e5m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2732
+ row_count);
2503
2733
  #elif NK_TARGET_NEONFHM
2504
- nk_euclideans_symmetric_e5m2_neonfhm(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2734
+ nk_euclideans_symmetric_e5m2_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2505
2735
  row_count);
2506
2736
  #elif NK_TARGET_SAPPHIREAMX
2507
- nk_euclideans_symmetric_e5m2_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2737
+ nk_euclideans_symmetric_e5m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2508
2738
  row_count);
2739
+ #elif NK_TARGET_DIAMOND
2740
+ nk_euclideans_symmetric_e5m2_diamond(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2741
+ row_count);
2509
2742
  #elif NK_TARGET_GENOA
2510
- nk_euclideans_symmetric_e5m2_genoa(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2743
+ nk_euclideans_symmetric_e5m2_genoa(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2744
+ row_count);
2511
2745
  #elif NK_TARGET_SKYLAKE
2512
- nk_euclideans_symmetric_e5m2_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2746
+ nk_euclideans_symmetric_e5m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2513
2747
  row_count);
2514
2748
  #elif NK_TARGET_HASWELL
2515
- nk_euclideans_symmetric_e5m2_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2749
+ nk_euclideans_symmetric_e5m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2516
2750
  row_count);
2517
2751
  #elif NK_TARGET_RVV
2518
- nk_euclideans_symmetric_e5m2_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2752
+ nk_euclideans_symmetric_e5m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2753
+ row_count);
2519
2754
  #elif NK_TARGET_V128RELAXED
2520
- nk_euclideans_symmetric_e5m2_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2755
+ nk_euclideans_symmetric_e5m2_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2521
2756
  row_count);
2522
2757
  #else
2523
- nk_euclideans_symmetric_e5m2_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2758
+ nk_euclideans_symmetric_e5m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2759
+ row_count);
2524
2760
  #endif
2525
2761
  }
2526
2762
 
@@ -2531,6 +2767,8 @@ NK_PUBLIC void nk_angulars_packed_e2m3(nk_e2m3_t const *a, void const *b_packed,
2531
2767
  nk_angulars_packed_e2m3_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2532
2768
  #elif NK_TARGET_SAPPHIREAMX
2533
2769
  nk_angulars_packed_e2m3_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2770
+ #elif NK_TARGET_NEONFP8
2771
+ nk_angulars_packed_e2m3_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2534
2772
  #elif NK_TARGET_SKYLAKE
2535
2773
  nk_angulars_packed_e2m3_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2536
2774
  #elif NK_TARGET_SIERRA
@@ -2547,29 +2785,37 @@ NK_PUBLIC void nk_angulars_packed_e2m3(nk_e2m3_t const *a, void const *b_packed,
2547
2785
  nk_angulars_packed_e2m3_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2548
2786
  #endif
2549
2787
  }
2550
- NK_PUBLIC void nk_angulars_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2788
+ NK_PUBLIC void nk_angulars_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2551
2789
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2552
2790
  nk_size_t row_start, nk_size_t row_count) {
2553
2791
  #if NK_TARGET_SME
2554
- nk_angulars_symmetric_e2m3_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2792
+ nk_angulars_symmetric_e2m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2555
2793
  #elif NK_TARGET_SAPPHIREAMX
2556
- nk_angulars_symmetric_e2m3_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2794
+ nk_angulars_symmetric_e2m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2557
2795
  row_count);
2796
+ #elif NK_TARGET_NEONFP8
2797
+ nk_angulars_symmetric_e2m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2798
+ row_count);
2558
2799
  #elif NK_TARGET_SKYLAKE
2559
- nk_angulars_symmetric_e2m3_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2800
+ nk_angulars_symmetric_e2m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2801
+ row_count);
2560
2802
  #elif NK_TARGET_SIERRA
2561
- nk_angulars_symmetric_e2m3_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2803
+ nk_angulars_symmetric_e2m3_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2804
+ row_count);
2562
2805
  #elif NK_TARGET_ALDER
2563
- nk_angulars_symmetric_e2m3_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2806
+ nk_angulars_symmetric_e2m3_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2807
+ row_count);
2564
2808
  #elif NK_TARGET_HASWELL
2565
- nk_angulars_symmetric_e2m3_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2809
+ nk_angulars_symmetric_e2m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2810
+ row_count);
2566
2811
  #elif NK_TARGET_RVV
2567
- nk_angulars_symmetric_e2m3_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2812
+ nk_angulars_symmetric_e2m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2568
2813
  #elif NK_TARGET_V128RELAXED
2569
- nk_angulars_symmetric_e2m3_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2814
+ nk_angulars_symmetric_e2m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2570
2815
  row_count);
2571
2816
  #else
2572
- nk_angulars_symmetric_e2m3_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2817
+ nk_angulars_symmetric_e2m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2818
+ row_count);
2573
2819
  #endif
2574
2820
  }
2575
2821
  NK_PUBLIC void nk_euclideans_packed_e2m3(nk_e2m3_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2579,6 +2825,8 @@ NK_PUBLIC void nk_euclideans_packed_e2m3(nk_e2m3_t const *a, void const *b_packe
2579
2825
  nk_euclideans_packed_e2m3_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2580
2826
  #elif NK_TARGET_SAPPHIREAMX
2581
2827
  nk_euclideans_packed_e2m3_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2828
+ #elif NK_TARGET_NEONFP8
2829
+ nk_euclideans_packed_e2m3_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2582
2830
  #elif NK_TARGET_SKYLAKE
2583
2831
  nk_euclideans_packed_e2m3_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2584
2832
  #elif NK_TARGET_SIERRA
@@ -2595,31 +2843,39 @@ NK_PUBLIC void nk_euclideans_packed_e2m3(nk_e2m3_t const *a, void const *b_packe
2595
2843
  nk_euclideans_packed_e2m3_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2596
2844
  #endif
2597
2845
  }
2598
- NK_PUBLIC void nk_euclideans_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2846
+ NK_PUBLIC void nk_euclideans_symmetric_e2m3(nk_e2m3_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2599
2847
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2600
2848
  nk_size_t row_start, nk_size_t row_count) {
2601
2849
  #if NK_TARGET_SME
2602
- nk_euclideans_symmetric_e2m3_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2850
+ nk_euclideans_symmetric_e2m3_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2851
+ row_count);
2603
2852
  #elif NK_TARGET_SAPPHIREAMX
2604
- nk_euclideans_symmetric_e2m3_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2853
+ nk_euclideans_symmetric_e2m3_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2605
2854
  row_count);
2855
+ #elif NK_TARGET_NEONFP8
2856
+ nk_euclideans_symmetric_e2m3_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2857
+ row_count);
2606
2858
  #elif NK_TARGET_SKYLAKE
2607
- nk_euclideans_symmetric_e2m3_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2859
+ nk_euclideans_symmetric_e2m3_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2608
2860
  row_count);
2609
2861
  #elif NK_TARGET_SIERRA
2610
- nk_euclideans_symmetric_e2m3_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2862
+ nk_euclideans_symmetric_e2m3_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2863
+ row_count);
2611
2864
  #elif NK_TARGET_ALDER
2612
- nk_euclideans_symmetric_e2m3_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2865
+ nk_euclideans_symmetric_e2m3_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2866
+ row_count);
2613
2867
  #elif NK_TARGET_HASWELL
2614
- nk_euclideans_symmetric_e2m3_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2868
+ nk_euclideans_symmetric_e2m3_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2615
2869
  row_count);
2616
2870
  #elif NK_TARGET_RVV
2617
- nk_euclideans_symmetric_e2m3_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2871
+ nk_euclideans_symmetric_e2m3_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2872
+ row_count);
2618
2873
  #elif NK_TARGET_V128RELAXED
2619
- nk_euclideans_symmetric_e2m3_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2874
+ nk_euclideans_symmetric_e2m3_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2620
2875
  row_count);
2621
2876
  #else
2622
- nk_euclideans_symmetric_e2m3_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2877
+ nk_euclideans_symmetric_e2m3_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2878
+ row_count);
2623
2879
  #endif
2624
2880
  }
2625
2881
 
@@ -2630,6 +2886,8 @@ NK_PUBLIC void nk_angulars_packed_e3m2(nk_e3m2_t const *a, void const *b_packed,
2630
2886
  nk_angulars_packed_e3m2_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2631
2887
  #elif NK_TARGET_SAPPHIREAMX
2632
2888
  nk_angulars_packed_e3m2_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2889
+ #elif NK_TARGET_NEONFP8
2890
+ nk_angulars_packed_e3m2_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2633
2891
  #elif NK_TARGET_SKYLAKE
2634
2892
  nk_angulars_packed_e3m2_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2635
2893
  #elif NK_TARGET_HASWELL
@@ -2640,22 +2898,28 @@ NK_PUBLIC void nk_angulars_packed_e3m2(nk_e3m2_t const *a, void const *b_packed,
2640
2898
  nk_angulars_packed_e3m2_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2641
2899
  #endif
2642
2900
  }
2643
- NK_PUBLIC void nk_angulars_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2901
+ NK_PUBLIC void nk_angulars_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2644
2902
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2645
2903
  nk_size_t row_start, nk_size_t row_count) {
2646
2904
  #if NK_TARGET_SME
2647
- nk_angulars_symmetric_e3m2_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2905
+ nk_angulars_symmetric_e3m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2648
2906
  #elif NK_TARGET_SAPPHIREAMX
2649
- nk_angulars_symmetric_e3m2_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2907
+ nk_angulars_symmetric_e3m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2650
2908
  row_count);
2909
+ #elif NK_TARGET_NEONFP8
2910
+ nk_angulars_symmetric_e3m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2911
+ row_count);
2651
2912
  #elif NK_TARGET_SKYLAKE
2652
- nk_angulars_symmetric_e3m2_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2913
+ nk_angulars_symmetric_e3m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2914
+ row_count);
2653
2915
  #elif NK_TARGET_HASWELL
2654
- nk_angulars_symmetric_e3m2_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2916
+ nk_angulars_symmetric_e3m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2917
+ row_count);
2655
2918
  #elif NK_TARGET_RVV
2656
- nk_angulars_symmetric_e3m2_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2919
+ nk_angulars_symmetric_e3m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2657
2920
  #else
2658
- nk_angulars_symmetric_e3m2_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2921
+ nk_angulars_symmetric_e3m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2922
+ row_count);
2659
2923
  #endif
2660
2924
  }
2661
2925
  NK_PUBLIC void nk_euclideans_packed_e3m2(nk_e3m2_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2665,6 +2929,8 @@ NK_PUBLIC void nk_euclideans_packed_e3m2(nk_e3m2_t const *a, void const *b_packe
2665
2929
  nk_euclideans_packed_e3m2_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2666
2930
  #elif NK_TARGET_SAPPHIREAMX
2667
2931
  nk_euclideans_packed_e3m2_sapphireamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2932
+ #elif NK_TARGET_NEONFP8
2933
+ nk_euclideans_packed_e3m2_neonfp8(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2668
2934
  #elif NK_TARGET_SKYLAKE
2669
2935
  nk_euclideans_packed_e3m2_skylake(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2670
2936
  #elif NK_TARGET_HASWELL
@@ -2675,24 +2941,30 @@ NK_PUBLIC void nk_euclideans_packed_e3m2(nk_e3m2_t const *a, void const *b_packe
2675
2941
  nk_euclideans_packed_e3m2_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2676
2942
  #endif
2677
2943
  }
2678
- NK_PUBLIC void nk_euclideans_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
2944
+ NK_PUBLIC void nk_euclideans_symmetric_e3m2(nk_e3m2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2679
2945
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2680
2946
  nk_size_t row_start, nk_size_t row_count) {
2681
2947
  #if NK_TARGET_SME
2682
- nk_euclideans_symmetric_e3m2_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2948
+ nk_euclideans_symmetric_e3m2_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2949
+ row_count);
2683
2950
  #elif NK_TARGET_SAPPHIREAMX
2684
- nk_euclideans_symmetric_e3m2_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2951
+ nk_euclideans_symmetric_e3m2_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2685
2952
  row_count);
2953
+ #elif NK_TARGET_NEONFP8
2954
+ nk_euclideans_symmetric_e3m2_neonfp8(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2955
+ row_count);
2686
2956
  #elif NK_TARGET_SKYLAKE
2687
- nk_euclideans_symmetric_e3m2_skylake(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2957
+ nk_euclideans_symmetric_e3m2_skylake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2688
2958
  row_count);
2689
2959
  #elif NK_TARGET_HASWELL
2690
- nk_euclideans_symmetric_e3m2_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start,
2960
+ nk_euclideans_symmetric_e3m2_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2691
2961
  row_count);
2692
2962
  #elif NK_TARGET_RVV
2693
- nk_euclideans_symmetric_e3m2_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2963
+ nk_euclideans_symmetric_e3m2_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2964
+ row_count);
2694
2965
  #else
2695
- nk_euclideans_symmetric_e3m2_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
2966
+ nk_euclideans_symmetric_e3m2_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2967
+ row_count);
2696
2968
  #endif
2697
2969
  }
2698
2970
 
@@ -2713,6 +2985,8 @@ NK_PUBLIC void nk_angulars_packed_i8(nk_i8_t const *a, void const *b_packed, nk_
2713
2985
  nk_angulars_packed_i8_alder(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2714
2986
  #elif NK_TARGET_HASWELL
2715
2987
  nk_angulars_packed_i8_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2988
+ #elif NK_TARGET_POWERVSX
2989
+ nk_angulars_packed_i8_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2716
2990
  #elif NK_TARGET_RVV
2717
2991
  nk_angulars_packed_i8_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2718
2992
  #elif NK_TARGET_V128RELAXED
@@ -2721,31 +2995,37 @@ NK_PUBLIC void nk_angulars_packed_i8(nk_i8_t const *a, void const *b_packed, nk_
2721
2995
  nk_angulars_packed_i8_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2722
2996
  #endif
2723
2997
  }
2724
- NK_PUBLIC void nk_angulars_symmetric_i8(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2725
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2726
- nk_size_t row_count) {
2998
+ NK_PUBLIC void nk_angulars_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2999
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
3000
+ nk_size_t row_start, nk_size_t row_count) {
2727
3001
  #if NK_TARGET_SME
2728
- nk_angulars_symmetric_i8_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3002
+ nk_angulars_symmetric_i8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2729
3003
  #elif NK_TARGET_NEONSDOT
2730
- nk_angulars_symmetric_i8_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3004
+ nk_angulars_symmetric_i8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3005
+ row_count);
2731
3006
  #elif NK_TARGET_SAPPHIREAMX
2732
- nk_angulars_symmetric_i8_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
3007
+ nk_angulars_symmetric_i8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2733
3008
  row_count);
2734
3009
  #elif NK_TARGET_ICELAKE
2735
- nk_angulars_symmetric_i8_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3010
+ nk_angulars_symmetric_i8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3011
+ row_count);
2736
3012
  #elif NK_TARGET_SIERRA
2737
- nk_angulars_symmetric_i8_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3013
+ nk_angulars_symmetric_i8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2738
3014
  #elif NK_TARGET_ALDER
2739
- nk_angulars_symmetric_i8_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3015
+ nk_angulars_symmetric_i8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2740
3016
  #elif NK_TARGET_HASWELL
2741
- nk_angulars_symmetric_i8_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3017
+ nk_angulars_symmetric_i8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3018
+ row_count);
3019
+ #elif NK_TARGET_POWERVSX
3020
+ nk_angulars_symmetric_i8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3021
+ row_count);
2742
3022
  #elif NK_TARGET_RVV
2743
- nk_angulars_symmetric_i8_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3023
+ nk_angulars_symmetric_i8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2744
3024
  #elif NK_TARGET_V128RELAXED
2745
- nk_angulars_symmetric_i8_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
3025
+ nk_angulars_symmetric_i8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2746
3026
  row_count);
2747
3027
  #else
2748
- nk_angulars_symmetric_i8_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3028
+ nk_angulars_symmetric_i8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2749
3029
  #endif
2750
3030
  }
2751
3031
  NK_PUBLIC void nk_euclideans_packed_i8(nk_i8_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2765,6 +3045,8 @@ NK_PUBLIC void nk_euclideans_packed_i8(nk_i8_t const *a, void const *b_packed, n
2765
3045
  nk_euclideans_packed_i8_alder(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2766
3046
  #elif NK_TARGET_HASWELL
2767
3047
  nk_euclideans_packed_i8_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
3048
+ #elif NK_TARGET_POWERVSX
3049
+ nk_euclideans_packed_i8_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2768
3050
  #elif NK_TARGET_RVV
2769
3051
  nk_euclideans_packed_i8_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2770
3052
  #elif NK_TARGET_V128RELAXED
@@ -2773,31 +3055,40 @@ NK_PUBLIC void nk_euclideans_packed_i8(nk_i8_t const *a, void const *b_packed, n
2773
3055
  nk_euclideans_packed_i8_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2774
3056
  #endif
2775
3057
  }
2776
- NK_PUBLIC void nk_euclideans_symmetric_i8(nk_i8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
3058
+ NK_PUBLIC void nk_euclideans_symmetric_i8(nk_i8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2777
3059
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2778
3060
  nk_size_t row_start, nk_size_t row_count) {
2779
3061
  #if NK_TARGET_SME
2780
- nk_euclideans_symmetric_i8_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3062
+ nk_euclideans_symmetric_i8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2781
3063
  #elif NK_TARGET_NEONSDOT
2782
- nk_euclideans_symmetric_i8_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3064
+ nk_euclideans_symmetric_i8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3065
+ row_count);
2783
3066
  #elif NK_TARGET_SAPPHIREAMX
2784
- nk_euclideans_symmetric_i8_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
3067
+ nk_euclideans_symmetric_i8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2785
3068
  row_count);
2786
3069
  #elif NK_TARGET_ICELAKE
2787
- nk_euclideans_symmetric_i8_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3070
+ nk_euclideans_symmetric_i8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3071
+ row_count);
2788
3072
  #elif NK_TARGET_SIERRA
2789
- nk_euclideans_symmetric_i8_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3073
+ nk_euclideans_symmetric_i8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3074
+ row_count);
2790
3075
  #elif NK_TARGET_ALDER
2791
- nk_euclideans_symmetric_i8_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3076
+ nk_euclideans_symmetric_i8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3077
+ row_count);
2792
3078
  #elif NK_TARGET_HASWELL
2793
- nk_euclideans_symmetric_i8_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3079
+ nk_euclideans_symmetric_i8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3080
+ row_count);
3081
+ #elif NK_TARGET_POWERVSX
3082
+ nk_euclideans_symmetric_i8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3083
+ row_count);
2794
3084
  #elif NK_TARGET_RVV
2795
- nk_euclideans_symmetric_i8_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3085
+ nk_euclideans_symmetric_i8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2796
3086
  #elif NK_TARGET_V128RELAXED
2797
- nk_euclideans_symmetric_i8_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
3087
+ nk_euclideans_symmetric_i8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2798
3088
  row_count);
2799
3089
  #else
2800
- nk_euclideans_symmetric_i8_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3090
+ nk_euclideans_symmetric_i8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3091
+ row_count);
2801
3092
  #endif
2802
3093
  }
2803
3094
 
@@ -2818,6 +3109,8 @@ NK_PUBLIC void nk_angulars_packed_u8(nk_u8_t const *a, void const *b_packed, nk_
2818
3109
  nk_angulars_packed_u8_alder(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2819
3110
  #elif NK_TARGET_HASWELL
2820
3111
  nk_angulars_packed_u8_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
3112
+ #elif NK_TARGET_POWERVSX
3113
+ nk_angulars_packed_u8_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2821
3114
  #elif NK_TARGET_RVV
2822
3115
  nk_angulars_packed_u8_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2823
3116
  #elif NK_TARGET_V128RELAXED
@@ -2826,31 +3119,37 @@ NK_PUBLIC void nk_angulars_packed_u8(nk_u8_t const *a, void const *b_packed, nk_
2826
3119
  nk_angulars_packed_u8_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2827
3120
  #endif
2828
3121
  }
2829
- NK_PUBLIC void nk_angulars_symmetric_u8(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
2830
- nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
2831
- nk_size_t row_count) {
3122
+ NK_PUBLIC void nk_angulars_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
3123
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
3124
+ nk_size_t row_start, nk_size_t row_count) {
2832
3125
  #if NK_TARGET_SME
2833
- nk_angulars_symmetric_u8_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3126
+ nk_angulars_symmetric_u8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2834
3127
  #elif NK_TARGET_NEONSDOT
2835
- nk_angulars_symmetric_u8_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3128
+ nk_angulars_symmetric_u8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3129
+ row_count);
2836
3130
  #elif NK_TARGET_SAPPHIREAMX
2837
- nk_angulars_symmetric_u8_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
3131
+ nk_angulars_symmetric_u8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2838
3132
  row_count);
2839
3133
  #elif NK_TARGET_ICELAKE
2840
- nk_angulars_symmetric_u8_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3134
+ nk_angulars_symmetric_u8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3135
+ row_count);
2841
3136
  #elif NK_TARGET_SIERRA
2842
- nk_angulars_symmetric_u8_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3137
+ nk_angulars_symmetric_u8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2843
3138
  #elif NK_TARGET_ALDER
2844
- nk_angulars_symmetric_u8_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3139
+ nk_angulars_symmetric_u8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2845
3140
  #elif NK_TARGET_HASWELL
2846
- nk_angulars_symmetric_u8_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3141
+ nk_angulars_symmetric_u8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3142
+ row_count);
3143
+ #elif NK_TARGET_POWERVSX
3144
+ nk_angulars_symmetric_u8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3145
+ row_count);
2847
3146
  #elif NK_TARGET_RVV
2848
- nk_angulars_symmetric_u8_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3147
+ nk_angulars_symmetric_u8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2849
3148
  #elif NK_TARGET_V128RELAXED
2850
- nk_angulars_symmetric_u8_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
3149
+ nk_angulars_symmetric_u8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2851
3150
  row_count);
2852
3151
  #else
2853
- nk_angulars_symmetric_u8_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3152
+ nk_angulars_symmetric_u8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2854
3153
  #endif
2855
3154
  }
2856
3155
  NK_PUBLIC void nk_euclideans_packed_u8(nk_u8_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2870,6 +3169,8 @@ NK_PUBLIC void nk_euclideans_packed_u8(nk_u8_t const *a, void const *b_packed, n
2870
3169
  nk_euclideans_packed_u8_alder(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2871
3170
  #elif NK_TARGET_HASWELL
2872
3171
  nk_euclideans_packed_u8_haswell(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
3172
+ #elif NK_TARGET_POWERVSX
3173
+ nk_euclideans_packed_u8_powervsx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2873
3174
  #elif NK_TARGET_RVV
2874
3175
  nk_euclideans_packed_u8_rvv(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2875
3176
  #elif NK_TARGET_V128RELAXED
@@ -2878,31 +3179,40 @@ NK_PUBLIC void nk_euclideans_packed_u8(nk_u8_t const *a, void const *b_packed, n
2878
3179
  nk_euclideans_packed_u8_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2879
3180
  #endif
2880
3181
  }
2881
- NK_PUBLIC void nk_euclideans_symmetric_u8(nk_u8_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
3182
+ NK_PUBLIC void nk_euclideans_symmetric_u8(nk_u8_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2882
3183
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2883
3184
  nk_size_t row_start, nk_size_t row_count) {
2884
3185
  #if NK_TARGET_SME
2885
- nk_euclideans_symmetric_u8_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3186
+ nk_euclideans_symmetric_u8_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2886
3187
  #elif NK_TARGET_NEONSDOT
2887
- nk_euclideans_symmetric_u8_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3188
+ nk_euclideans_symmetric_u8_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3189
+ row_count);
2888
3190
  #elif NK_TARGET_SAPPHIREAMX
2889
- nk_euclideans_symmetric_u8_sapphireamx(vectors, n_vectors, depth, stride, result, result_stride, row_start,
3191
+ nk_euclideans_symmetric_u8_sapphireamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2890
3192
  row_count);
2891
3193
  #elif NK_TARGET_ICELAKE
2892
- nk_euclideans_symmetric_u8_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3194
+ nk_euclideans_symmetric_u8_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3195
+ row_count);
2893
3196
  #elif NK_TARGET_SIERRA
2894
- nk_euclideans_symmetric_u8_sierra(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3197
+ nk_euclideans_symmetric_u8_sierra(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3198
+ row_count);
2895
3199
  #elif NK_TARGET_ALDER
2896
- nk_euclideans_symmetric_u8_alder(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3200
+ nk_euclideans_symmetric_u8_alder(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3201
+ row_count);
2897
3202
  #elif NK_TARGET_HASWELL
2898
- nk_euclideans_symmetric_u8_haswell(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3203
+ nk_euclideans_symmetric_u8_haswell(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3204
+ row_count);
3205
+ #elif NK_TARGET_POWERVSX
3206
+ nk_euclideans_symmetric_u8_powervsx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3207
+ row_count);
2899
3208
  #elif NK_TARGET_RVV
2900
- nk_euclideans_symmetric_u8_rvv(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3209
+ nk_euclideans_symmetric_u8_rvv(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2901
3210
  #elif NK_TARGET_V128RELAXED
2902
- nk_euclideans_symmetric_u8_v128relaxed(vectors, n_vectors, depth, stride, result, result_stride, row_start,
3211
+ nk_euclideans_symmetric_u8_v128relaxed(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2903
3212
  row_count);
2904
3213
  #else
2905
- nk_euclideans_symmetric_u8_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3214
+ nk_euclideans_symmetric_u8_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3215
+ row_count);
2906
3216
  #endif
2907
3217
  }
2908
3218
 
@@ -2919,17 +3229,19 @@ NK_PUBLIC void nk_angulars_packed_i4(nk_i4x2_t const *a, void const *b_packed, n
2919
3229
  nk_angulars_packed_i4_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2920
3230
  #endif
2921
3231
  }
2922
- NK_PUBLIC void nk_angulars_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
3232
+ NK_PUBLIC void nk_angulars_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2923
3233
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2924
3234
  nk_size_t row_start, nk_size_t row_count) {
2925
3235
  #if NK_TARGET_SME
2926
- nk_angulars_symmetric_i4_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3236
+ nk_angulars_symmetric_i4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2927
3237
  #elif NK_TARGET_NEONSDOT
2928
- nk_angulars_symmetric_i4_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3238
+ nk_angulars_symmetric_i4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3239
+ row_count);
2929
3240
  #elif NK_TARGET_ICELAKE
2930
- nk_angulars_symmetric_i4_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3241
+ nk_angulars_symmetric_i4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3242
+ row_count);
2931
3243
  #else
2932
- nk_angulars_symmetric_i4_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3244
+ nk_angulars_symmetric_i4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2933
3245
  #endif
2934
3246
  }
2935
3247
  NK_PUBLIC void nk_euclideans_packed_i4(nk_i4x2_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2945,17 +3257,20 @@ NK_PUBLIC void nk_euclideans_packed_i4(nk_i4x2_t const *a, void const *b_packed,
2945
3257
  nk_euclideans_packed_i4_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2946
3258
  #endif
2947
3259
  }
2948
- NK_PUBLIC void nk_euclideans_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
3260
+ NK_PUBLIC void nk_euclideans_symmetric_i4(nk_i4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2949
3261
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2950
3262
  nk_size_t row_start, nk_size_t row_count) {
2951
3263
  #if NK_TARGET_SME
2952
- nk_euclideans_symmetric_i4_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3264
+ nk_euclideans_symmetric_i4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2953
3265
  #elif NK_TARGET_NEONSDOT
2954
- nk_euclideans_symmetric_i4_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3266
+ nk_euclideans_symmetric_i4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3267
+ row_count);
2955
3268
  #elif NK_TARGET_ICELAKE
2956
- nk_euclideans_symmetric_i4_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3269
+ nk_euclideans_symmetric_i4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3270
+ row_count);
2957
3271
  #else
2958
- nk_euclideans_symmetric_i4_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3272
+ nk_euclideans_symmetric_i4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3273
+ row_count);
2959
3274
  #endif
2960
3275
  }
2961
3276
 
@@ -2972,17 +3287,19 @@ NK_PUBLIC void nk_angulars_packed_u4(nk_u4x2_t const *a, void const *b_packed, n
2972
3287
  nk_angulars_packed_u4_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2973
3288
  #endif
2974
3289
  }
2975
- NK_PUBLIC void nk_angulars_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
3290
+ NK_PUBLIC void nk_angulars_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2976
3291
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2977
3292
  nk_size_t row_start, nk_size_t row_count) {
2978
3293
  #if NK_TARGET_SME
2979
- nk_angulars_symmetric_u4_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3294
+ nk_angulars_symmetric_u4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2980
3295
  #elif NK_TARGET_NEONSDOT
2981
- nk_angulars_symmetric_u4_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3296
+ nk_angulars_symmetric_u4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3297
+ row_count);
2982
3298
  #elif NK_TARGET_ICELAKE
2983
- nk_angulars_symmetric_u4_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3299
+ nk_angulars_symmetric_u4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3300
+ row_count);
2984
3301
  #else
2985
- nk_angulars_symmetric_u4_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3302
+ nk_angulars_symmetric_u4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2986
3303
  #endif
2987
3304
  }
2988
3305
  NK_PUBLIC void nk_euclideans_packed_u4(nk_u4x2_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
@@ -2998,17 +3315,20 @@ NK_PUBLIC void nk_euclideans_packed_u4(nk_u4x2_t const *a, void const *b_packed,
2998
3315
  nk_euclideans_packed_u4_serial(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2999
3316
  #endif
3000
3317
  }
3001
- NK_PUBLIC void nk_euclideans_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
3318
+ NK_PUBLIC void nk_euclideans_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
3002
3319
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
3003
3320
  nk_size_t row_start, nk_size_t row_count) {
3004
3321
  #if NK_TARGET_SME
3005
- nk_euclideans_symmetric_u4_sme(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3322
+ nk_euclideans_symmetric_u4_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
3006
3323
  #elif NK_TARGET_NEONSDOT
3007
- nk_euclideans_symmetric_u4_neonsdot(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3324
+ nk_euclideans_symmetric_u4_neonsdot(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3325
+ row_count);
3008
3326
  #elif NK_TARGET_ICELAKE
3009
- nk_euclideans_symmetric_u4_icelake(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3327
+ nk_euclideans_symmetric_u4_icelake(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3328
+ row_count);
3010
3329
  #else
3011
- nk_euclideans_symmetric_u4_serial(vectors, n_vectors, depth, stride, result, result_stride, row_start, row_count);
3330
+ nk_euclideans_symmetric_u4_serial(vectors, vectors_count, depth, stride, result, result_stride, row_start,
3331
+ row_count);
3012
3332
  #endif
3013
3333
  }
3014
3334