numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -15,7 +15,7 @@
15
15
  *
16
16
  * Fused helpers minimize data passes:
17
17
  *
18
- * - `nk_bicentroid_*_rvv_`: both centroids in a single pass (used by RMSD)
18
+ * - RMSD: fully fused single-pass (centroids + squared diffs), no separate helper
19
19
  * - `nk_centroid_and_cross_covariance_*_rvv_`: centroids + H in one pass (Kabsch)
20
20
  * - `nk_centroid_and_cross_covariance_and_variance_*_rvv_`: + variance (Umeyama)
21
21
  *
@@ -89,104 +89,6 @@ NK_INTERNAL void nk_accumulate_product_f64m1_rvv_(vfloat64m1_t *sum_f64m1, vfloa
89
89
  vector_length);
90
90
  }
91
91
 
92
- /**
93
- * @brief Compute centroids of two f32 point clouds in a single pass.
94
- *
95
- * Reads both clouds simultaneously, accumulating 6 sums (3 per cloud) in f64.
96
- * Reduces RMSD from 3 passes to 2 (bicentroid + SSD).
97
- * Uses per-lane `vfwadd_wv` accumulation with deferred `vfredusum` after the loop.
98
- */
99
- NK_INTERNAL void nk_bicentroid_f32_rvv_( //
100
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
101
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
102
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z) {
103
- nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
104
- vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
105
- vfloat64m2_t sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
106
- vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
107
- vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
108
- vfloat64m2_t sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
109
- vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
110
- nk_f32_t const *a_ptr = a, *b_ptr = b;
111
- nk_size_t remaining = n;
112
- for (nk_size_t vector_length; remaining > 0;
113
- remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
114
- vector_length = __riscv_vsetvl_e32m1(remaining);
115
- vfloat32m1x3_t a_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(a_ptr, vector_length);
116
- sum_a_x_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_x_f64m2, sum_a_x_f64m2,
117
- __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0), vector_length);
118
- sum_a_y_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_y_f64m2, sum_a_y_f64m2,
119
- __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1), vector_length);
120
- sum_a_z_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_z_f64m2, sum_a_z_f64m2,
121
- __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2), vector_length);
122
- vfloat32m1x3_t b_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(b_ptr, vector_length);
123
- sum_b_x_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_x_f64m2, sum_b_x_f64m2,
124
- __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0), vector_length);
125
- sum_b_y_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_y_f64m2, sum_b_y_f64m2,
126
- __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1), vector_length);
127
- sum_b_z_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_z_f64m2, sum_b_z_f64m2,
128
- __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2), vector_length);
129
- }
130
- vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
131
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
132
- *ca_x = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, vlmax)) * inv_n;
133
- *ca_y = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, vlmax)) * inv_n;
134
- *ca_z = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, vlmax)) * inv_n;
135
- *cb_x = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, vlmax)) * inv_n;
136
- *cb_y = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, vlmax)) * inv_n;
137
- *cb_z = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, vlmax)) * inv_n;
138
- }
139
-
140
- /**
141
- * @brief Compute centroids of two f64 point clouds in a single pass.
142
- * Uses per-lane `vfadd_vv` accumulation with deferred `vfredusum` after the loop.
143
- */
144
- NK_INTERNAL void nk_bicentroid_f64_rvv_( //
145
- nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, //
146
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
147
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z) {
148
- nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
149
- vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
150
- vfloat64m1_t sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
151
- vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
152
- vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
153
- vfloat64m1_t sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
154
- vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
155
- vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
156
- vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
157
- vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
158
- vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
159
- vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
160
- vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
161
- nk_f64_t const *a_ptr = a, *b_ptr = b;
162
- nk_size_t remaining = n;
163
- for (nk_size_t vector_length; remaining > 0;
164
- remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
165
- vector_length = __riscv_vsetvl_e64m1(remaining);
166
- vfloat64m1x3_t a_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(a_ptr, vector_length);
167
- nk_accumulate_sum_f64m1_rvv_(&sum_a_x_f64m1, &compensation_a_x_f64m1,
168
- __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0), vector_length);
169
- nk_accumulate_sum_f64m1_rvv_(&sum_a_y_f64m1, &compensation_a_y_f64m1,
170
- __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1), vector_length);
171
- nk_accumulate_sum_f64m1_rvv_(&sum_a_z_f64m1, &compensation_a_z_f64m1,
172
- __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2), vector_length);
173
- vfloat64m1x3_t b_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(b_ptr, vector_length);
174
- nk_accumulate_sum_f64m1_rvv_(&sum_b_x_f64m1, &compensation_b_x_f64m1,
175
- __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0), vector_length);
176
- nk_accumulate_sum_f64m1_rvv_(&sum_b_y_f64m1, &compensation_b_y_f64m1,
177
- __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1), vector_length);
178
- nk_accumulate_sum_f64m1_rvv_(&sum_b_z_f64m1, &compensation_b_z_f64m1,
179
- __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2), vector_length);
180
- }
181
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
182
- *ca_x = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_n;
183
- *ca_y = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_n;
184
- *ca_z = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_n;
185
- *cb_x = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_n;
186
- *cb_y = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_n;
187
- *cb_z = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_n;
188
- }
189
-
190
92
  /**
191
93
  * @brief Compute centroids and cross-covariance matrix in a single pass (f32).
192
94
  *
@@ -198,27 +100,29 @@ NK_INTERNAL void nk_bicentroid_f64_rvv_( //
198
100
  * Cross-products use per-lane `vfwmacc_vv` accumulation (vfloat64m2_t) with
199
101
  * deferred `vfredusum` after the loop — eliminates 9 reductions per iteration.
200
102
  */
201
- NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
202
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
203
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
204
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z, //
103
+ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
104
+ nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
105
+ nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
106
+ nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
205
107
  nk_f64_t h[9]) {
206
- nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
207
- vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax), sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
208
- vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
209
- vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax), sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
210
- vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
211
- vfloat64m2_t cross_00_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax),
212
- cross_01_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
213
- vfloat64m2_t cross_02_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax),
214
- cross_10_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
215
- vfloat64m2_t cross_11_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax),
216
- cross_12_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
217
- vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax),
218
- cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
219
- vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
108
+ nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
109
+ vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
110
+ sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
111
+ vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
112
+ vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
113
+ sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
114
+ vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
115
+ vfloat64m2_t cross_00_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
116
+ cross_01_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
117
+ vfloat64m2_t cross_02_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
118
+ cross_10_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
119
+ vfloat64m2_t cross_11_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
120
+ cross_12_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
121
+ vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
122
+ cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
123
+ vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
220
124
  nk_f32_t const *a_ptr = a, *b_ptr = b;
221
- nk_size_t remaining = n;
125
+ nk_size_t remaining = points_count;
222
126
  for (nk_size_t vector_length; remaining > 0;
223
127
  remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
224
128
  vector_length = __riscv_vsetvl_e32m1(remaining);
@@ -248,45 +152,51 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
248
152
  }
249
153
  vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
250
154
  // Compute centroids
251
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
252
- nk_f64_t ca_x_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, vlmax)) *
253
- inv_n;
254
- nk_f64_t ca_y_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, vlmax)) *
255
- inv_n;
256
- nk_f64_t ca_z_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, vlmax)) *
257
- inv_n;
258
- nk_f64_t cb_x_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, vlmax)) *
259
- inv_n;
260
- nk_f64_t cb_y_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, vlmax)) *
261
- inv_n;
262
- nk_f64_t cb_z_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, vlmax)) *
263
- inv_n;
264
- *ca_x = ca_x_;
265
- *ca_y = ca_y_;
266
- *ca_z = ca_z_;
267
- *cb_x = cb_x_;
268
- *cb_y = cb_y_;
269
- *cb_z = cb_z_;
270
- // Fix up: H[i][j] = raw[i][j] - n * ca[i] * cb[j]
271
- nk_f64_t n_f64 = (nk_f64_t)n;
272
- h[0] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, vlmax)) -
273
- n_f64 * ca_x_ * cb_x_;
274
- h[1] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, vlmax)) -
275
- n_f64 * ca_x_ * cb_y_;
276
- h[2] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, vlmax)) -
277
- n_f64 * ca_x_ * cb_z_;
278
- h[3] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, vlmax)) -
279
- n_f64 * ca_y_ * cb_x_;
280
- h[4] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, vlmax)) -
281
- n_f64 * ca_y_ * cb_y_;
282
- h[5] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, vlmax)) -
283
- n_f64 * ca_y_ * cb_z_;
284
- h[6] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, vlmax)) -
285
- n_f64 * ca_z_ * cb_x_;
286
- h[7] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, vlmax)) -
287
- n_f64 * ca_z_ * cb_y_;
288
- h[8] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, vlmax)) -
289
- n_f64 * ca_z_ * cb_z_;
155
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
156
+ nk_f64_t centroid_a_x_f64 = __riscv_vfmv_f_s_f64m1_f64(
157
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, max_vector_length)) *
158
+ inv_points_count;
159
+ nk_f64_t centroid_a_y_f64 = __riscv_vfmv_f_s_f64m1_f64(
160
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, max_vector_length)) *
161
+ inv_points_count;
162
+ nk_f64_t centroid_a_z_f64 = __riscv_vfmv_f_s_f64m1_f64(
163
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, max_vector_length)) *
164
+ inv_points_count;
165
+ nk_f64_t centroid_b_x_f64 = __riscv_vfmv_f_s_f64m1_f64(
166
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, max_vector_length)) *
167
+ inv_points_count;
168
+ nk_f64_t centroid_b_y_f64 = __riscv_vfmv_f_s_f64m1_f64(
169
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, max_vector_length)) *
170
+ inv_points_count;
171
+ nk_f64_t centroid_b_z_f64 = __riscv_vfmv_f_s_f64m1_f64(
172
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, max_vector_length)) *
173
+ inv_points_count;
174
+ *centroid_a_x = centroid_a_x_f64;
175
+ *centroid_a_y = centroid_a_y_f64;
176
+ *centroid_a_z = centroid_a_z_f64;
177
+ *centroid_b_x = centroid_b_x_f64;
178
+ *centroid_b_y = centroid_b_y_f64;
179
+ *centroid_b_z = centroid_b_z_f64;
180
+ // Fix up: H[i][j] = raw[i][j] - points_count * ca[i] * cb[j]
181
+ nk_f64_t n_f64 = (nk_f64_t)points_count;
182
+ h[0] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
183
+ n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
184
+ h[1] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
185
+ n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
186
+ h[2] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
187
+ n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
188
+ h[3] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
189
+ n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
190
+ h[4] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
191
+ n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
192
+ h[5] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
193
+ n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
194
+ h[6] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
195
+ n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
196
+ h[7] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
197
+ n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
198
+ h[8] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
199
+ n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
290
200
  }
291
201
 
292
202
  /**
@@ -295,42 +205,44 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
295
205
  * Per-lane `vfadd_vv`/`vfmacc_vv` accumulation with deferred `vfredusum` after the loop
296
206
  * — eliminates 15 horizontal reductions per iteration.
297
207
  */
298
- NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
299
- nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, //
300
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
301
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z, //
208
+ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
209
+ nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
210
+ nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
211
+ nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
302
212
  nk_f64_t h[9]) {
303
- nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
304
- vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax), sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
305
- vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
306
- vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax), sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
307
- vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
308
- vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
309
- vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
310
- vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
311
- vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
312
- vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
313
- vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
314
- vfloat64m1_t cross_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax),
315
- cross_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
316
- vfloat64m1_t cross_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax),
317
- cross_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
318
- vfloat64m1_t cross_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax),
319
- cross_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
320
- vfloat64m1_t cross_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax),
321
- cross_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
322
- vfloat64m1_t cross_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
323
- vfloat64m1_t compensation_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
324
- vfloat64m1_t compensation_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
325
- vfloat64m1_t compensation_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
326
- vfloat64m1_t compensation_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
327
- vfloat64m1_t compensation_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
328
- vfloat64m1_t compensation_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
329
- vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
330
- vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
331
- vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
213
+ nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
214
+ vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
215
+ sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
216
+ vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
217
+ vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
218
+ sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
219
+ vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
220
+ vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
221
+ vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
222
+ vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
223
+ vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
224
+ vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
225
+ vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
226
+ vfloat64m1_t cross_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
227
+ cross_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
228
+ vfloat64m1_t cross_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
229
+ cross_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
230
+ vfloat64m1_t cross_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
231
+ cross_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
232
+ vfloat64m1_t cross_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
233
+ cross_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
234
+ vfloat64m1_t cross_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
235
+ vfloat64m1_t compensation_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
236
+ vfloat64m1_t compensation_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
237
+ vfloat64m1_t compensation_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
238
+ vfloat64m1_t compensation_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
239
+ vfloat64m1_t compensation_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
240
+ vfloat64m1_t compensation_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
241
+ vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
242
+ vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
243
+ vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
332
244
  nk_f64_t const *a_ptr = a, *b_ptr = b;
333
- nk_size_t remaining = n;
245
+ nk_size_t remaining = points_count;
334
246
  for (nk_size_t vector_length; remaining > 0;
335
247
  remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
336
248
  vector_length = __riscv_vsetvl_e64m1(remaining);
@@ -359,29 +271,38 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
359
271
  nk_accumulate_product_f64m1_rvv_(&cross_22_f64m1, &compensation_22_f64m1, a_z_f64m1, b_z_f64m1, vector_length);
360
272
  }
361
273
  // Compute centroids.
362
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
363
- nk_f64_t ca_x_ = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_n;
364
- nk_f64_t ca_y_ = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_n;
365
- nk_f64_t ca_z_ = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_n;
366
- nk_f64_t cb_x_ = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_n;
367
- nk_f64_t cb_y_ = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_n;
368
- nk_f64_t cb_z_ = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_n;
369
- *ca_x = ca_x_;
370
- *ca_y = ca_y_;
371
- *ca_z = ca_z_;
372
- *cb_x = cb_x_;
373
- *cb_y = cb_y_;
374
- *cb_z = cb_z_;
375
- nk_f64_t n_f64 = (nk_f64_t)n;
376
- h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) - n_f64 * ca_x_ * cb_x_;
377
- h[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) - n_f64 * ca_x_ * cb_y_;
378
- h[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) - n_f64 * ca_x_ * cb_z_;
379
- h[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) - n_f64 * ca_y_ * cb_x_;
380
- h[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) - n_f64 * ca_y_ * cb_y_;
381
- h[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) - n_f64 * ca_y_ * cb_z_;
382
- h[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) - n_f64 * ca_z_ * cb_x_;
383
- h[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) - n_f64 * ca_z_ * cb_y_;
384
- h[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) - n_f64 * ca_z_ * cb_z_;
274
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
275
+ nk_f64_t centroid_a_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_points_count;
276
+ nk_f64_t centroid_a_y_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_points_count;
277
+ nk_f64_t centroid_a_z_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_points_count;
278
+ nk_f64_t centroid_b_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_points_count;
279
+ nk_f64_t centroid_b_y_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_points_count;
280
+ nk_f64_t centroid_b_z_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_points_count;
281
+ *centroid_a_x = centroid_a_x_f64;
282
+ *centroid_a_y = centroid_a_y_f64;
283
+ *centroid_a_z = centroid_a_z_f64;
284
+ *centroid_b_x = centroid_b_x_f64;
285
+ *centroid_b_y = centroid_b_y_f64;
286
+ *centroid_b_z = centroid_b_z_f64;
287
+ nk_f64_t n_f64 = (nk_f64_t)points_count;
288
+ h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
289
+ n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
290
+ h[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
291
+ n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
292
+ h[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
293
+ n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
294
+ h[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
295
+ n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
296
+ h[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
297
+ n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
298
+ h[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
299
+ n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
300
+ h[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
301
+ n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
302
+ h[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
303
+ n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
304
+ h[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
305
+ n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
385
306
  }
386
307
 
387
308
  /**
@@ -394,28 +315,30 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
394
315
  * Cross-products use per-lane `vfwmacc_vv` accumulation (vfloat64m2_t) with
395
316
  * deferred `vfredusum` after the loop — eliminates 9 reductions per iteration.
396
317
  */
397
- NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
398
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
399
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
400
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z, //
318
+ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
319
+ nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
320
+ nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
321
+ nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
401
322
  nk_f64_t h[9], nk_f64_t *variance_a) {
402
- nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
403
- vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax), sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
404
- vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
405
- vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax), sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
406
- vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
407
- vfloat64m2_t cross_00_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax),
408
- cross_01_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
409
- vfloat64m2_t cross_02_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax),
410
- cross_10_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
411
- vfloat64m2_t cross_11_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax),
412
- cross_12_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
413
- vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax),
414
- cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
415
- vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
416
- vfloat64m2_t sum_norm_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
323
+ nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
324
+ vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
325
+ sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
326
+ vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
327
+ vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
328
+ sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
329
+ vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
330
+ vfloat64m2_t cross_00_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
331
+ cross_01_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
332
+ vfloat64m2_t cross_02_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
333
+ cross_10_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
334
+ vfloat64m2_t cross_11_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
335
+ cross_12_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
336
+ vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
337
+ cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
338
+ vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
339
+ vfloat64m2_t sum_norm_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
417
340
  nk_f32_t const *a_ptr = a, *b_ptr = b;
418
- nk_size_t remaining = n;
341
+ nk_size_t remaining = points_count;
419
342
  for (nk_size_t vector_length; remaining > 0;
420
343
  remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
421
344
  vector_length = __riscv_vsetvl_e32m1(remaining);
@@ -450,49 +373,56 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
450
373
  norm_squared_f64m2, vector_length);
451
374
  }
452
375
  vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
453
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
454
- nk_f64_t ca_x_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, vlmax)) *
455
- inv_n;
456
- nk_f64_t ca_y_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, vlmax)) *
457
- inv_n;
458
- nk_f64_t ca_z_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, vlmax)) *
459
- inv_n;
460
- nk_f64_t cb_x_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, vlmax)) *
461
- inv_n;
462
- nk_f64_t cb_y_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, vlmax)) *
463
- inv_n;
464
- nk_f64_t cb_z_ = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, vlmax)) *
465
- inv_n;
466
- *ca_x = ca_x_;
467
- *ca_y = ca_y_;
468
- *ca_z = ca_z_;
469
- *cb_x = cb_x_;
470
- *cb_y = cb_y_;
471
- *cb_z = cb_z_;
472
- nk_f64_t n_f64 = (nk_f64_t)n;
473
- h[0] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, vlmax)) -
474
- n_f64 * ca_x_ * cb_x_;
475
- h[1] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, vlmax)) -
476
- n_f64 * ca_x_ * cb_y_;
477
- h[2] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, vlmax)) -
478
- n_f64 * ca_x_ * cb_z_;
479
- h[3] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, vlmax)) -
480
- n_f64 * ca_y_ * cb_x_;
481
- h[4] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, vlmax)) -
482
- n_f64 * ca_y_ * cb_y_;
483
- h[5] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, vlmax)) -
484
- n_f64 * ca_y_ * cb_z_;
485
- h[6] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, vlmax)) -
486
- n_f64 * ca_z_ * cb_x_;
487
- h[7] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, vlmax)) -
488
- n_f64 * ca_z_ * cb_y_;
489
- h[8] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, vlmax)) -
490
- n_f64 * ca_z_ * cb_z_;
491
- // variance_a = (1/n) * ||a[i]||² - n * ||ca||²)
376
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
377
+ nk_f64_t centroid_a_x_f64 = __riscv_vfmv_f_s_f64m1_f64(
378
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, max_vector_length)) *
379
+ inv_points_count;
380
+ nk_f64_t centroid_a_y_f64 = __riscv_vfmv_f_s_f64m1_f64(
381
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, max_vector_length)) *
382
+ inv_points_count;
383
+ nk_f64_t centroid_a_z_f64 = __riscv_vfmv_f_s_f64m1_f64(
384
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, max_vector_length)) *
385
+ inv_points_count;
386
+ nk_f64_t centroid_b_x_f64 = __riscv_vfmv_f_s_f64m1_f64(
387
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, max_vector_length)) *
388
+ inv_points_count;
389
+ nk_f64_t centroid_b_y_f64 = __riscv_vfmv_f_s_f64m1_f64(
390
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, max_vector_length)) *
391
+ inv_points_count;
392
+ nk_f64_t centroid_b_z_f64 = __riscv_vfmv_f_s_f64m1_f64(
393
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, max_vector_length)) *
394
+ inv_points_count;
395
+ *centroid_a_x = centroid_a_x_f64;
396
+ *centroid_a_y = centroid_a_y_f64;
397
+ *centroid_a_z = centroid_a_z_f64;
398
+ *centroid_b_x = centroid_b_x_f64;
399
+ *centroid_b_y = centroid_b_y_f64;
400
+ *centroid_b_z = centroid_b_z_f64;
401
+ nk_f64_t n_f64 = (nk_f64_t)points_count;
402
+ h[0] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
403
+ n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
404
+ h[1] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
405
+ n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
406
+ h[2] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
407
+ n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
408
+ h[3] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
409
+ n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
410
+ h[4] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
411
+ n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
412
+ h[5] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
413
+ n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
414
+ h[6] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
415
+ n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
416
+ h[7] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
417
+ n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
418
+ h[8] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
419
+ n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
420
+ // variance_a = (1/points_count) * (Σ ||a[i]||² - points_count * ||ca||²)
492
421
  *variance_a = __riscv_vfmv_f_s_f64m1_f64(
493
- __riscv_vfredusum_vs_f64m2_f64m1(sum_norm_squared_f64m2, zero_f64m1, vlmax)) *
494
- inv_n -
495
- (ca_x_ * ca_x_ + ca_y_ * ca_y_ + ca_z_ * ca_z_);
422
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_norm_squared_f64m2, zero_f64m1, max_vector_length)) *
423
+ inv_points_count -
424
+ (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
425
+ centroid_a_z_f64 * centroid_a_z_f64);
496
426
  }
497
427
 
498
428
  /**
@@ -501,44 +431,46 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
501
431
  * Per-lane `vfadd_vv`/`vfmacc_vv` accumulation with deferred `vfredusum` after the loop
502
432
  * — eliminates 16 horizontal reductions per iteration.
503
433
  */
504
- NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
505
- nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, //
506
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
507
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z, //
434
+ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
435
+ nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
436
+ nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
437
+ nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
508
438
  nk_f64_t h[9], nk_f64_t *variance_a) {
509
- nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
510
- vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax), sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
511
- vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
512
- vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax), sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
513
- vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
514
- vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
515
- vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
516
- vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
517
- vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
518
- vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
519
- vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
520
- vfloat64m1_t cross_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax),
521
- cross_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
522
- vfloat64m1_t cross_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax),
523
- cross_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
524
- vfloat64m1_t cross_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax),
525
- cross_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
526
- vfloat64m1_t cross_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax),
527
- cross_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
528
- vfloat64m1_t cross_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
529
- vfloat64m1_t compensation_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
530
- vfloat64m1_t compensation_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
531
- vfloat64m1_t compensation_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
532
- vfloat64m1_t compensation_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
533
- vfloat64m1_t compensation_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
534
- vfloat64m1_t compensation_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
535
- vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
536
- vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
537
- vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
538
- vfloat64m1_t sum_norm_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
539
- vfloat64m1_t compensation_norm_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
439
+ nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
440
+ vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
441
+ sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
442
+ vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
443
+ vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
444
+ sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
445
+ vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
446
+ vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
447
+ vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
448
+ vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
449
+ vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
450
+ vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
451
+ vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
452
+ vfloat64m1_t cross_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
453
+ cross_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
454
+ vfloat64m1_t cross_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
455
+ cross_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
456
+ vfloat64m1_t cross_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
457
+ cross_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
458
+ vfloat64m1_t cross_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
459
+ cross_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
460
+ vfloat64m1_t cross_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
461
+ vfloat64m1_t compensation_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
462
+ vfloat64m1_t compensation_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
463
+ vfloat64m1_t compensation_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
464
+ vfloat64m1_t compensation_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
465
+ vfloat64m1_t compensation_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
466
+ vfloat64m1_t compensation_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
467
+ vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
468
+ vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
469
+ vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
470
+ vfloat64m1_t sum_norm_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
471
+ vfloat64m1_t compensation_norm_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
540
472
  nk_f64_t const *a_ptr = a, *b_ptr = b;
541
- nk_size_t remaining = n;
473
+ nk_size_t remaining = points_count;
542
474
  for (nk_size_t vector_length; remaining > 0;
543
475
  remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
544
476
  vector_length = __riscv_vsetvl_e64m1(remaining);
@@ -571,56 +503,70 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
571
503
  nk_accumulate_sum_f64m1_rvv_(&sum_norm_squared_f64m1, &compensation_norm_squared_f64m1, norm_squared_f64m1,
572
504
  vector_length);
573
505
  }
574
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
575
- nk_f64_t ca_x_ = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_n;
576
- nk_f64_t ca_y_ = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_n;
577
- nk_f64_t ca_z_ = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_n;
578
- nk_f64_t cb_x_ = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_n;
579
- nk_f64_t cb_y_ = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_n;
580
- nk_f64_t cb_z_ = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_n;
581
- *ca_x = ca_x_;
582
- *ca_y = ca_y_;
583
- *ca_z = ca_z_;
584
- *cb_x = cb_x_;
585
- *cb_y = cb_y_;
586
- *cb_z = cb_z_;
587
- nk_f64_t n_f64 = (nk_f64_t)n;
588
- h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) - n_f64 * ca_x_ * cb_x_;
589
- h[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) - n_f64 * ca_x_ * cb_y_;
590
- h[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) - n_f64 * ca_x_ * cb_z_;
591
- h[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) - n_f64 * ca_y_ * cb_x_;
592
- h[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) - n_f64 * ca_y_ * cb_y_;
593
- h[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) - n_f64 * ca_y_ * cb_z_;
594
- h[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) - n_f64 * ca_z_ * cb_x_;
595
- h[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) - n_f64 * ca_z_ * cb_y_;
596
- h[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) - n_f64 * ca_z_ * cb_z_;
597
- *variance_a = nk_dot_stable_sum_f64m1_rvv_(sum_norm_squared_f64m1, compensation_norm_squared_f64m1) * inv_n -
598
- (ca_x_ * ca_x_ + ca_y_ * ca_y_ + ca_z_ * ca_z_);
506
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
507
+ nk_f64_t centroid_a_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_points_count;
508
+ nk_f64_t centroid_a_y_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_points_count;
509
+ nk_f64_t centroid_a_z_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_points_count;
510
+ nk_f64_t centroid_b_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_points_count;
511
+ nk_f64_t centroid_b_y_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_points_count;
512
+ nk_f64_t centroid_b_z_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_points_count;
513
+ *centroid_a_x = centroid_a_x_f64;
514
+ *centroid_a_y = centroid_a_y_f64;
515
+ *centroid_a_z = centroid_a_z_f64;
516
+ *centroid_b_x = centroid_b_x_f64;
517
+ *centroid_b_y = centroid_b_y_f64;
518
+ *centroid_b_z = centroid_b_z_f64;
519
+ nk_f64_t n_f64 = (nk_f64_t)points_count;
520
+ h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
521
+ n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
522
+ h[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
523
+ n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
524
+ h[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
525
+ n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
526
+ h[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
527
+ n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
528
+ h[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
529
+ n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
530
+ h[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
531
+ n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
532
+ h[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
533
+ n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
534
+ h[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
535
+ n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
536
+ h[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
537
+ n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
538
+ *variance_a = nk_dot_stable_sum_f64m1_rvv_(sum_norm_squared_f64m1, compensation_norm_squared_f64m1) *
539
+ inv_points_count -
540
+ (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
541
+ centroid_a_z_f64 * centroid_a_z_f64);
599
542
  }
600
543
 
601
- NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_( //
602
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
603
- nk_f64_t const *r, nk_f64_t scale, //
604
- nk_f64_t ca_x, nk_f64_t ca_y, nk_f64_t ca_z, //
605
- nk_f64_t cb_x, nk_f64_t cb_y, nk_f64_t cb_z) {
544
+ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_( //
545
+ nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
546
+ nk_f64_t const *r, nk_f64_t scale, //
547
+ nk_f64_t centroid_a_x, nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, //
548
+ nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
606
549
  nk_f64_t scaled_rotation_x_x = scale * r[0], scaled_rotation_x_y = scale * r[1], scaled_rotation_x_z = scale * r[2];
607
550
  nk_f64_t scaled_rotation_y_x = scale * r[3], scaled_rotation_y_y = scale * r[4], scaled_rotation_y_z = scale * r[5];
608
551
  nk_f64_t scaled_rotation_z_x = scale * r[6], scaled_rotation_z_y = scale * r[7], scaled_rotation_z_z = scale * r[8];
609
- nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
610
- vfloat64m2_t sum_distance_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
552
+ nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
553
+ vfloat64m2_t sum_distance_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
611
554
  vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
612
555
  nk_f32_t const *a_ptr = a, *b_ptr = b;
613
- nk_size_t remaining = n;
556
+ nk_size_t remaining = points_count;
614
557
  for (nk_size_t vector_length; remaining > 0;
615
558
  remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
616
559
  vector_length = __riscv_vsetvl_e32m1(remaining);
617
560
  vfloat32m1x3_t a_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(a_ptr, vector_length);
618
561
  vfloat64m2_t centered_a_x_f64m2 = __riscv_vfsub_vf_f64m2(
619
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0), vector_length), ca_x, vector_length);
562
+ __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0), vector_length), centroid_a_x,
563
+ vector_length);
620
564
  vfloat64m2_t centered_a_y_f64m2 = __riscv_vfsub_vf_f64m2(
621
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1), vector_length), ca_y, vector_length);
565
+ __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1), vector_length), centroid_a_y,
566
+ vector_length);
622
567
  vfloat64m2_t centered_a_z_f64m2 = __riscv_vfsub_vf_f64m2(
623
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2), vector_length), ca_z, vector_length);
568
+ __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2), vector_length), centroid_a_z,
569
+ vector_length);
624
570
  vfloat64m2_t rotated_a_x_f64m2 = __riscv_vfmul_vf_f64m2(centered_a_x_f64m2, scaled_rotation_x_x, vector_length);
625
571
  rotated_a_x_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_x_f64m2, scaled_rotation_x_y, centered_a_y_f64m2,
626
572
  vector_length);
@@ -638,11 +584,14 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_( //
638
584
  vector_length);
639
585
  vfloat32m1x3_t b_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(b_ptr, vector_length);
640
586
  vfloat64m2_t centered_b_x_f64m2 = __riscv_vfsub_vf_f64m2(
641
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0), vector_length), cb_x, vector_length);
587
+ __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0), vector_length), centroid_b_x,
588
+ vector_length);
642
589
  vfloat64m2_t centered_b_y_f64m2 = __riscv_vfsub_vf_f64m2(
643
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1), vector_length), cb_y, vector_length);
590
+ __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1), vector_length), centroid_b_y,
591
+ vector_length);
644
592
  vfloat64m2_t centered_b_z_f64m2 = __riscv_vfsub_vf_f64m2(
645
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2), vector_length), cb_z, vector_length);
593
+ __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2), vector_length), centroid_b_z,
594
+ vector_length);
646
595
  vfloat64m2_t delta_x_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_x_f64m2, centered_b_x_f64m2, vector_length);
647
596
  vfloat64m2_t delta_y_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_y_f64m2, centered_b_y_f64m2, vector_length);
648
597
  vfloat64m2_t delta_z_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_z_f64m2, centered_b_z_f64m2, vector_length);
@@ -653,32 +602,33 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_( //
653
602
  sum_distance_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_distance_squared_f64m2, delta_z_f64m2,
654
603
  delta_z_f64m2, vector_length);
655
604
  }
656
- return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_distance_squared_f64m2, zero_f64m1, vlmax));
605
+ return __riscv_vfmv_f_s_f64m1_f64(
606
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_distance_squared_f64m2, zero_f64m1, max_vector_length));
657
607
  }
658
608
 
659
- NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_rvv_( //
660
- nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, //
661
- nk_f64_t const *r, nk_f64_t scale, //
662
- nk_f64_t ca_x, nk_f64_t ca_y, nk_f64_t ca_z, //
663
- nk_f64_t cb_x, nk_f64_t cb_y, nk_f64_t cb_z) {
609
+ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_rvv_( //
610
+ nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
611
+ nk_f64_t const *r, nk_f64_t scale, //
612
+ nk_f64_t centroid_a_x, nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, //
613
+ nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
664
614
  nk_f64_t scaled_rotation_x_x = scale * r[0], scaled_rotation_x_y = scale * r[1], scaled_rotation_x_z = scale * r[2];
665
615
  nk_f64_t scaled_rotation_y_x = scale * r[3], scaled_rotation_y_y = scale * r[4], scaled_rotation_y_z = scale * r[5];
666
616
  nk_f64_t scaled_rotation_z_x = scale * r[6], scaled_rotation_z_y = scale * r[7], scaled_rotation_z_z = scale * r[8];
667
- nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
668
- vfloat64m1_t sum_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
669
- vfloat64m1_t compensation_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
617
+ nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
618
+ vfloat64m1_t sum_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
619
+ vfloat64m1_t compensation_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
670
620
  nk_f64_t const *a_ptr = a, *b_ptr = b;
671
- nk_size_t remaining = n;
621
+ nk_size_t remaining = points_count;
672
622
  for (nk_size_t vector_length; remaining > 0;
673
623
  remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
674
624
  vector_length = __riscv_vsetvl_e64m1(remaining);
675
625
  vfloat64m1x3_t a_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(a_ptr, vector_length);
676
- vfloat64m1_t centered_a_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0), ca_x,
677
- vector_length);
678
- vfloat64m1_t centered_a_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1), ca_y,
679
- vector_length);
680
- vfloat64m1_t centered_a_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2), ca_z,
681
- vector_length);
626
+ vfloat64m1_t centered_a_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0),
627
+ centroid_a_x, vector_length);
628
+ vfloat64m1_t centered_a_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1),
629
+ centroid_a_y, vector_length);
630
+ vfloat64m1_t centered_a_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2),
631
+ centroid_a_z, vector_length);
682
632
  vfloat64m1_t rotated_a_x_f64m1 = __riscv_vfmul_vf_f64m1(centered_a_x_f64m1, scaled_rotation_x_x, vector_length);
683
633
  rotated_a_x_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_x_f64m1, scaled_rotation_x_y, centered_a_y_f64m1,
684
634
  vector_length);
@@ -695,12 +645,12 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_rvv_( //
695
645
  rotated_a_z_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_z_f64m1, scaled_rotation_z_z, centered_a_z_f64m1,
696
646
  vector_length);
697
647
  vfloat64m1x3_t b_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(b_ptr, vector_length);
698
- vfloat64m1_t centered_b_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0), cb_x,
699
- vector_length);
700
- vfloat64m1_t centered_b_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1), cb_y,
701
- vector_length);
702
- vfloat64m1_t centered_b_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2), cb_z,
703
- vector_length);
648
+ vfloat64m1_t centered_b_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0),
649
+ centroid_b_x, vector_length);
650
+ vfloat64m1_t centered_b_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1),
651
+ centroid_b_y, vector_length);
652
+ vfloat64m1_t centered_b_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2),
653
+ centroid_b_z, vector_length);
704
654
  vfloat64m1_t delta_x_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_x_f64m1, centered_b_x_f64m1, vector_length);
705
655
  vfloat64m1_t delta_y_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_y_f64m1, centered_b_y_f64m1, vector_length);
706
656
  vfloat64m1_t delta_z_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_z_f64m1, centered_b_z_f64m1, vector_length);
@@ -745,42 +695,176 @@ NK_INTERNAL void nk_rotation_from_svd_f64_rvv_( //
745
695
  nk_rotation_from_svd_f64_serial_(svd_u, svd_v, r);
746
696
  }
747
697
 
748
- NK_PUBLIC void nk_rmsd_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
698
+ NK_PUBLIC void nk_rmsd_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
749
699
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
750
- nk_f64_t identity[9] = {1, 0, 0, 0, 1, 0, 0, 0, 1};
751
700
  if (rotation)
752
- for (int j = 0; j < 9; ++j) rotation[j] = identity[j];
701
+ rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
702
+ rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
753
703
  if (scale) *scale = 1.0f;
754
- nk_f64_t ca_x, ca_y, ca_z, cb_x, cb_y, cb_z;
755
- nk_bicentroid_f32_rvv_(a, b, n, &ca_x, &ca_y, &ca_z, &cb_x, &cb_y, &cb_z);
756
- if (a_centroid) a_centroid[0] = (nk_f32_t)ca_x, a_centroid[1] = (nk_f32_t)ca_y, a_centroid[2] = (nk_f32_t)ca_z;
757
- if (b_centroid) b_centroid[0] = (nk_f32_t)cb_x, b_centroid[1] = (nk_f32_t)cb_y, b_centroid[2] = (nk_f32_t)cb_z;
758
- nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, n, identity, 1.0, ca_x, ca_y, ca_z, cb_x, cb_y, cb_z);
759
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)n);
704
+
705
+ // Fused single-pass: accumulate centroids and squared differences simultaneously.
706
+ // RMSD = √(E[(a−b)²] − (ā − b̄)²)
707
+ nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
708
+ vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
709
+ vfloat64m2_t sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
710
+ vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
711
+ vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
712
+ vfloat64m2_t sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
713
+ vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
714
+ vfloat64m2_t sum_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
715
+ nk_f32_t const *a_ptr = a, *b_ptr = b;
716
+ nk_size_t remaining = points_count;
717
+ for (nk_size_t vector_length; remaining > 0;
718
+ remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
719
+ vector_length = __riscv_vsetvl_e32m1(remaining);
720
+ vfloat32m1x3_t a_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(a_ptr, vector_length);
721
+ vfloat32m1_t a_x_f32m1 = __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0);
722
+ vfloat32m1_t a_y_f32m1 = __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1);
723
+ vfloat32m1_t a_z_f32m1 = __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2);
724
+ vfloat32m1x3_t b_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(b_ptr, vector_length);
725
+ vfloat32m1_t b_x_f32m1 = __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0);
726
+ vfloat32m1_t b_y_f32m1 = __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1);
727
+ vfloat32m1_t b_z_f32m1 = __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2);
728
+ // Accumulate centroids in f64.
729
+ sum_a_x_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_x_f64m2, sum_a_x_f64m2, a_x_f32m1, vector_length);
730
+ sum_a_y_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_y_f64m2, sum_a_y_f64m2, a_y_f32m1, vector_length);
731
+ sum_a_z_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_z_f64m2, sum_a_z_f64m2, a_z_f32m1, vector_length);
732
+ sum_b_x_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_x_f64m2, sum_b_x_f64m2, b_x_f32m1, vector_length);
733
+ sum_b_y_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_y_f64m2, sum_b_y_f64m2, b_y_f32m1, vector_length);
734
+ sum_b_z_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_z_f64m2, sum_b_z_f64m2, b_z_f32m1, vector_length);
735
+ // Accumulate (a−b)² per component. Widen a,b to f64 before subtracting to avoid f32
736
+ // cancellation in the single-pass formula RMSD = √(E[(a−b)²] − (ā − b̄)²).
737
+ vfloat64m2_t a_x_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(a_x_f32m1, vector_length);
738
+ vfloat64m2_t b_x_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(b_x_f32m1, vector_length);
739
+ vfloat64m2_t a_y_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(a_y_f32m1, vector_length);
740
+ vfloat64m2_t b_y_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(b_y_f32m1, vector_length);
741
+ vfloat64m2_t a_z_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(a_z_f32m1, vector_length);
742
+ vfloat64m2_t b_z_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(b_z_f32m1, vector_length);
743
+ vfloat64m2_t delta_x_f64m2 = __riscv_vfsub_vv_f64m2(a_x_f64m2, b_x_f64m2, vector_length);
744
+ vfloat64m2_t delta_y_f64m2 = __riscv_vfsub_vv_f64m2(a_y_f64m2, b_y_f64m2, vector_length);
745
+ vfloat64m2_t delta_z_f64m2 = __riscv_vfsub_vv_f64m2(a_z_f64m2, b_z_f64m2, vector_length);
746
+ sum_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_squared_f64m2, delta_x_f64m2, delta_x_f64m2, vector_length);
747
+ sum_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_squared_f64m2, delta_y_f64m2, delta_y_f64m2, vector_length);
748
+ sum_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_squared_f64m2, delta_z_f64m2, delta_z_f64m2, vector_length);
749
+ }
750
+ vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
751
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
752
+ nk_f64_t centroid_a_x = __riscv_vfmv_f_s_f64m1_f64(
753
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, max_vector_length)) *
754
+ inv_points_count;
755
+ nk_f64_t centroid_a_y = __riscv_vfmv_f_s_f64m1_f64(
756
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, max_vector_length)) *
757
+ inv_points_count;
758
+ nk_f64_t centroid_a_z = __riscv_vfmv_f_s_f64m1_f64(
759
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, max_vector_length)) *
760
+ inv_points_count;
761
+ nk_f64_t centroid_b_x = __riscv_vfmv_f_s_f64m1_f64(
762
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, max_vector_length)) *
763
+ inv_points_count;
764
+ nk_f64_t centroid_b_y = __riscv_vfmv_f_s_f64m1_f64(
765
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, max_vector_length)) *
766
+ inv_points_count;
767
+ nk_f64_t centroid_b_z = __riscv_vfmv_f_s_f64m1_f64(
768
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, max_vector_length)) *
769
+ inv_points_count;
770
+ if (a_centroid)
771
+ a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
772
+ a_centroid[2] = (nk_f32_t)centroid_a_z;
773
+ if (b_centroid)
774
+ b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
775
+ b_centroid[2] = (nk_f32_t)centroid_b_z;
776
+
777
+ nk_f64_t sum_squared = __riscv_vfmv_f_s_f64m1_f64(
778
+ __riscv_vfredusum_vs_f64m2_f64m1(sum_squared_f64m2, zero_f64m1, max_vector_length));
779
+ nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
780
+ mean_diff_z = centroid_a_z - centroid_b_z;
781
+ nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
782
+ *result = nk_f64_sqrt_rvv(sum_squared * inv_points_count - mean_diff_sq);
760
783
  }
761
784
 
762
- NK_PUBLIC void nk_rmsd_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
785
+ NK_PUBLIC void nk_rmsd_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
763
786
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
764
- nk_f64_t identity[9] = {1, 0, 0, 0, 1, 0, 0, 0, 1};
765
787
  if (rotation)
766
- for (int j = 0; j < 9; ++j) rotation[j] = identity[j];
788
+ rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
789
+ rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
767
790
  if (scale) *scale = 1.0;
768
- nk_f64_t ca_x, ca_y, ca_z, cb_x, cb_y, cb_z;
769
- nk_bicentroid_f64_rvv_(a, b, n, &ca_x, &ca_y, &ca_z, &cb_x, &cb_y, &cb_z);
770
- if (a_centroid) a_centroid[0] = ca_x, a_centroid[1] = ca_y, a_centroid[2] = ca_z;
771
- if (b_centroid) b_centroid[0] = cb_x, b_centroid[1] = cb_y, b_centroid[2] = cb_z;
772
- nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, n, identity, 1.0, ca_x, ca_y, ca_z, cb_x, cb_y, cb_z);
773
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)n);
791
+
792
+ // Fused single-pass: accumulate centroids and squared differences simultaneously.
793
+ // RMSD = √(E[(a−b)²] − (ā − b̄)²)
794
+ nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
795
+ vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
796
+ vfloat64m1_t sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
797
+ vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
798
+ vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
799
+ vfloat64m1_t sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
800
+ vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
801
+ vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
802
+ vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
803
+ vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
804
+ vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
805
+ vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
806
+ vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
807
+ vfloat64m1_t sum_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
808
+ vfloat64m1_t compensation_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
809
+ nk_f64_t const *a_ptr = a, *b_ptr = b;
810
+ nk_size_t remaining = points_count;
811
+ for (nk_size_t vector_length; remaining > 0;
812
+ remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
813
+ vector_length = __riscv_vsetvl_e64m1(remaining);
814
+ vfloat64m1x3_t a_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(a_ptr, vector_length);
815
+ vfloat64m1_t a_x_f64m1 = __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0);
816
+ vfloat64m1_t a_y_f64m1 = __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1);
817
+ vfloat64m1_t a_z_f64m1 = __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2);
818
+ vfloat64m1x3_t b_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(b_ptr, vector_length);
819
+ vfloat64m1_t b_x_f64m1 = __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0);
820
+ vfloat64m1_t b_y_f64m1 = __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1);
821
+ vfloat64m1_t b_z_f64m1 = __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2);
822
+ // Accumulate centroids with Kahan compensation.
823
+ nk_accumulate_sum_f64m1_rvv_(&sum_a_x_f64m1, &compensation_a_x_f64m1, a_x_f64m1, vector_length);
824
+ nk_accumulate_sum_f64m1_rvv_(&sum_a_y_f64m1, &compensation_a_y_f64m1, a_y_f64m1, vector_length);
825
+ nk_accumulate_sum_f64m1_rvv_(&sum_a_z_f64m1, &compensation_a_z_f64m1, a_z_f64m1, vector_length);
826
+ nk_accumulate_sum_f64m1_rvv_(&sum_b_x_f64m1, &compensation_b_x_f64m1, b_x_f64m1, vector_length);
827
+ nk_accumulate_sum_f64m1_rvv_(&sum_b_y_f64m1, &compensation_b_y_f64m1, b_y_f64m1, vector_length);
828
+ nk_accumulate_sum_f64m1_rvv_(&sum_b_z_f64m1, &compensation_b_z_f64m1, b_z_f64m1, vector_length);
829
+ // Accumulate (a-b)^2 per component.
830
+ vfloat64m1_t delta_x_f64m1 = __riscv_vfsub_vv_f64m1(a_x_f64m1, b_x_f64m1, vector_length);
831
+ vfloat64m1_t delta_y_f64m1 = __riscv_vfsub_vv_f64m1(a_y_f64m1, b_y_f64m1, vector_length);
832
+ vfloat64m1_t delta_z_f64m1 = __riscv_vfsub_vv_f64m1(a_z_f64m1, b_z_f64m1, vector_length);
833
+ vfloat64m1_t dist_sq_f64m1 = __riscv_vfmul_vv_f64m1(delta_x_f64m1, delta_x_f64m1, vector_length);
834
+ dist_sq_f64m1 = __riscv_vfmacc_vv_f64m1(dist_sq_f64m1, delta_y_f64m1, delta_y_f64m1, vector_length);
835
+ dist_sq_f64m1 = __riscv_vfmacc_vv_f64m1(dist_sq_f64m1, delta_z_f64m1, delta_z_f64m1, vector_length);
836
+ nk_accumulate_sum_f64m1_rvv_(&sum_squared_f64m1, &compensation_squared_f64m1, dist_sq_f64m1, vector_length);
837
+ }
838
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
839
+ nk_f64_t centroid_a_x = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_points_count;
840
+ nk_f64_t centroid_a_y = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_points_count;
841
+ nk_f64_t centroid_a_z = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_points_count;
842
+ nk_f64_t centroid_b_x = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_points_count;
843
+ nk_f64_t centroid_b_y = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_points_count;
844
+ nk_f64_t centroid_b_z = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_points_count;
845
+ if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
846
+ if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
847
+
848
+ nk_f64_t sum_squared = nk_dot_stable_sum_f64m1_rvv_(sum_squared_f64m1, compensation_squared_f64m1);
849
+ nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
850
+ mean_diff_z = centroid_a_z - centroid_b_z;
851
+ nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
852
+ *result = nk_f64_sqrt_rvv(sum_squared * inv_points_count - mean_diff_sq);
774
853
  }
775
854
 
776
- NK_PUBLIC void nk_kabsch_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
855
+ NK_PUBLIC void nk_kabsch_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
777
856
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
778
857
  if (scale) *scale = 1.0f;
779
- nk_f64_t ca_x, ca_y, ca_z, cb_x, cb_y, cb_z;
858
+ nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
780
859
  nk_f64_t h[9];
781
- nk_centroid_and_cross_covariance_f32_rvv_(a, b, n, &ca_x, &ca_y, &ca_z, &cb_x, &cb_y, &cb_z, h);
782
- if (a_centroid) a_centroid[0] = (nk_f32_t)ca_x, a_centroid[1] = (nk_f32_t)ca_y, a_centroid[2] = (nk_f32_t)ca_z;
783
- if (b_centroid) b_centroid[0] = (nk_f32_t)cb_x, b_centroid[1] = (nk_f32_t)cb_y, b_centroid[2] = (nk_f32_t)cb_z;
860
+ nk_centroid_and_cross_covariance_f32_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z,
861
+ &centroid_b_x, &centroid_b_y, &centroid_b_z, h);
862
+ if (a_centroid)
863
+ a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
864
+ a_centroid[2] = (nk_f32_t)centroid_a_z;
865
+ if (b_centroid)
866
+ b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
867
+ b_centroid[2] = (nk_f32_t)centroid_b_z;
784
868
  nk_f64_t svd_u[9], svd_s[9], svd_v[9];
785
869
  nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
786
870
  nk_f64_t r[9];
@@ -791,18 +875,20 @@ NK_PUBLIC void nk_kabsch_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
791
875
  }
792
876
  if (rotation)
793
877
  for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
794
- nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, n, r, 1.0, ca_x, ca_y, ca_z, cb_x, cb_y, cb_z);
795
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)n);
878
+ nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, points_count, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
879
+ centroid_b_x, centroid_b_y, centroid_b_z);
880
+ *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
796
881
  }
797
882
 
798
- NK_PUBLIC void nk_kabsch_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
883
+ NK_PUBLIC void nk_kabsch_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
799
884
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
800
885
  if (scale) *scale = 1.0;
801
- nk_f64_t ca_x, ca_y, ca_z, cb_x, cb_y, cb_z;
886
+ nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
802
887
  nk_f64_t h[9];
803
- nk_centroid_and_cross_covariance_f64_rvv_(a, b, n, &ca_x, &ca_y, &ca_z, &cb_x, &cb_y, &cb_z, h);
804
- if (a_centroid) a_centroid[0] = ca_x, a_centroid[1] = ca_y, a_centroid[2] = ca_z;
805
- if (b_centroid) b_centroid[0] = cb_x, b_centroid[1] = cb_y, b_centroid[2] = cb_z;
888
+ nk_centroid_and_cross_covariance_f64_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z,
889
+ &centroid_b_x, &centroid_b_y, &centroid_b_z, h);
890
+ if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
891
+ if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
806
892
  nk_f64_t svd_u[9], svd_s[9], svd_v[9];
807
893
  nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
808
894
  nk_f64_t r[9];
@@ -813,18 +899,24 @@ NK_PUBLIC void nk_kabsch_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t
813
899
  }
814
900
  if (rotation)
815
901
  for (int j = 0; j < 9; ++j) rotation[j] = r[j];
816
- nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, n, r, 1.0, ca_x, ca_y, ca_z, cb_x, cb_y, cb_z);
817
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)n);
902
+ nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, points_count, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
903
+ centroid_b_x, centroid_b_y, centroid_b_z);
904
+ *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
818
905
  }
819
906
 
820
- NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
907
+ NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
821
908
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
822
- nk_f64_t ca_x, ca_y, ca_z, cb_x, cb_y, cb_z;
909
+ nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
823
910
  nk_f64_t h[9], variance_a;
824
- nk_centroid_and_cross_covariance_and_variance_f32_rvv_(a, b, n, &ca_x, &ca_y, &ca_z, &cb_x, &cb_y, &cb_z, h,
825
- &variance_a);
826
- if (a_centroid) a_centroid[0] = (nk_f32_t)ca_x, a_centroid[1] = (nk_f32_t)ca_y, a_centroid[2] = (nk_f32_t)ca_z;
827
- if (b_centroid) b_centroid[0] = (nk_f32_t)cb_x, b_centroid[1] = (nk_f32_t)cb_y, b_centroid[2] = (nk_f32_t)cb_z;
911
+ nk_centroid_and_cross_covariance_and_variance_f32_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y,
912
+ &centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
913
+ h, &variance_a);
914
+ if (a_centroid)
915
+ a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
916
+ a_centroid[2] = (nk_f32_t)centroid_a_z;
917
+ if (b_centroid)
918
+ b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
919
+ b_centroid[2] = (nk_f32_t)centroid_b_z;
828
920
  nk_f64_t svd_u[9], svd_s[9], svd_v[9];
829
921
  nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
830
922
  nk_f64_t r[9];
@@ -832,7 +924,7 @@ NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_
832
924
  nk_f64_t det = nk_det3x3_f64_(r);
833
925
  nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
834
926
  nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_s[0], 1.0, svd_s[4], 1.0, svd_s[8], sign_det);
835
- nk_f64_t scale_factor = trace_ds / ((nk_f64_t)n * variance_a);
927
+ nk_f64_t scale_factor = trace_ds / ((nk_f64_t)points_count * variance_a);
836
928
  if (scale) *scale = (nk_f32_t)scale_factor;
837
929
  if (det < 0) {
838
930
  svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
@@ -840,18 +932,20 @@ NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_
840
932
  }
841
933
  if (rotation)
842
934
  for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
843
- nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, n, r, scale_factor, ca_x, ca_y, ca_z, cb_x, cb_y, cb_z);
844
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)n);
935
+ nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, points_count, r, scale_factor, centroid_a_x, centroid_a_y,
936
+ centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
937
+ *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
845
938
  }
846
939
 
847
- NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
940
+ NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
848
941
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
849
- nk_f64_t ca_x, ca_y, ca_z, cb_x, cb_y, cb_z;
942
+ nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
850
943
  nk_f64_t h[9], variance_a;
851
- nk_centroid_and_cross_covariance_and_variance_f64_rvv_(a, b, n, &ca_x, &ca_y, &ca_z, &cb_x, &cb_y, &cb_z, h,
852
- &variance_a);
853
- if (a_centroid) a_centroid[0] = ca_x, a_centroid[1] = ca_y, a_centroid[2] = ca_z;
854
- if (b_centroid) b_centroid[0] = cb_x, b_centroid[1] = cb_y, b_centroid[2] = cb_z;
944
+ nk_centroid_and_cross_covariance_and_variance_f64_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y,
945
+ &centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
946
+ h, &variance_a);
947
+ if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
948
+ if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
855
949
  nk_f64_t svd_u[9], svd_s[9], svd_v[9];
856
950
  nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
857
951
  nk_f64_t r[9];
@@ -859,7 +953,7 @@ NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_
859
953
  nk_f64_t det = nk_det3x3_f64_(r);
860
954
  nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
861
955
  nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_s[0], 1.0, svd_s[4], 1.0, svd_s[8], sign_det);
862
- nk_f64_t scale_factor = trace_ds / ((nk_f64_t)n * variance_a);
956
+ nk_f64_t scale_factor = trace_ds / ((nk_f64_t)points_count * variance_a);
863
957
  if (scale) *scale = scale_factor;
864
958
  if (det < 0) {
865
959
  svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
@@ -867,38 +961,39 @@ NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_
867
961
  }
868
962
  if (rotation)
869
963
  for (int j = 0; j < 9; ++j) rotation[j] = r[j];
870
- nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, n, r, scale_factor, ca_x, ca_y, ca_z, cb_x, cb_y, cb_z);
871
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)n);
964
+ nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, points_count, r, scale_factor, centroid_a_x, centroid_a_y,
965
+ centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
966
+ *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
872
967
  }
873
968
 
874
- NK_PUBLIC void nk_rmsd_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
969
+ NK_PUBLIC void nk_rmsd_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
875
970
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
876
- nk_rmsd_f16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
971
+ nk_rmsd_f16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
877
972
  }
878
973
 
879
- NK_PUBLIC void nk_kabsch_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
974
+ NK_PUBLIC void nk_kabsch_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
880
975
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
881
- nk_kabsch_f16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
976
+ nk_kabsch_f16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
882
977
  }
883
978
 
884
- NK_PUBLIC void nk_umeyama_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
979
+ NK_PUBLIC void nk_umeyama_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
885
980
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
886
- nk_umeyama_f16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
981
+ nk_umeyama_f16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
887
982
  }
888
983
 
889
- NK_PUBLIC void nk_rmsd_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
984
+ NK_PUBLIC void nk_rmsd_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
890
985
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
891
- nk_rmsd_bf16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
986
+ nk_rmsd_bf16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
892
987
  }
893
988
 
894
- NK_PUBLIC void nk_kabsch_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
989
+ NK_PUBLIC void nk_kabsch_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
895
990
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
896
- nk_kabsch_bf16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
991
+ nk_kabsch_bf16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
897
992
  }
898
993
 
899
- NK_PUBLIC void nk_umeyama_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
994
+ NK_PUBLIC void nk_umeyama_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
900
995
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
901
- nk_umeyama_bf16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
996
+ nk_umeyama_bf16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
902
997
  }
903
998
 
904
999
  #if defined(__cplusplus)