numkong 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/binding.gyp +18 -0
  2. package/c/dispatch_e5m2.c +23 -3
  3. package/include/numkong/capabilities.h +1 -1
  4. package/include/numkong/cast/README.md +3 -0
  5. package/include/numkong/cast/haswell.h +28 -64
  6. package/include/numkong/cast/serial.h +17 -0
  7. package/include/numkong/cast/skylake.h +67 -52
  8. package/include/numkong/cast.h +1 -0
  9. package/include/numkong/dot/README.md +1 -0
  10. package/include/numkong/dot/haswell.h +92 -13
  11. package/include/numkong/dot/serial.h +15 -0
  12. package/include/numkong/dot/skylake.h +61 -14
  13. package/include/numkong/dots/README.md +2 -0
  14. package/include/numkong/dots/graniteamx.h +434 -0
  15. package/include/numkong/dots/haswell.h +28 -28
  16. package/include/numkong/dots/sapphireamx.h +1 -1
  17. package/include/numkong/dots/serial.h +23 -8
  18. package/include/numkong/dots/skylake.h +28 -23
  19. package/include/numkong/dots.h +12 -0
  20. package/include/numkong/each/serial.h +18 -1
  21. package/include/numkong/geospatial/serial.h +14 -3
  22. package/include/numkong/maxsim/serial.h +15 -0
  23. package/include/numkong/mesh/README.md +50 -44
  24. package/include/numkong/mesh/genoa.h +462 -0
  25. package/include/numkong/mesh/haswell.h +806 -933
  26. package/include/numkong/mesh/neon.h +871 -943
  27. package/include/numkong/mesh/neonbfdot.h +382 -522
  28. package/include/numkong/mesh/neonfhm.h +676 -0
  29. package/include/numkong/mesh/rvv.h +404 -319
  30. package/include/numkong/mesh/serial.h +204 -162
  31. package/include/numkong/mesh/skylake.h +1029 -1585
  32. package/include/numkong/mesh/v128relaxed.h +403 -377
  33. package/include/numkong/mesh.h +38 -0
  34. package/include/numkong/reduce/serial.h +15 -1
  35. package/include/numkong/sparse/serial.h +17 -2
  36. package/include/numkong/spatial/genoa.h +0 -68
  37. package/include/numkong/spatial/haswell.h +98 -56
  38. package/include/numkong/spatial/serial.h +15 -0
  39. package/include/numkong/spatial/skylake.h +114 -54
  40. package/include/numkong/spatial.h +0 -12
  41. package/include/numkong/spatials/graniteamx.h +128 -0
  42. package/include/numkong/spatials/serial.h +18 -1
  43. package/include/numkong/spatials/skylake.h +2 -2
  44. package/include/numkong/spatials.h +17 -0
  45. package/include/numkong/tensor.hpp +107 -23
  46. package/javascript/numkong.c +3 -2
  47. package/package.json +7 -7
  48. package/wasm/numkong.wasm +0 -0
@@ -16,8 +16,8 @@
16
16
  * Fused helpers minimize data passes:
17
17
  *
18
18
  * - RMSD: fully fused single-pass (centroids + squared diffs), no separate helper
19
- * - `nk_centroid_and_cross_covariance_*_rvv_`: centroids + H in one pass (Kabsch)
20
- * - `nk_centroid_and_cross_covariance_and_variance_*_rvv_`: + variance (Umeyama)
19
+ * - `nk_centroid_and_cross_covariance_*_rvv_`: centroids + H + centered ‖·‖² of a and b (Kabsch)
20
+ * - `nk_centroid_and_cross_covariance_and_variance_*_rvv_`: same outputs, used by Umeyama
21
21
  *
22
22
  * Math for fused centroid+covariance:
23
23
  * H[i][j] = Σ (a[i] - ca[i]) * (b[j] - cb[j])
@@ -104,7 +104,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
104
104
  nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
105
105
  nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
106
106
  nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
107
- nk_f64_t h[9]) {
107
+ nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
108
108
  nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
109
109
  vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
110
110
  sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
@@ -121,6 +121,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
121
121
  vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
122
122
  cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
123
123
  vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
124
+ vfloat64m2_t norm_squared_a_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
125
+ vfloat64m2_t norm_squared_b_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
124
126
  nk_f32_t const *a_ptr = a, *b_ptr = b;
125
127
  nk_size_t remaining = points_count;
126
128
  for (nk_size_t vector_length; remaining > 0;
@@ -149,6 +151,13 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
149
151
  cross_20_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_20_f64m2, a_z_f32m1, b_x_f32m1, vector_length);
150
152
  cross_21_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_21_f64m2, a_z_f32m1, b_y_f32m1, vector_length);
151
153
  cross_22_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_22_f64m2, a_z_f32m1, b_z_f32m1, vector_length);
154
+ // Accumulate norm-squared for a and b (uncentered; centering fixup applied after reduction).
155
+ norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_x_f32m1, a_x_f32m1, vector_length);
156
+ norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_y_f32m1, a_y_f32m1, vector_length);
157
+ norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_z_f32m1, a_z_f32m1, vector_length);
158
+ norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_x_f32m1, b_x_f32m1, vector_length);
159
+ norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_y_f32m1, b_y_f32m1, vector_length);
160
+ norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_z_f32m1, b_z_f32m1, vector_length);
152
161
  }
153
162
  vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
154
163
  // Compute centroids
@@ -179,24 +188,46 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
179
188
  *centroid_b_z = centroid_b_z_f64;
180
189
  // Fix up: H[i][j] = raw[i][j] - points_count * ca[i] * cb[j]
181
190
  nk_f64_t n_f64 = (nk_f64_t)points_count;
182
- h[0] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
183
- n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
184
- h[1] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
185
- n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
186
- h[2] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
187
- n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
188
- h[3] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
189
- n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
190
- h[4] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
191
- n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
192
- h[5] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
193
- n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
194
- h[6] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
195
- n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
196
- h[7] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
197
- n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
198
- h[8] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
199
- n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
191
+ cross_covariance[0] = __riscv_vfmv_f_s_f64m1_f64(
192
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
193
+ n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
194
+ cross_covariance[1] = __riscv_vfmv_f_s_f64m1_f64(
195
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
196
+ n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
197
+ cross_covariance[2] = __riscv_vfmv_f_s_f64m1_f64(
198
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
199
+ n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
200
+ cross_covariance[3] = __riscv_vfmv_f_s_f64m1_f64(
201
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
202
+ n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
203
+ cross_covariance[4] = __riscv_vfmv_f_s_f64m1_f64(
204
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
205
+ n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
206
+ cross_covariance[5] = __riscv_vfmv_f_s_f64m1_f64(
207
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
208
+ n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
209
+ cross_covariance[6] = __riscv_vfmv_f_s_f64m1_f64(
210
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
211
+ n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
212
+ cross_covariance[7] = __riscv_vfmv_f_s_f64m1_f64(
213
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
214
+ n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
215
+ cross_covariance[8] = __riscv_vfmv_f_s_f64m1_f64(
216
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
217
+ n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
218
+ // Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
219
+ nk_f64_t norm_squared_a_sum = __riscv_vfmv_f_s_f64m1_f64(
220
+ __riscv_vfredusum_vs_f64m2_f64m1(norm_squared_a_f64m2, zero_f64m1, max_vector_length));
221
+ nk_f64_t norm_squared_b_sum = __riscv_vfmv_f_s_f64m1_f64(
222
+ __riscv_vfredusum_vs_f64m2_f64m1(norm_squared_b_f64m2, zero_f64m1, max_vector_length));
223
+ *centered_norm_squared_a = norm_squared_a_sum -
224
+ n_f64 * (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
225
+ centroid_a_z_f64 * centroid_a_z_f64);
226
+ *centered_norm_squared_b = norm_squared_b_sum -
227
+ n_f64 * (centroid_b_x_f64 * centroid_b_x_f64 + centroid_b_y_f64 * centroid_b_y_f64 +
228
+ centroid_b_z_f64 * centroid_b_z_f64);
229
+ if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
230
+ if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
200
231
  }
201
232
 
202
233
  /**
@@ -209,7 +240,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
209
240
  nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
210
241
  nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
211
242
  nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
212
- nk_f64_t h[9]) {
243
+ nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
213
244
  nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
214
245
  vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
215
246
  sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
@@ -241,6 +272,10 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
241
272
  vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
242
273
  vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
243
274
  vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
275
+ vfloat64m1_t norm_squared_a_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
276
+ vfloat64m1_t norm_squared_b_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
277
+ vfloat64m1_t compensation_norm_squared_a_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
278
+ vfloat64m1_t compensation_norm_squared_b_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
244
279
  nk_f64_t const *a_ptr = a, *b_ptr = b;
245
280
  nk_size_t remaining = points_count;
246
281
  for (nk_size_t vector_length; remaining > 0;
@@ -269,6 +304,19 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
269
304
  nk_accumulate_product_f64m1_rvv_(&cross_20_f64m1, &compensation_20_f64m1, a_z_f64m1, b_x_f64m1, vector_length);
270
305
  nk_accumulate_product_f64m1_rvv_(&cross_21_f64m1, &compensation_21_f64m1, a_z_f64m1, b_y_f64m1, vector_length);
271
306
  nk_accumulate_product_f64m1_rvv_(&cross_22_f64m1, &compensation_22_f64m1, a_z_f64m1, b_z_f64m1, vector_length);
307
+ // Accumulate norm-squared for a and b via Kahan-compensated products (self*self).
308
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_x_f64m1,
309
+ a_x_f64m1, vector_length);
310
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_y_f64m1,
311
+ a_y_f64m1, vector_length);
312
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_z_f64m1,
313
+ a_z_f64m1, vector_length);
314
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_x_f64m1,
315
+ b_x_f64m1, vector_length);
316
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_y_f64m1,
317
+ b_y_f64m1, vector_length);
318
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_z_f64m1,
319
+ b_z_f64m1, vector_length);
272
320
  }
273
321
  // Compute centroids.
274
322
  nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
@@ -285,32 +333,45 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
285
333
  *centroid_b_y = centroid_b_y_f64;
286
334
  *centroid_b_z = centroid_b_z_f64;
287
335
  nk_f64_t n_f64 = (nk_f64_t)points_count;
288
- h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
289
- n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
290
- h[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
291
- n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
292
- h[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
293
- n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
294
- h[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
295
- n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
296
- h[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
297
- n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
298
- h[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
299
- n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
300
- h[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
301
- n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
302
- h[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
303
- n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
304
- h[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
305
- n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
336
+ cross_covariance[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
337
+ n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
338
+ cross_covariance[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
339
+ n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
340
+ cross_covariance[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
341
+ n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
342
+ cross_covariance[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
343
+ n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
344
+ cross_covariance[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
345
+ n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
346
+ cross_covariance[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
347
+ n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
348
+ cross_covariance[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
349
+ n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
350
+ cross_covariance[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
351
+ n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
352
+ cross_covariance[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
353
+ n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
354
+ // Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
355
+ nk_f64_t norm_squared_a_sum = nk_dot_stable_sum_f64m1_rvv_(norm_squared_a_f64m1, compensation_norm_squared_a_f64m1);
356
+ nk_f64_t norm_squared_b_sum = nk_dot_stable_sum_f64m1_rvv_(norm_squared_b_f64m1, compensation_norm_squared_b_f64m1);
357
+ *centered_norm_squared_a = norm_squared_a_sum -
358
+ n_f64 * (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
359
+ centroid_a_z_f64 * centroid_a_z_f64);
360
+ *centered_norm_squared_b = norm_squared_b_sum -
361
+ n_f64 * (centroid_b_x_f64 * centroid_b_x_f64 + centroid_b_y_f64 * centroid_b_y_f64 +
362
+ centroid_b_z_f64 * centroid_b_z_f64);
363
+ if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
364
+ if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
306
365
  }
307
366
 
308
367
  /**
309
- * @brief Compute centroids, cross-covariance, and variance_a in a single pass (f32).
368
+ * @brief Compute centroids, cross-covariance, and centered norm-squared of both point sets (f32).
310
369
  *
311
- * Same as centroid_and_cross_covariance but also computes:
312
- * variance_a = (1/n) * Σ ||a[i] - ca||²
313
- * = (1/n) * (Σ ||a[i]||² - n * ||ca||²)
370
+ * Same as centroid_and_cross_covariance but also outputs:
371
+ * centered_norm_squared_a = Σ ||a[i] - ca||² = Σ ||a[i]||² - n * ||ca||²
372
+ * centered_norm_squared_b = Σ ||b[i] - cb||² = Σ ||b[i]||² - n * ||cb||²
373
+ *
374
+ * These enable the trace-identity SSD fold in Kabsch/Umeyama callers.
314
375
  *
315
376
  * Cross-products use per-lane `vfwmacc_vv` accumulation (vfloat64m2_t) with
316
377
  * deferred `vfredusum` after the loop — eliminates 9 reductions per iteration.
@@ -319,7 +380,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
319
380
  nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
320
381
  nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
321
382
  nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
322
- nk_f64_t h[9], nk_f64_t *variance_a) {
383
+ nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
323
384
  nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
324
385
  vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
325
386
  sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
@@ -336,7 +397,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
336
397
  vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
337
398
  cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
338
399
  vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
339
- vfloat64m2_t sum_norm_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
400
+ vfloat64m2_t norm_squared_a_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
401
+ vfloat64m2_t norm_squared_b_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
340
402
  nk_f32_t const *a_ptr = a, *b_ptr = b;
341
403
  nk_size_t remaining = points_count;
342
404
  for (nk_size_t vector_length; remaining > 0;
@@ -365,12 +427,13 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
365
427
  cross_20_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_20_f64m2, a_z_f32m1, b_x_f32m1, vector_length);
366
428
  cross_21_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_21_f64m2, a_z_f32m1, b_y_f32m1, vector_length);
367
429
  cross_22_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_22_f64m2, a_z_f32m1, b_z_f32m1, vector_length);
368
- // Variance: Σ (a_x² + a_y² + a_z²) raw, not centered.
369
- vfloat64m2_t norm_squared_f64m2 = __riscv_vfwmul_vv_f64m2(a_x_f32m1, a_x_f32m1, vector_length);
370
- norm_squared_f64m2 = __riscv_vfwmacc_vv_f64m2(norm_squared_f64m2, a_y_f32m1, a_y_f32m1, vector_length);
371
- norm_squared_f64m2 = __riscv_vfwmacc_vv_f64m2(norm_squared_f64m2, a_z_f32m1, a_z_f32m1, vector_length);
372
- sum_norm_squared_f64m2 = __riscv_vfadd_vv_f64m2_tu(sum_norm_squared_f64m2, sum_norm_squared_f64m2,
373
- norm_squared_f64m2, vector_length);
430
+ // Accumulate norm-squared for a and b (uncentered; centering fixup applied after reduction).
431
+ norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_x_f32m1, a_x_f32m1, vector_length);
432
+ norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_y_f32m1, a_y_f32m1, vector_length);
433
+ norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_z_f32m1, a_z_f32m1, vector_length);
434
+ norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_x_f32m1, b_x_f32m1, vector_length);
435
+ norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_y_f32m1, b_y_f32m1, vector_length);
436
+ norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_z_f32m1, b_z_f32m1, vector_length);
374
437
  }
375
438
  vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
376
439
  nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
@@ -399,35 +462,52 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
399
462
  *centroid_b_y = centroid_b_y_f64;
400
463
  *centroid_b_z = centroid_b_z_f64;
401
464
  nk_f64_t n_f64 = (nk_f64_t)points_count;
402
- h[0] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
403
- n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
404
- h[1] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
405
- n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
406
- h[2] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
407
- n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
408
- h[3] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
409
- n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
410
- h[4] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
411
- n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
412
- h[5] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
413
- n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
414
- h[6] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
415
- n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
416
- h[7] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
417
- n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
418
- h[8] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
419
- n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
420
- // variance_a = (1/points_count) * (Σ ||a[i]||² - points_count * ||ca||²)
421
- *variance_a = __riscv_vfmv_f_s_f64m1_f64(
422
- __riscv_vfredusum_vs_f64m2_f64m1(sum_norm_squared_f64m2, zero_f64m1, max_vector_length)) *
423
- inv_points_count -
424
- (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
425
- centroid_a_z_f64 * centroid_a_z_f64);
465
+ cross_covariance[0] = __riscv_vfmv_f_s_f64m1_f64(
466
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
467
+ n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
468
+ cross_covariance[1] = __riscv_vfmv_f_s_f64m1_f64(
469
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
470
+ n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
471
+ cross_covariance[2] = __riscv_vfmv_f_s_f64m1_f64(
472
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
473
+ n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
474
+ cross_covariance[3] = __riscv_vfmv_f_s_f64m1_f64(
475
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
476
+ n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
477
+ cross_covariance[4] = __riscv_vfmv_f_s_f64m1_f64(
478
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
479
+ n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
480
+ cross_covariance[5] = __riscv_vfmv_f_s_f64m1_f64(
481
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
482
+ n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
483
+ cross_covariance[6] = __riscv_vfmv_f_s_f64m1_f64(
484
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
485
+ n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
486
+ cross_covariance[7] = __riscv_vfmv_f_s_f64m1_f64(
487
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
488
+ n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
489
+ cross_covariance[8] = __riscv_vfmv_f_s_f64m1_f64(
490
+ __riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
491
+ n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
492
+ // Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
493
+ nk_f64_t norm_squared_a_sum = __riscv_vfmv_f_s_f64m1_f64(
494
+ __riscv_vfredusum_vs_f64m2_f64m1(norm_squared_a_f64m2, zero_f64m1, max_vector_length));
495
+ nk_f64_t norm_squared_b_sum = __riscv_vfmv_f_s_f64m1_f64(
496
+ __riscv_vfredusum_vs_f64m2_f64m1(norm_squared_b_f64m2, zero_f64m1, max_vector_length));
497
+ *centered_norm_squared_a = norm_squared_a_sum -
498
+ n_f64 * (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
499
+ centroid_a_z_f64 * centroid_a_z_f64);
500
+ *centered_norm_squared_b = norm_squared_b_sum -
501
+ n_f64 * (centroid_b_x_f64 * centroid_b_x_f64 + centroid_b_y_f64 * centroid_b_y_f64 +
502
+ centroid_b_z_f64 * centroid_b_z_f64);
503
+ if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
504
+ if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
426
505
  }
427
506
 
428
507
  /**
429
- * @brief Compute centroids, cross-covariance, and variance_a in a single pass (f64).
508
+ * @brief Compute centroids, cross-covariance, and centered norm-squared of both point sets (f64).
430
509
  *
510
+ * Same outputs as the f32 variant; used by the Umeyama caller for the trace-identity SSD fold.
431
511
  * Per-lane `vfadd_vv`/`vfmacc_vv` accumulation with deferred `vfredusum` after the loop
432
512
  * — eliminates 16 horizontal reductions per iteration.
433
513
  */
@@ -435,7 +515,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
435
515
  nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
436
516
  nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
437
517
  nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
438
- nk_f64_t h[9], nk_f64_t *variance_a) {
518
+ nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
439
519
  nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
440
520
  vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
441
521
  sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
@@ -467,8 +547,10 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
467
547
  vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
468
548
  vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
469
549
  vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
470
- vfloat64m1_t sum_norm_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
471
- vfloat64m1_t compensation_norm_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
550
+ vfloat64m1_t norm_squared_a_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
551
+ vfloat64m1_t norm_squared_b_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
552
+ vfloat64m1_t compensation_norm_squared_a_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
553
+ vfloat64m1_t compensation_norm_squared_b_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
472
554
  nk_f64_t const *a_ptr = a, *b_ptr = b;
473
555
  nk_size_t remaining = points_count;
474
556
  for (nk_size_t vector_length; remaining > 0;
@@ -497,11 +579,19 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
497
579
  nk_accumulate_product_f64m1_rvv_(&cross_20_f64m1, &compensation_20_f64m1, a_z_f64m1, b_x_f64m1, vector_length);
498
580
  nk_accumulate_product_f64m1_rvv_(&cross_21_f64m1, &compensation_21_f64m1, a_z_f64m1, b_y_f64m1, vector_length);
499
581
  nk_accumulate_product_f64m1_rvv_(&cross_22_f64m1, &compensation_22_f64m1, a_z_f64m1, b_z_f64m1, vector_length);
500
- vfloat64m1_t norm_squared_f64m1 = __riscv_vfmul_vv_f64m1(a_x_f64m1, a_x_f64m1, vector_length);
501
- norm_squared_f64m1 = __riscv_vfmacc_vv_f64m1(norm_squared_f64m1, a_y_f64m1, a_y_f64m1, vector_length);
502
- norm_squared_f64m1 = __riscv_vfmacc_vv_f64m1(norm_squared_f64m1, a_z_f64m1, a_z_f64m1, vector_length);
503
- nk_accumulate_sum_f64m1_rvv_(&sum_norm_squared_f64m1, &compensation_norm_squared_f64m1, norm_squared_f64m1,
504
- vector_length);
582
+ // Accumulate norm-squared for a and b via Kahan-compensated products (self*self).
583
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_x_f64m1,
584
+ a_x_f64m1, vector_length);
585
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_y_f64m1,
586
+ a_y_f64m1, vector_length);
587
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_z_f64m1,
588
+ a_z_f64m1, vector_length);
589
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_x_f64m1,
590
+ b_x_f64m1, vector_length);
591
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_y_f64m1,
592
+ b_y_f64m1, vector_length);
593
+ nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_z_f64m1,
594
+ b_z_f64m1, vector_length);
505
595
  }
506
596
  nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
507
597
  nk_f64_t centroid_a_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_points_count;
@@ -517,182 +607,59 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
517
607
  *centroid_b_y = centroid_b_y_f64;
518
608
  *centroid_b_z = centroid_b_z_f64;
519
609
  nk_f64_t n_f64 = (nk_f64_t)points_count;
520
- h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
521
- n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
522
- h[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
523
- n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
524
- h[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
525
- n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
526
- h[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
527
- n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
528
- h[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
529
- n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
530
- h[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
531
- n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
532
- h[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
533
- n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
534
- h[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
535
- n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
536
- h[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
537
- n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
538
- *variance_a = nk_dot_stable_sum_f64m1_rvv_(sum_norm_squared_f64m1, compensation_norm_squared_f64m1) *
539
- inv_points_count -
540
- (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
541
- centroid_a_z_f64 * centroid_a_z_f64);
542
- }
543
-
544
- NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_( //
545
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
546
- nk_f64_t const *r, nk_f64_t scale, //
547
- nk_f64_t centroid_a_x, nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, //
548
- nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
549
- nk_f64_t scaled_rotation_x_x = scale * r[0], scaled_rotation_x_y = scale * r[1], scaled_rotation_x_z = scale * r[2];
550
- nk_f64_t scaled_rotation_y_x = scale * r[3], scaled_rotation_y_y = scale * r[4], scaled_rotation_y_z = scale * r[5];
551
- nk_f64_t scaled_rotation_z_x = scale * r[6], scaled_rotation_z_y = scale * r[7], scaled_rotation_z_z = scale * r[8];
552
- nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
553
- vfloat64m2_t sum_distance_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
554
- vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
555
- nk_f32_t const *a_ptr = a, *b_ptr = b;
556
- nk_size_t remaining = points_count;
557
- for (nk_size_t vector_length; remaining > 0;
558
- remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
559
- vector_length = __riscv_vsetvl_e32m1(remaining);
560
- vfloat32m1x3_t a_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(a_ptr, vector_length);
561
- vfloat64m2_t centered_a_x_f64m2 = __riscv_vfsub_vf_f64m2(
562
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0), vector_length), centroid_a_x,
563
- vector_length);
564
- vfloat64m2_t centered_a_y_f64m2 = __riscv_vfsub_vf_f64m2(
565
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1), vector_length), centroid_a_y,
566
- vector_length);
567
- vfloat64m2_t centered_a_z_f64m2 = __riscv_vfsub_vf_f64m2(
568
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2), vector_length), centroid_a_z,
569
- vector_length);
570
- vfloat64m2_t rotated_a_x_f64m2 = __riscv_vfmul_vf_f64m2(centered_a_x_f64m2, scaled_rotation_x_x, vector_length);
571
- rotated_a_x_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_x_f64m2, scaled_rotation_x_y, centered_a_y_f64m2,
572
- vector_length);
573
- rotated_a_x_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_x_f64m2, scaled_rotation_x_z, centered_a_z_f64m2,
574
- vector_length);
575
- vfloat64m2_t rotated_a_y_f64m2 = __riscv_vfmul_vf_f64m2(centered_a_x_f64m2, scaled_rotation_y_x, vector_length);
576
- rotated_a_y_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_y_f64m2, scaled_rotation_y_y, centered_a_y_f64m2,
577
- vector_length);
578
- rotated_a_y_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_y_f64m2, scaled_rotation_y_z, centered_a_z_f64m2,
579
- vector_length);
580
- vfloat64m2_t rotated_a_z_f64m2 = __riscv_vfmul_vf_f64m2(centered_a_x_f64m2, scaled_rotation_z_x, vector_length);
581
- rotated_a_z_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_z_f64m2, scaled_rotation_z_y, centered_a_y_f64m2,
582
- vector_length);
583
- rotated_a_z_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_z_f64m2, scaled_rotation_z_z, centered_a_z_f64m2,
584
- vector_length);
585
- vfloat32m1x3_t b_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(b_ptr, vector_length);
586
- vfloat64m2_t centered_b_x_f64m2 = __riscv_vfsub_vf_f64m2(
587
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0), vector_length), centroid_b_x,
588
- vector_length);
589
- vfloat64m2_t centered_b_y_f64m2 = __riscv_vfsub_vf_f64m2(
590
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1), vector_length), centroid_b_y,
591
- vector_length);
592
- vfloat64m2_t centered_b_z_f64m2 = __riscv_vfsub_vf_f64m2(
593
- __riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2), vector_length), centroid_b_z,
594
- vector_length);
595
- vfloat64m2_t delta_x_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_x_f64m2, centered_b_x_f64m2, vector_length);
596
- vfloat64m2_t delta_y_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_y_f64m2, centered_b_y_f64m2, vector_length);
597
- vfloat64m2_t delta_z_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_z_f64m2, centered_b_z_f64m2, vector_length);
598
- sum_distance_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_distance_squared_f64m2, delta_x_f64m2,
599
- delta_x_f64m2, vector_length);
600
- sum_distance_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_distance_squared_f64m2, delta_y_f64m2,
601
- delta_y_f64m2, vector_length);
602
- sum_distance_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_distance_squared_f64m2, delta_z_f64m2,
603
- delta_z_f64m2, vector_length);
604
- }
605
- return __riscv_vfmv_f_s_f64m1_f64(
606
- __riscv_vfredusum_vs_f64m2_f64m1(sum_distance_squared_f64m2, zero_f64m1, max_vector_length));
607
- }
608
-
609
- NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_rvv_( //
610
- nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
611
- nk_f64_t const *r, nk_f64_t scale, //
612
- nk_f64_t centroid_a_x, nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, //
613
- nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
614
- nk_f64_t scaled_rotation_x_x = scale * r[0], scaled_rotation_x_y = scale * r[1], scaled_rotation_x_z = scale * r[2];
615
- nk_f64_t scaled_rotation_y_x = scale * r[3], scaled_rotation_y_y = scale * r[4], scaled_rotation_y_z = scale * r[5];
616
- nk_f64_t scaled_rotation_z_x = scale * r[6], scaled_rotation_z_y = scale * r[7], scaled_rotation_z_z = scale * r[8];
617
- nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
618
- vfloat64m1_t sum_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
619
- vfloat64m1_t compensation_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
620
- nk_f64_t const *a_ptr = a, *b_ptr = b;
621
- nk_size_t remaining = points_count;
622
- for (nk_size_t vector_length; remaining > 0;
623
- remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
624
- vector_length = __riscv_vsetvl_e64m1(remaining);
625
- vfloat64m1x3_t a_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(a_ptr, vector_length);
626
- vfloat64m1_t centered_a_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0),
627
- centroid_a_x, vector_length);
628
- vfloat64m1_t centered_a_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1),
629
- centroid_a_y, vector_length);
630
- vfloat64m1_t centered_a_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2),
631
- centroid_a_z, vector_length);
632
- vfloat64m1_t rotated_a_x_f64m1 = __riscv_vfmul_vf_f64m1(centered_a_x_f64m1, scaled_rotation_x_x, vector_length);
633
- rotated_a_x_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_x_f64m1, scaled_rotation_x_y, centered_a_y_f64m1,
634
- vector_length);
635
- rotated_a_x_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_x_f64m1, scaled_rotation_x_z, centered_a_z_f64m1,
636
- vector_length);
637
- vfloat64m1_t rotated_a_y_f64m1 = __riscv_vfmul_vf_f64m1(centered_a_x_f64m1, scaled_rotation_y_x, vector_length);
638
- rotated_a_y_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_y_f64m1, scaled_rotation_y_y, centered_a_y_f64m1,
639
- vector_length);
640
- rotated_a_y_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_y_f64m1, scaled_rotation_y_z, centered_a_z_f64m1,
641
- vector_length);
642
- vfloat64m1_t rotated_a_z_f64m1 = __riscv_vfmul_vf_f64m1(centered_a_x_f64m1, scaled_rotation_z_x, vector_length);
643
- rotated_a_z_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_z_f64m1, scaled_rotation_z_y, centered_a_y_f64m1,
644
- vector_length);
645
- rotated_a_z_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_z_f64m1, scaled_rotation_z_z, centered_a_z_f64m1,
646
- vector_length);
647
- vfloat64m1x3_t b_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(b_ptr, vector_length);
648
- vfloat64m1_t centered_b_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0),
649
- centroid_b_x, vector_length);
650
- vfloat64m1_t centered_b_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1),
651
- centroid_b_y, vector_length);
652
- vfloat64m1_t centered_b_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2),
653
- centroid_b_z, vector_length);
654
- vfloat64m1_t delta_x_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_x_f64m1, centered_b_x_f64m1, vector_length);
655
- vfloat64m1_t delta_y_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_y_f64m1, centered_b_y_f64m1, vector_length);
656
- vfloat64m1_t delta_z_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_z_f64m1, centered_b_z_f64m1, vector_length);
657
- vfloat64m1_t distance_squared_f64m1 = __riscv_vfmul_vv_f64m1(delta_x_f64m1, delta_x_f64m1, vector_length);
658
- distance_squared_f64m1 = __riscv_vfmacc_vv_f64m1(distance_squared_f64m1, delta_y_f64m1, delta_y_f64m1,
659
- vector_length);
660
- distance_squared_f64m1 = __riscv_vfmacc_vv_f64m1(distance_squared_f64m1, delta_z_f64m1, delta_z_f64m1,
661
- vector_length);
662
- nk_accumulate_sum_f64m1_rvv_(&sum_distance_squared_f64m1, &compensation_distance_squared_f64m1,
663
- distance_squared_f64m1, vector_length);
664
- }
665
- return nk_dot_stable_sum_f64m1_rvv_(sum_distance_squared_f64m1, compensation_distance_squared_f64m1);
610
+ cross_covariance[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
611
+ n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
612
+ cross_covariance[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
613
+ n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
614
+ cross_covariance[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
615
+ n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
616
+ cross_covariance[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
617
+ n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
618
+ cross_covariance[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
619
+ n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
620
+ cross_covariance[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
621
+ n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
622
+ cross_covariance[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
623
+ n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
624
+ cross_covariance[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
625
+ n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
626
+ cross_covariance[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
627
+ n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
628
+ // Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
629
+ nk_f64_t norm_squared_a_sum = nk_dot_stable_sum_f64m1_rvv_(norm_squared_a_f64m1, compensation_norm_squared_a_f64m1);
630
+ nk_f64_t norm_squared_b_sum = nk_dot_stable_sum_f64m1_rvv_(norm_squared_b_f64m1, compensation_norm_squared_b_f64m1);
631
+ *centered_norm_squared_a = norm_squared_a_sum -
632
+ n_f64 * (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
633
+ centroid_a_z_f64 * centroid_a_z_f64);
634
+ *centered_norm_squared_b = norm_squared_b_sum -
635
+ n_f64 * (centroid_b_x_f64 * centroid_b_x_f64 + centroid_b_y_f64 * centroid_b_y_f64 +
636
+ centroid_b_z_f64 * centroid_b_z_f64);
637
+ if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
638
+ if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
666
639
  }
667
640
 
668
641
  /** @brief Compute R = V * Uᵀ from SVD factors (f32), vectorized with `vfmul_vf`/`vfmacc_vf`. */
669
642
  NK_INTERNAL void nk_rotation_from_svd_f32_rvv_( //
670
- nk_f32_t *svd_u, nk_f32_t *svd_v, nk_f32_t r[9]) {
643
+ nk_f32_t *svd_left, nk_f32_t *svd_right, nk_f32_t optimal_rotation[9]) {
671
644
  nk_size_t vl3 = __riscv_vsetvl_e32m1(3);
672
- vfloat32m1_t u_row0_f32m1 = __riscv_vle32_v_f32m1(svd_u + 0, vl3);
673
- vfloat32m1_t u_row1_f32m1 = __riscv_vle32_v_f32m1(svd_u + 3, vl3);
674
- vfloat32m1_t u_row2_f32m1 = __riscv_vle32_v_f32m1(svd_u + 6, vl3);
645
+ vfloat32m1_t u_row0_f32m1 = __riscv_vle32_v_f32m1(svd_left + 0, vl3);
646
+ vfloat32m1_t u_row1_f32m1 = __riscv_vle32_v_f32m1(svd_left + 3, vl3);
647
+ vfloat32m1_t u_row2_f32m1 = __riscv_vle32_v_f32m1(svd_left + 6, vl3);
675
648
  // Row 0: R[0..2] = V[0]*U_row0 + V[1]*U_row1 + V[2]*U_row2
676
- vfloat32m1_t rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_v[0], vl3);
677
- rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_v[1], u_row1_f32m1, vl3);
678
- rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_v[2], u_row2_f32m1, vl3);
679
- __riscv_vse32_v_f32m1(r + 0, rotation_row_f32m1, vl3);
649
+ vfloat32m1_t rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_right[0], vl3);
650
+ rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[1], u_row1_f32m1, vl3);
651
+ rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[2], u_row2_f32m1, vl3);
652
+ __riscv_vse32_v_f32m1(optimal_rotation + 0, rotation_row_f32m1, vl3);
680
653
  // Row 1: R[3..5]
681
- rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_v[3], vl3);
682
- rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_v[4], u_row1_f32m1, vl3);
683
- rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_v[5], u_row2_f32m1, vl3);
684
- __riscv_vse32_v_f32m1(r + 3, rotation_row_f32m1, vl3);
654
+ rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_right[3], vl3);
655
+ rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[4], u_row1_f32m1, vl3);
656
+ rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[5], u_row2_f32m1, vl3);
657
+ __riscv_vse32_v_f32m1(optimal_rotation + 3, rotation_row_f32m1, vl3);
685
658
  // Row 2: R[6..8]
686
- rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_v[6], vl3);
687
- rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_v[7], u_row1_f32m1, vl3);
688
- rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_v[8], u_row2_f32m1, vl3);
689
- __riscv_vse32_v_f32m1(r + 6, rotation_row_f32m1, vl3);
690
- }
691
-
692
- /** @brief Compute R = V * Uᵀ from SVD factors (f64), vectorized with `vfmul_vf`/`vfmacc_vf`. */
693
- NK_INTERNAL void nk_rotation_from_svd_f64_rvv_( //
694
- nk_f64_t *svd_u, nk_f64_t *svd_v, nk_f64_t r[9]) {
695
- nk_rotation_from_svd_f64_serial_(svd_u, svd_v, r);
659
+ rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_right[6], vl3);
660
+ rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[7], u_row1_f32m1, vl3);
661
+ rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[8], u_row2_f32m1, vl3);
662
+ __riscv_vse32_v_f32m1(optimal_rotation + 6, rotation_row_f32m1, vl3);
696
663
  }
697
664
 
698
665
  NK_PUBLIC void nk_rmsd_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
@@ -781,114 +748,232 @@ NK_PUBLIC void nk_kabsch_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
781
748
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
782
749
  if (scale) *scale = 1.0f;
783
750
  nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
784
- nk_f64_t h[9];
751
+ nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
752
+ nk_f64_t cross_covariance[9];
785
753
  nk_centroid_and_cross_covariance_f32_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z,
786
- &centroid_b_x, &centroid_b_y, &centroid_b_z, h);
754
+ &centroid_b_x, &centroid_b_y, &centroid_b_z, cross_covariance,
755
+ &centered_norm_squared_a, &centered_norm_squared_b);
787
756
  if (a_centroid)
788
757
  a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
789
758
  a_centroid[2] = (nk_f32_t)centroid_a_z;
790
759
  if (b_centroid)
791
760
  b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
792
761
  b_centroid[2] = (nk_f32_t)centroid_b_z;
793
- nk_f64_t svd_u[9], svd_s[9], svd_v[9];
794
- nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
795
- nk_f64_t r[9];
796
- nk_rotation_from_svd_f64_rvv_(svd_u, svd_v, r);
797
- if (nk_det3x3_f64_(r) < 0) {
798
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
799
- nk_rotation_from_svd_f64_rvv_(svd_u, svd_v, r);
762
+
763
+ // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
764
+ nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
765
+ cross_covariance[4] * cross_covariance[4] +
766
+ cross_covariance[8] * cross_covariance[8];
767
+ nk_f64_t covariance_offdiagonal_norm_squared =
768
+ cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
769
+ cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
770
+ cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
771
+ nk_f64_t optimal_rotation[9];
772
+ nk_f64_t trace_rotation_covariance;
773
+ if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
774
+ cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
775
+ optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
776
+ optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
777
+ optimal_rotation[8] = 1;
778
+ trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
779
+ }
780
+ else {
781
+ nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
782
+ nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
783
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
784
+ if (nk_det3x3_f64_(optimal_rotation) < 0) {
785
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
786
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
787
+ }
788
+ trace_rotation_covariance =
789
+ optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
790
+ optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
791
+ optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
792
+ optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
793
+ optimal_rotation[8] * cross_covariance[8];
800
794
  }
801
795
  if (rotation)
802
- for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
803
- nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, points_count, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
804
- centroid_b_x, centroid_b_y, centroid_b_z);
805
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
796
+ for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)optimal_rotation[j];
797
+ // Folded SSD via trace identity: SSD = a-ā‖² + ‖b-b̄‖² 2·trace(R · H_centered).
798
+ nk_f64_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0 * trace_rotation_covariance;
799
+ if (sum_squared < 0.0) sum_squared = 0.0;
800
+ *result = nk_f64_sqrt_rvv(sum_squared / (nk_f64_t)points_count);
806
801
  }
807
802
 
808
803
  NK_PUBLIC void nk_kabsch_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
809
804
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
810
805
  if (scale) *scale = 1.0;
811
806
  nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
812
- nk_f64_t h[9];
807
+ nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
808
+ nk_f64_t cross_covariance[9];
813
809
  nk_centroid_and_cross_covariance_f64_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z,
814
- &centroid_b_x, &centroid_b_y, &centroid_b_z, h);
810
+ &centroid_b_x, &centroid_b_y, &centroid_b_z, cross_covariance,
811
+ &centered_norm_squared_a, &centered_norm_squared_b);
815
812
  if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
816
813
  if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
817
- nk_f64_t svd_u[9], svd_s[9], svd_v[9];
818
- nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
819
- nk_f64_t r[9];
820
- nk_rotation_from_svd_f64_rvv_(svd_u, svd_v, r);
821
- if (nk_det3x3_f64_(r) < 0) {
822
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
823
- nk_rotation_from_svd_f64_rvv_(svd_u, svd_v, r);
814
+
815
+ // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
816
+ nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
817
+ cross_covariance[4] * cross_covariance[4] +
818
+ cross_covariance[8] * cross_covariance[8];
819
+ nk_f64_t covariance_offdiagonal_norm_squared =
820
+ cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
821
+ cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
822
+ cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
823
+ nk_f64_t optimal_rotation[9];
824
+ nk_f64_t trace_rotation_covariance;
825
+ if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
826
+ cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
827
+ optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
828
+ optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
829
+ optimal_rotation[8] = 1;
830
+ trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
831
+ }
832
+ else {
833
+ nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
834
+ nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
835
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
836
+ if (nk_det3x3_f64_(optimal_rotation) < 0) {
837
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
838
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
839
+ }
840
+ trace_rotation_covariance =
841
+ optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
842
+ optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
843
+ optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
844
+ optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
845
+ optimal_rotation[8] * cross_covariance[8];
824
846
  }
825
847
  if (rotation)
826
- for (int j = 0; j < 9; ++j) rotation[j] = r[j];
827
- nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, points_count, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
828
- centroid_b_x, centroid_b_y, centroid_b_z);
829
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
848
+ for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
849
+ // Folded SSD via trace identity: SSD = a-ā‖² + ‖b-b̄‖² 2·trace(R · H_centered).
850
+ nk_f64_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0 * trace_rotation_covariance;
851
+ if (sum_squared < 0.0) sum_squared = 0.0;
852
+ *result = nk_f64_sqrt_rvv(sum_squared / (nk_f64_t)points_count);
830
853
  }
831
854
 
832
855
  NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
833
856
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
834
857
  nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
835
- nk_f64_t h[9], variance_a;
836
- nk_centroid_and_cross_covariance_and_variance_f32_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y,
837
- &centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
838
- h, &variance_a);
858
+ nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
859
+ nk_f64_t cross_covariance[9];
860
+ nk_centroid_and_cross_covariance_and_variance_f32_rvv_(
861
+ a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
862
+ cross_covariance, &centered_norm_squared_a, &centered_norm_squared_b);
839
863
  if (a_centroid)
840
864
  a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
841
865
  a_centroid[2] = (nk_f32_t)centroid_a_z;
842
866
  if (b_centroid)
843
867
  b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
844
868
  b_centroid[2] = (nk_f32_t)centroid_b_z;
845
- nk_f64_t svd_u[9], svd_s[9], svd_v[9];
846
- nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
847
- nk_f64_t r[9];
848
- nk_rotation_from_svd_f64_rvv_(svd_u, svd_v, r);
849
- nk_f64_t det = nk_det3x3_f64_(r);
850
- nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
851
- nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_s[0], 1.0, svd_s[4], 1.0, svd_s[8], sign_det);
852
- nk_f64_t scale_factor = trace_ds / ((nk_f64_t)points_count * variance_a);
853
- if (scale) *scale = (nk_f32_t)scale_factor;
854
- if (det < 0) {
855
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
856
- nk_rotation_from_svd_f64_rvv_(svd_u, svd_v, r);
869
+
870
+ // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
871
+ nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
872
+ cross_covariance[4] * cross_covariance[4] +
873
+ cross_covariance[8] * cross_covariance[8];
874
+ nk_f64_t covariance_offdiagonal_norm_squared =
875
+ cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
876
+ cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
877
+ cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
878
+ nk_f64_t optimal_rotation[9];
879
+ nk_f64_t trace_rotation_covariance;
880
+ nk_f64_t scale_factor;
881
+ if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
882
+ cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
883
+ optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
884
+ optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
885
+ optimal_rotation[8] = 1;
886
+ trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
887
+ scale_factor = centered_norm_squared_a > 0.0 ? trace_rotation_covariance / centered_norm_squared_a : 0.0;
888
+ }
889
+ else {
890
+ nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
891
+ nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
892
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
893
+ nk_f64_t det = nk_det3x3_f64_(optimal_rotation);
894
+ nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
895
+ nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_diagonal[0], 1.0, svd_diagonal[4], 1.0, svd_diagonal[8],
896
+ sign_det);
897
+ scale_factor = centered_norm_squared_a > 0.0 ? trace_ds / centered_norm_squared_a : 0.0;
898
+ if (det < 0) {
899
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
900
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
901
+ }
902
+ trace_rotation_covariance =
903
+ optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
904
+ optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
905
+ optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
906
+ optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
907
+ optimal_rotation[8] * cross_covariance[8];
857
908
  }
909
+ if (scale) *scale = (nk_f32_t)scale_factor;
858
910
  if (rotation)
859
- for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
860
- nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, points_count, r, scale_factor, centroid_a_x, centroid_a_y,
861
- centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
862
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
911
+ for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)optimal_rotation[j];
912
+ // Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² 2c·trace(R · H_centered).
913
+ nk_f64_t sum_squared = scale_factor * scale_factor * centered_norm_squared_a + centered_norm_squared_b -
914
+ 2.0 * scale_factor * trace_rotation_covariance;
915
+ if (sum_squared < 0.0) sum_squared = 0.0;
916
+ *result = nk_f64_sqrt_rvv(sum_squared / (nk_f64_t)points_count);
863
917
  }
864
918
 
865
919
  NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
866
920
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
867
921
  nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
868
- nk_f64_t h[9], variance_a;
869
- nk_centroid_and_cross_covariance_and_variance_f64_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y,
870
- &centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
871
- h, &variance_a);
922
+ nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
923
+ nk_f64_t cross_covariance[9];
924
+ nk_centroid_and_cross_covariance_and_variance_f64_rvv_(
925
+ a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
926
+ cross_covariance, &centered_norm_squared_a, &centered_norm_squared_b);
872
927
  if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
873
928
  if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
874
- nk_f64_t svd_u[9], svd_s[9], svd_v[9];
875
- nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
876
- nk_f64_t r[9];
877
- nk_rotation_from_svd_f64_rvv_(svd_u, svd_v, r);
878
- nk_f64_t det = nk_det3x3_f64_(r);
879
- nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
880
- nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_s[0], 1.0, svd_s[4], 1.0, svd_s[8], sign_det);
881
- nk_f64_t scale_factor = trace_ds / ((nk_f64_t)points_count * variance_a);
882
- if (scale) *scale = scale_factor;
883
- if (det < 0) {
884
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
885
- nk_rotation_from_svd_f64_rvv_(svd_u, svd_v, r);
929
+
930
+ // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
931
+ nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
932
+ cross_covariance[4] * cross_covariance[4] +
933
+ cross_covariance[8] * cross_covariance[8];
934
+ nk_f64_t covariance_offdiagonal_norm_squared =
935
+ cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
936
+ cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
937
+ cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
938
+ nk_f64_t optimal_rotation[9];
939
+ nk_f64_t trace_rotation_covariance;
940
+ nk_f64_t scale_factor;
941
+ if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
942
+ cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
943
+ optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
944
+ optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
945
+ optimal_rotation[8] = 1;
946
+ trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
947
+ scale_factor = centered_norm_squared_a > 0.0 ? trace_rotation_covariance / centered_norm_squared_a : 0.0;
886
948
  }
949
+ else {
950
+ nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
951
+ nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
952
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
953
+ nk_f64_t det = nk_det3x3_f64_(optimal_rotation);
954
+ nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
955
+ nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_diagonal[0], 1.0, svd_diagonal[4], 1.0, svd_diagonal[8],
956
+ sign_det);
957
+ scale_factor = centered_norm_squared_a > 0.0 ? trace_ds / centered_norm_squared_a : 0.0;
958
+ if (det < 0) {
959
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
960
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
961
+ }
962
+ trace_rotation_covariance =
963
+ optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
964
+ optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
965
+ optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
966
+ optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
967
+ optimal_rotation[8] * cross_covariance[8];
968
+ }
969
+ if (scale) *scale = scale_factor;
887
970
  if (rotation)
888
- for (int j = 0; j < 9; ++j) rotation[j] = r[j];
889
- nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, points_count, r, scale_factor, centroid_a_x, centroid_a_y,
890
- centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
891
- *result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
971
+ for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
972
+ // Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² − 2c·trace(R · H_centered).
973
+ nk_f64_t sum_squared = scale_factor * scale_factor * centered_norm_squared_a + centered_norm_squared_b -
974
+ 2.0 * scale_factor * trace_rotation_covariance;
975
+ if (sum_squared < 0.0) sum_squared = 0.0;
976
+ *result = nk_f64_sqrt_rvv(sum_squared / (nk_f64_t)points_count);
892
977
  }
893
978
 
894
979
  NK_PUBLIC void nk_rmsd_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,