numkong 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48):
  1. package/binding.gyp +18 -0
  2. package/c/dispatch_e5m2.c +23 -3
  3. package/include/numkong/capabilities.h +1 -1
  4. package/include/numkong/cast/README.md +3 -0
  5. package/include/numkong/cast/haswell.h +28 -64
  6. package/include/numkong/cast/serial.h +17 -0
  7. package/include/numkong/cast/skylake.h +67 -52
  8. package/include/numkong/cast.h +1 -0
  9. package/include/numkong/dot/README.md +1 -0
  10. package/include/numkong/dot/haswell.h +92 -13
  11. package/include/numkong/dot/serial.h +15 -0
  12. package/include/numkong/dot/skylake.h +61 -14
  13. package/include/numkong/dots/README.md +2 -0
  14. package/include/numkong/dots/graniteamx.h +434 -0
  15. package/include/numkong/dots/haswell.h +28 -28
  16. package/include/numkong/dots/sapphireamx.h +1 -1
  17. package/include/numkong/dots/serial.h +23 -8
  18. package/include/numkong/dots/skylake.h +28 -23
  19. package/include/numkong/dots.h +12 -0
  20. package/include/numkong/each/serial.h +18 -1
  21. package/include/numkong/geospatial/serial.h +14 -3
  22. package/include/numkong/maxsim/serial.h +15 -0
  23. package/include/numkong/mesh/README.md +50 -44
  24. package/include/numkong/mesh/genoa.h +462 -0
  25. package/include/numkong/mesh/haswell.h +806 -933
  26. package/include/numkong/mesh/neon.h +871 -943
  27. package/include/numkong/mesh/neonbfdot.h +382 -522
  28. package/include/numkong/mesh/neonfhm.h +676 -0
  29. package/include/numkong/mesh/rvv.h +404 -319
  30. package/include/numkong/mesh/serial.h +204 -162
  31. package/include/numkong/mesh/skylake.h +1029 -1585
  32. package/include/numkong/mesh/v128relaxed.h +403 -377
  33. package/include/numkong/mesh.h +38 -0
  34. package/include/numkong/reduce/serial.h +15 -1
  35. package/include/numkong/sparse/serial.h +17 -2
  36. package/include/numkong/spatial/genoa.h +0 -68
  37. package/include/numkong/spatial/haswell.h +98 -56
  38. package/include/numkong/spatial/serial.h +15 -0
  39. package/include/numkong/spatial/skylake.h +114 -54
  40. package/include/numkong/spatial.h +0 -12
  41. package/include/numkong/spatials/graniteamx.h +128 -0
  42. package/include/numkong/spatials/serial.h +18 -1
  43. package/include/numkong/spatials/skylake.h +2 -2
  44. package/include/numkong/spatials.h +17 -0
  45. package/include/numkong/tensor.hpp +107 -23
  46. package/javascript/numkong.c +3 -2
  47. package/package.json +7 -7
  48. package/wasm/numkong.wasm +0 -0
@@ -101,11 +101,6 @@ NK_INTERNAL nk_f64_t nk_reduce_stable_f64x2_v128relaxed_(v128_t values_f64x2) {
101
101
  return sum + compensation;
102
102
  }
103
103
 
104
- NK_INTERNAL void nk_rotation_from_svd_f64_v128relaxed_(nk_f64_t const *svd_u, nk_f64_t const *svd_v,
105
- nk_f64_t *rotation) {
106
- nk_rotation_from_svd_f64_serial_(svd_u, svd_v, rotation);
107
- }
108
-
109
104
  NK_INTERNAL void nk_accumulate_square_f64x2_v128relaxed_(v128_t *sum_f64x2, v128_t *compensation_f64x2,
110
105
  v128_t values_f64x2) {
111
106
  v128_t product_f64x2 = wasm_f64x2_mul(values_f64x2, values_f64x2);
@@ -124,7 +119,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
124
119
  nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
125
120
  nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
126
121
  nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
127
- nk_f64_t h[9]) {
122
+ nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
128
123
  v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
129
124
  v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
130
125
  v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
@@ -141,6 +136,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
141
136
  v128_t cross_20_low_f64x2 = zero_f64x2, cross_20_high_f64x2 = zero_f64x2;
142
137
  v128_t cross_21_low_f64x2 = zero_f64x2, cross_21_high_f64x2 = zero_f64x2;
143
138
  v128_t cross_22_low_f64x2 = zero_f64x2, cross_22_high_f64x2 = zero_f64x2;
139
+ v128_t norm_squared_a_low_f64x2 = zero_f64x2, norm_squared_a_high_f64x2 = zero_f64x2;
140
+ v128_t norm_squared_b_low_f64x2 = zero_f64x2, norm_squared_b_high_f64x2 = zero_f64x2;
144
141
  nk_size_t index = 0;
145
142
 
146
143
  for (; index + 4 <= n; index += 4) {
@@ -192,6 +189,19 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
192
189
  cross_21_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_y_high_f64x2, cross_21_high_f64x2);
193
190
  cross_22_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_z_low_f64x2, cross_22_low_f64x2),
194
191
  cross_22_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_z_high_f64x2, cross_22_high_f64x2);
192
+
193
+ norm_squared_a_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, a_x_low_f64x2, norm_squared_a_low_f64x2);
194
+ norm_squared_a_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, a_x_high_f64x2, norm_squared_a_high_f64x2);
195
+ norm_squared_a_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, a_y_low_f64x2, norm_squared_a_low_f64x2);
196
+ norm_squared_a_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, a_y_high_f64x2, norm_squared_a_high_f64x2);
197
+ norm_squared_a_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, a_z_low_f64x2, norm_squared_a_low_f64x2);
198
+ norm_squared_a_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, a_z_high_f64x2, norm_squared_a_high_f64x2);
199
+ norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_x_low_f64x2, b_x_low_f64x2, norm_squared_b_low_f64x2);
200
+ norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_x_high_f64x2, b_x_high_f64x2, norm_squared_b_high_f64x2);
201
+ norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_y_low_f64x2, b_y_low_f64x2, norm_squared_b_low_f64x2);
202
+ norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_y_high_f64x2, b_y_high_f64x2, norm_squared_b_high_f64x2);
203
+ norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_z_low_f64x2, b_z_low_f64x2, norm_squared_b_low_f64x2);
204
+ norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_z_high_f64x2, b_z_high_f64x2, norm_squared_b_high_f64x2);
195
205
  }
196
206
 
197
207
  nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
@@ -209,6 +219,10 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
209
219
  nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_low_f64x2, cross_20_high_f64x2));
210
220
  nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_low_f64x2, cross_21_high_f64x2));
211
221
  nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_low_f64x2, cross_22_high_f64x2));
222
+ nk_f64_t norm_squared_a_sum = nk_hsum_f64x2_v128relaxed_(
223
+ wasm_f64x2_add(norm_squared_a_low_f64x2, norm_squared_a_high_f64x2));
224
+ nk_f64_t norm_squared_b_sum = nk_hsum_f64x2_v128relaxed_(
225
+ wasm_f64x2_add(norm_squared_b_low_f64x2, norm_squared_b_high_f64x2));
212
226
 
213
227
  for (; index < n; ++index) {
214
228
  nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
@@ -218,6 +232,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
218
232
  cross_00 += a_x * b_x, cross_01 += a_x * b_y, cross_02 += a_x * b_z;
219
233
  cross_10 += a_y * b_x, cross_11 += a_y * b_y, cross_12 += a_y * b_z;
220
234
  cross_20 += a_z * b_x, cross_21 += a_z * b_y, cross_22 += a_z * b_z;
235
+ norm_squared_a_sum += a_x * a_x + a_y * a_y + a_z * a_z;
236
+ norm_squared_b_sum += b_x * b_x + b_y * b_y + b_z * b_z;
221
237
  }
222
238
 
223
239
  nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
@@ -227,22 +243,31 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
227
243
  *centroid_b_z = sum_b_z * inv_points_count;
228
244
 
229
245
  nk_f64_t n_f64 = (nk_f64_t)n;
230
- h[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
231
- h[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
232
- h[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
233
- h[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
234
- h[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
235
- h[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
236
- h[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
237
- h[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
238
- h[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
246
+ cross_covariance[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
247
+ cross_covariance[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
248
+ cross_covariance[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
249
+ cross_covariance[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
250
+ cross_covariance[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
251
+ cross_covariance[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
252
+ cross_covariance[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
253
+ cross_covariance[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
254
+ cross_covariance[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
255
+
256
+ *centered_norm_squared_a = norm_squared_a_sum -
257
+ n_f64 * ((*centroid_a_x) * (*centroid_a_x) + (*centroid_a_y) * (*centroid_a_y) +
258
+ (*centroid_a_z) * (*centroid_a_z));
259
+ *centered_norm_squared_b = norm_squared_b_sum -
260
+ n_f64 * ((*centroid_b_x) * (*centroid_b_x) + (*centroid_b_y) * (*centroid_b_y) +
261
+ (*centroid_b_z) * (*centroid_b_z));
262
+ if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
263
+ if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
239
264
  }
240
265
 
241
266
  NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_( //
242
267
  nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
243
268
  nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
244
269
  nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
245
- nk_f64_t h[9], nk_f64_t *variance_a) {
270
+ nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
246
271
  v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
247
272
  v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
248
273
  v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
@@ -250,6 +275,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
250
275
  v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
251
276
  v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
252
277
  v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
278
+ v128_t norm_squared_b_low_f64x2 = zero_f64x2, norm_squared_b_high_f64x2 = zero_f64x2;
253
279
  v128_t cross_00_low_f64x2 = zero_f64x2, cross_00_high_f64x2 = zero_f64x2;
254
280
  v128_t cross_01_low_f64x2 = zero_f64x2, cross_01_high_f64x2 = zero_f64x2;
255
281
  v128_t cross_02_low_f64x2 = zero_f64x2, cross_02_high_f64x2 = zero_f64x2;
@@ -312,7 +338,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
312
338
  cross_22_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_z_low_f64x2, cross_22_low_f64x2),
313
339
  cross_22_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_z_high_f64x2, cross_22_high_f64x2);
314
340
 
315
- // Variance: accumulate ||a||^2.
341
+ // Norm-squared accumulators for both point sets (used for folded SSD).
316
342
  v128_t norm_squared_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, a_y_low_f64x2,
317
343
  wasm_f64x2_mul(a_x_low_f64x2, a_x_low_f64x2));
318
344
  v128_t norm_squared_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, a_y_high_f64x2,
@@ -321,6 +347,13 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
321
347
  norm_squared_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, a_z_high_f64x2, norm_squared_high_f64x2);
322
348
  sum_norm_squared_low_f64x2 = wasm_f64x2_add(sum_norm_squared_low_f64x2, norm_squared_low_f64x2);
323
349
  sum_norm_squared_high_f64x2 = wasm_f64x2_add(sum_norm_squared_high_f64x2, norm_squared_high_f64x2);
350
+
351
+ norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_x_low_f64x2, b_x_low_f64x2, norm_squared_b_low_f64x2);
352
+ norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_x_high_f64x2, b_x_high_f64x2, norm_squared_b_high_f64x2);
353
+ norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_y_low_f64x2, b_y_low_f64x2, norm_squared_b_low_f64x2);
354
+ norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_y_high_f64x2, b_y_high_f64x2, norm_squared_b_high_f64x2);
355
+ norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_z_low_f64x2, b_z_low_f64x2, norm_squared_b_low_f64x2);
356
+ norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_z_high_f64x2, b_z_high_f64x2, norm_squared_b_high_f64x2);
324
357
  }
325
358
 
326
359
  nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
@@ -338,8 +371,10 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
338
371
  nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_low_f64x2, cross_20_high_f64x2));
339
372
  nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_low_f64x2, cross_21_high_f64x2));
340
373
  nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_low_f64x2, cross_22_high_f64x2));
341
- nk_f64_t sum_norm_squared = nk_hsum_f64x2_v128relaxed_(
374
+ nk_f64_t norm_squared_a_sum = nk_hsum_f64x2_v128relaxed_(
342
375
  wasm_f64x2_add(sum_norm_squared_low_f64x2, sum_norm_squared_high_f64x2));
376
+ nk_f64_t norm_squared_b_sum = nk_hsum_f64x2_v128relaxed_(
377
+ wasm_f64x2_add(norm_squared_b_low_f64x2, norm_squared_b_high_f64x2));
343
378
 
344
379
  for (; index < n; ++index) {
345
380
  nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
@@ -349,7 +384,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
349
384
  cross_00 += a_x * b_x, cross_01 += a_x * b_y, cross_02 += a_x * b_z;
350
385
  cross_10 += a_y * b_x, cross_11 += a_y * b_y, cross_12 += a_y * b_z;
351
386
  cross_20 += a_z * b_x, cross_21 += a_z * b_y, cross_22 += a_z * b_z;
352
- sum_norm_squared += a_x * a_x + a_y * a_y + a_z * a_z;
387
+ norm_squared_a_sum += a_x * a_x + a_y * a_y + a_z * a_z;
388
+ norm_squared_b_sum += b_x * b_x + b_y * b_y + b_z * b_z;
353
389
  }
354
390
 
355
391
  nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
@@ -359,209 +395,24 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
359
395
  *centroid_b_z = sum_b_z * inv_points_count;
360
396
 
361
397
  nk_f64_t n_f64 = (nk_f64_t)n;
362
- h[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
363
- h[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
364
- h[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
365
- h[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
366
- h[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
367
- h[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
368
- h[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
369
- h[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
370
- h[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
371
- *variance_a = sum_norm_squared * inv_points_count -
372
- ((*centroid_a_x) * (*centroid_a_x) + (*centroid_a_y) * (*centroid_a_y) +
373
- (*centroid_a_z) * (*centroid_a_z));
374
- }
375
-
376
- NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_v128relaxed_( //
377
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t const *r, nk_f64_t scale, nk_f64_t centroid_a_x,
378
- nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
379
- v128_t scaled_rotation_x_x_f64x2 = wasm_f64x2_splat(scale * r[0]);
380
- v128_t scaled_rotation_x_y_f64x2 = wasm_f64x2_splat(scale * r[1]);
381
- v128_t scaled_rotation_x_z_f64x2 = wasm_f64x2_splat(scale * r[2]);
382
- v128_t scaled_rotation_y_x_f64x2 = wasm_f64x2_splat(scale * r[3]);
383
- v128_t scaled_rotation_y_y_f64x2 = wasm_f64x2_splat(scale * r[4]);
384
- v128_t scaled_rotation_y_z_f64x2 = wasm_f64x2_splat(scale * r[5]);
385
- v128_t scaled_rotation_z_x_f64x2 = wasm_f64x2_splat(scale * r[6]);
386
- v128_t scaled_rotation_z_y_f64x2 = wasm_f64x2_splat(scale * r[7]);
387
- v128_t scaled_rotation_z_z_f64x2 = wasm_f64x2_splat(scale * r[8]);
388
- v128_t centroid_a_x_f64x2 = wasm_f64x2_splat(centroid_a_x), centroid_a_y_f64x2 = wasm_f64x2_splat(centroid_a_y);
389
- v128_t centroid_a_z_f64x2 = wasm_f64x2_splat(centroid_a_z), centroid_b_x_f64x2 = wasm_f64x2_splat(centroid_b_x);
390
- v128_t centroid_b_y_f64x2 = wasm_f64x2_splat(centroid_b_y), centroid_b_z_f64x2 = wasm_f64x2_splat(centroid_b_z);
391
- v128_t sum_squared_low_f64x2 = wasm_f64x2_splat(0.0), sum_squared_high_f64x2 = wasm_f64x2_splat(0.0);
392
- nk_size_t index = 0;
393
-
394
- for (; index + 4 <= n; index += 4) {
395
- v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
396
- nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
397
- nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
398
-
399
- v128_t centered_a_x_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_x_f32x4), centroid_a_x_f64x2);
400
- v128_t centered_a_x_high_f64x2 = wasm_f64x2_sub(
401
- wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1)), centroid_a_x_f64x2);
402
- v128_t centered_a_y_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_y_f32x4), centroid_a_y_f64x2);
403
- v128_t centered_a_y_high_f64x2 = wasm_f64x2_sub(
404
- wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1)), centroid_a_y_f64x2);
405
- v128_t centered_a_z_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_z_f32x4), centroid_a_z_f64x2);
406
- v128_t centered_a_z_high_f64x2 = wasm_f64x2_sub(
407
- wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1)), centroid_a_z_f64x2);
408
- v128_t centered_b_x_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_x_f32x4), centroid_b_x_f64x2);
409
- v128_t centered_b_x_high_f64x2 = wasm_f64x2_sub(
410
- wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1)), centroid_b_x_f64x2);
411
- v128_t centered_b_y_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_y_f32x4), centroid_b_y_f64x2);
412
- v128_t centered_b_y_high_f64x2 = wasm_f64x2_sub(
413
- wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1)), centroid_b_y_f64x2);
414
- v128_t centered_b_z_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_z_f32x4), centroid_b_z_f64x2);
415
- v128_t centered_b_z_high_f64x2 = wasm_f64x2_sub(
416
- wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1)), centroid_b_z_f64x2);
417
-
418
- v128_t rotated_a_x_low_f64x2 = wasm_f64x2_relaxed_madd(
419
- scaled_rotation_x_z_f64x2, centered_a_z_low_f64x2,
420
- wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_low_f64x2,
421
- wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_low_f64x2)));
422
- v128_t rotated_a_x_high_f64x2 = wasm_f64x2_relaxed_madd(
423
- scaled_rotation_x_z_f64x2, centered_a_z_high_f64x2,
424
- wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_high_f64x2,
425
- wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_high_f64x2)));
426
- v128_t rotated_a_y_low_f64x2 = wasm_f64x2_relaxed_madd(
427
- scaled_rotation_y_z_f64x2, centered_a_z_low_f64x2,
428
- wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_low_f64x2,
429
- wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_low_f64x2)));
430
- v128_t rotated_a_y_high_f64x2 = wasm_f64x2_relaxed_madd(
431
- scaled_rotation_y_z_f64x2, centered_a_z_high_f64x2,
432
- wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_high_f64x2,
433
- wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_high_f64x2)));
434
- v128_t rotated_a_z_low_f64x2 = wasm_f64x2_relaxed_madd(
435
- scaled_rotation_z_z_f64x2, centered_a_z_low_f64x2,
436
- wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_low_f64x2,
437
- wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_low_f64x2)));
438
- v128_t rotated_a_z_high_f64x2 = wasm_f64x2_relaxed_madd(
439
- scaled_rotation_z_z_f64x2, centered_a_z_high_f64x2,
440
- wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_high_f64x2,
441
- wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_high_f64x2)));
442
-
443
- v128_t delta_x_low_f64x2 = wasm_f64x2_sub(rotated_a_x_low_f64x2, centered_b_x_low_f64x2);
444
- v128_t delta_x_high_f64x2 = wasm_f64x2_sub(rotated_a_x_high_f64x2, centered_b_x_high_f64x2);
445
- v128_t delta_y_low_f64x2 = wasm_f64x2_sub(rotated_a_y_low_f64x2, centered_b_y_low_f64x2);
446
- v128_t delta_y_high_f64x2 = wasm_f64x2_sub(rotated_a_y_high_f64x2, centered_b_y_high_f64x2);
447
- v128_t delta_z_low_f64x2 = wasm_f64x2_sub(rotated_a_z_low_f64x2, centered_b_z_low_f64x2);
448
- v128_t delta_z_high_f64x2 = wasm_f64x2_sub(rotated_a_z_high_f64x2, centered_b_z_high_f64x2);
449
-
450
- sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_x_low_f64x2, delta_x_low_f64x2, sum_squared_low_f64x2);
451
- sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_x_high_f64x2, delta_x_high_f64x2,
452
- sum_squared_high_f64x2);
453
- sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_y_low_f64x2, delta_y_low_f64x2, sum_squared_low_f64x2);
454
- sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_y_high_f64x2, delta_y_high_f64x2,
455
- sum_squared_high_f64x2);
456
- sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_z_low_f64x2, delta_z_low_f64x2, sum_squared_low_f64x2);
457
- sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_z_high_f64x2, delta_z_high_f64x2,
458
- sum_squared_high_f64x2);
459
- }
460
-
461
- nk_f64_t sum_squared = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_squared_low_f64x2, sum_squared_high_f64x2));
462
- for (; index < n; ++index) {
463
- nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x,
464
- centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y,
465
- centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
466
- nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x,
467
- centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y,
468
- centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
469
- nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z),
470
- rotated_a_y = scale * (r[3] * centered_a_x + r[4] * centered_a_y + r[5] * centered_a_z),
471
- rotated_a_z = scale * (r[6] * centered_a_x + r[7] * centered_a_y + r[8] * centered_a_z);
472
- nk_f64_t delta_x = rotated_a_x - centered_b_x, delta_y = rotated_a_y - centered_b_y,
473
- delta_z = rotated_a_z - centered_b_z;
474
- sum_squared += delta_x * delta_x + delta_y * delta_y + delta_z * delta_z;
475
- }
476
-
477
- return sum_squared;
478
- }
479
-
480
- /* Compute sum of squared distances for f64 after applying rotation (and optional scale). */
481
- NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_v128relaxed_(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
482
- nk_f64_t const *r, nk_f64_t scale, nk_f64_t centroid_a_x,
483
- nk_f64_t centroid_a_y, nk_f64_t centroid_a_z,
484
- nk_f64_t centroid_b_x, nk_f64_t centroid_b_y,
485
- nk_f64_t centroid_b_z) {
486
- // Broadcast scaled rotation matrix elements
487
- v128_t scaled_rotation_x_x_f64x2 = wasm_f64x2_splat(scale * r[0]);
488
- v128_t scaled_rotation_x_y_f64x2 = wasm_f64x2_splat(scale * r[1]);
489
- v128_t scaled_rotation_x_z_f64x2 = wasm_f64x2_splat(scale * r[2]);
490
- v128_t scaled_rotation_y_x_f64x2 = wasm_f64x2_splat(scale * r[3]);
491
- v128_t scaled_rotation_y_y_f64x2 = wasm_f64x2_splat(scale * r[4]);
492
- v128_t scaled_rotation_y_z_f64x2 = wasm_f64x2_splat(scale * r[5]);
493
- v128_t scaled_rotation_z_x_f64x2 = wasm_f64x2_splat(scale * r[6]);
494
- v128_t scaled_rotation_z_y_f64x2 = wasm_f64x2_splat(scale * r[7]);
495
- v128_t scaled_rotation_z_z_f64x2 = wasm_f64x2_splat(scale * r[8]);
496
-
497
- // Broadcast centroids
498
- v128_t centroid_a_x_f64x2 = wasm_f64x2_splat(centroid_a_x);
499
- v128_t centroid_a_y_f64x2 = wasm_f64x2_splat(centroid_a_y);
500
- v128_t centroid_a_z_f64x2 = wasm_f64x2_splat(centroid_a_z);
501
- v128_t centroid_b_x_f64x2 = wasm_f64x2_splat(centroid_b_x);
502
- v128_t centroid_b_y_f64x2 = wasm_f64x2_splat(centroid_b_y);
503
- v128_t centroid_b_z_f64x2 = wasm_f64x2_splat(centroid_b_z);
504
-
505
- v128_t sum_squared_f64x2 = wasm_f64x2_splat(0), sum_squared_compensation_f64x2 = wasm_f64x2_splat(0);
506
- nk_size_t j = 0;
507
-
508
- // Main loop: process 2 points per iteration
509
- for (; j + 2 <= n; j += 2) {
510
- v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
511
- nk_deinterleave_f64x2_v128relaxed_(a + j * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
512
- nk_deinterleave_f64x2_v128relaxed_(b + j * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
513
-
514
- v128_t centered_a_x_f64x2 = wasm_f64x2_sub(a_x_f64x2, centroid_a_x_f64x2);
515
- v128_t centered_a_y_f64x2 = wasm_f64x2_sub(a_y_f64x2, centroid_a_y_f64x2);
516
- v128_t centered_a_z_f64x2 = wasm_f64x2_sub(a_z_f64x2, centroid_a_z_f64x2);
517
- v128_t centered_b_x_f64x2 = wasm_f64x2_sub(b_x_f64x2, centroid_b_x_f64x2);
518
- v128_t centered_b_y_f64x2 = wasm_f64x2_sub(b_y_f64x2, centroid_b_y_f64x2);
519
- v128_t centered_b_z_f64x2 = wasm_f64x2_sub(b_z_f64x2, centroid_b_z_f64x2);
520
-
521
- // Rotate and scale: ra = scale * R * pa
522
- v128_t rotated_a_x_f64x2 = wasm_f64x2_relaxed_madd(
523
- scaled_rotation_x_z_f64x2, centered_a_z_f64x2,
524
- wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_f64x2,
525
- wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_f64x2)));
526
- v128_t rotated_a_y_f64x2 = wasm_f64x2_relaxed_madd(
527
- scaled_rotation_y_z_f64x2, centered_a_z_f64x2,
528
- wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_f64x2,
529
- wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_f64x2)));
530
- v128_t rotated_a_z_f64x2 = wasm_f64x2_relaxed_madd(
531
- scaled_rotation_z_z_f64x2, centered_a_z_f64x2,
532
- wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_f64x2,
533
- wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_f64x2)));
534
-
535
- v128_t delta_x_f64x2 = wasm_f64x2_sub(rotated_a_x_f64x2, centered_b_x_f64x2);
536
- v128_t delta_y_f64x2 = wasm_f64x2_sub(rotated_a_y_f64x2, centered_b_y_f64x2);
537
- v128_t delta_z_f64x2 = wasm_f64x2_sub(rotated_a_z_f64x2, centered_b_z_f64x2);
538
-
539
- nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_x_f64x2);
540
- nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_y_f64x2);
541
- nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_z_f64x2);
542
- }
543
-
544
- nk_f64_t sum_squared = nk_dot_stable_sum_f64x2_v128relaxed_(sum_squared_f64x2, sum_squared_compensation_f64x2);
545
- nk_f64_t sum_squared_compensation = 0.0;
546
-
547
- // Scalar tail
548
- for (; j < n; ++j) {
549
- nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x, pa_y = a[j * 3 + 1] - centroid_a_y,
550
- pa_z = a[j * 3 + 2] - centroid_a_z;
551
- nk_f64_t pb_x = b[j * 3 + 0] - centroid_b_x, pb_y = b[j * 3 + 1] - centroid_b_y,
552
- pb_z = b[j * 3 + 2] - centroid_b_z;
553
-
554
- nk_f64_t ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z),
555
- ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z),
556
- ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);
557
-
558
- nk_f64_t delta_x = ra_x - pb_x, delta_y = ra_y - pb_y, delta_z = ra_z - pb_z;
559
- nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_x);
560
- nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_y);
561
- nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_z);
562
- }
563
-
564
- return sum_squared + sum_squared_compensation;
398
+ cross_covariance[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
399
+ cross_covariance[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
400
+ cross_covariance[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
401
+ cross_covariance[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
402
+ cross_covariance[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
403
+ cross_covariance[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
404
+ cross_covariance[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
405
+ cross_covariance[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
406
+ cross_covariance[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
407
+
408
+ *centered_norm_squared_a = norm_squared_a_sum -
409
+ n_f64 * ((*centroid_a_x) * (*centroid_a_x) + (*centroid_a_y) * (*centroid_a_y) +
410
+ (*centroid_a_z) * (*centroid_a_z));
411
+ *centered_norm_squared_b = norm_squared_b_sum -
412
+ n_f64 * ((*centroid_b_x) * (*centroid_b_x) + (*centroid_b_y) * (*centroid_b_y) +
413
+ (*centroid_b_z) * (*centroid_b_z));
414
+ if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
415
+ if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
565
416
  }
566
417
 
567
418
  NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
@@ -687,51 +538,79 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
687
538
  NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
688
539
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
689
540
  nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
690
- nk_f64_t h[9];
541
+ nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
542
+ nk_f64_t cross_covariance[9];
691
543
  nk_centroid_and_cross_covariance_f32_v128relaxed_(a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z,
692
- &centroid_b_x, &centroid_b_y, &centroid_b_z, h);
544
+ &centroid_b_x, &centroid_b_y, &centroid_b_z, cross_covariance,
545
+ &centered_norm_squared_a, &centered_norm_squared_b);
693
546
  if (a_centroid)
694
547
  a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
695
548
  a_centroid[2] = (nk_f32_t)centroid_a_z;
696
549
  if (b_centroid)
697
550
  b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
698
551
  b_centroid[2] = (nk_f32_t)centroid_b_z;
699
- nk_f64_t svd_u[9], svd_s[9], svd_v[9];
700
- nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
701
-
702
- nk_f64_t r[9];
703
- r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
704
- r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
705
- r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
706
- r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
707
- r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
708
- r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
709
- r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
710
- r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
711
- r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
712
-
713
- // Handle reflection: if det(R) < 0, negate third column of V and recompute R.
714
- if (nk_det3x3_f64_(r) < 0) {
715
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
716
- r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
717
- r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
718
- r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
719
- r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
720
- r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
721
- r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
722
- r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
723
- r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
724
- r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
552
+
553
+ // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
554
+ nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
555
+ cross_covariance[4] * cross_covariance[4] +
556
+ cross_covariance[8] * cross_covariance[8];
557
+ nk_f64_t covariance_offdiagonal_norm_squared =
558
+ cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
559
+ cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
560
+ cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
561
+ nk_f64_t optimal_rotation[9];
562
+ nk_f64_t trace_rotation_covariance;
563
+ if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
564
+ cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
565
+ optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
566
+ optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
567
+ optimal_rotation[8] = 1;
568
+ trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
569
+ }
570
+ else {
571
+ nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
572
+ nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
573
+
574
+ optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
575
+ optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
576
+ optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
577
+ optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
578
+ optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
579
+ optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
580
+ optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
581
+ optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
582
+ optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
583
+
584
+ // Handle reflection: if det(R) < 0, negate third column of V and recompute R.
585
+ if (nk_det3x3_f64_(optimal_rotation) < 0) {
586
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
587
+ optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
588
+ optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
589
+ optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
590
+ optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
591
+ optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
592
+ optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
593
+ optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
594
+ optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
595
+ optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
596
+ }
597
+
598
+ trace_rotation_covariance =
599
+ optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
600
+ optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
601
+ optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
602
+ optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
603
+ optimal_rotation[8] * cross_covariance[8];
725
604
  }
726
605
 
727
606
  if (rotation)
728
- for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
607
+ for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)optimal_rotation[j];
729
608
  if (scale) *scale = 1.0f;
730
609
 
731
- *result = nk_f64_sqrt_v128relaxed(nk_transformed_ssd_f32_v128relaxed_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y,
732
- centroid_a_z, centroid_b_x, centroid_b_y,
733
- centroid_b_z) /
734
- (nk_f64_t)n);
610
+ // Folded SSD via trace identity: SSD = a-ā‖² + ‖b-b̄‖² 2·trace(R · H_centered).
611
+ nk_f64_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0 * trace_rotation_covariance;
612
+ if (sum_squared < 0.0) sum_squared = 0.0;
613
+ *result = nk_f64_sqrt_v128relaxed(sum_squared / (nk_f64_t)n);
735
614
  }
736
615
 
737
616
  NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
@@ -742,9 +621,10 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
742
621
  v128_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
743
622
  v128_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
744
623
 
745
- v128_t cov_xx_f64x2 = zeros_f64x2, cov_xy_f64x2 = zeros_f64x2, cov_xz_f64x2 = zeros_f64x2;
746
- v128_t cov_yx_f64x2 = zeros_f64x2, cov_yy_f64x2 = zeros_f64x2, cov_yz_f64x2 = zeros_f64x2;
747
- v128_t cov_zx_f64x2 = zeros_f64x2, cov_zy_f64x2 = zeros_f64x2, cov_zz_f64x2 = zeros_f64x2;
624
+ v128_t covariance_xx_f64x2 = zeros_f64x2, covariance_xy_f64x2 = zeros_f64x2, covariance_xz_f64x2 = zeros_f64x2;
625
+ v128_t covariance_yx_f64x2 = zeros_f64x2, covariance_yy_f64x2 = zeros_f64x2, covariance_yz_f64x2 = zeros_f64x2;
626
+ v128_t covariance_zx_f64x2 = zeros_f64x2, covariance_zy_f64x2 = zeros_f64x2, covariance_zz_f64x2 = zeros_f64x2;
627
+ v128_t norm_squared_a_f64x2 = zeros_f64x2, norm_squared_b_f64x2 = zeros_f64x2;
748
628
 
749
629
  nk_size_t i = 0;
750
630
  v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
@@ -761,15 +641,21 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
761
641
  sum_b_y_f64x2 = wasm_f64x2_add(sum_b_y_f64x2, b_y_f64x2);
762
642
  sum_b_z_f64x2 = wasm_f64x2_add(sum_b_z_f64x2, b_z_f64x2);
763
643
 
764
- cov_xx_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_x_f64x2, cov_xx_f64x2);
765
- cov_xy_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_y_f64x2, cov_xy_f64x2);
766
- cov_xz_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_z_f64x2, cov_xz_f64x2);
767
- cov_yx_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_x_f64x2, cov_yx_f64x2);
768
- cov_yy_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_y_f64x2, cov_yy_f64x2);
769
- cov_yz_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_z_f64x2, cov_yz_f64x2);
770
- cov_zx_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_x_f64x2, cov_zx_f64x2);
771
- cov_zy_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_y_f64x2, cov_zy_f64x2);
772
- cov_zz_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_z_f64x2, cov_zz_f64x2);
644
+ covariance_xx_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_x_f64x2, covariance_xx_f64x2);
645
+ covariance_xy_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_y_f64x2, covariance_xy_f64x2);
646
+ covariance_xz_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_z_f64x2, covariance_xz_f64x2);
647
+ covariance_yx_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_x_f64x2, covariance_yx_f64x2);
648
+ covariance_yy_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_y_f64x2, covariance_yy_f64x2);
649
+ covariance_yz_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_z_f64x2, covariance_yz_f64x2);
650
+ covariance_zx_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_x_f64x2, covariance_zx_f64x2);
651
+ covariance_zy_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_y_f64x2, covariance_zy_f64x2);
652
+ covariance_zz_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_z_f64x2, covariance_zz_f64x2);
653
+ norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, a_x_f64x2, norm_squared_a_f64x2);
654
+ norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, a_y_f64x2, norm_squared_a_f64x2);
655
+ norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, a_z_f64x2, norm_squared_a_f64x2);
656
+ norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_x_f64x2, b_x_f64x2, norm_squared_b_f64x2);
657
+ norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_y_f64x2, b_y_f64x2, norm_squared_b_f64x2);
658
+ norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_z_f64x2, b_z_f64x2, norm_squared_b_f64x2);
773
659
  }
774
660
 
775
661
  // Reduce vector accumulators.
@@ -780,15 +666,28 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
780
666
  nk_f64_t sum_b_y = nk_reduce_stable_f64x2_v128relaxed_(sum_b_y_f64x2), sum_b_y_compensation = 0.0;
781
667
  nk_f64_t sum_b_z = nk_reduce_stable_f64x2_v128relaxed_(sum_b_z_f64x2), sum_b_z_compensation = 0.0;
782
668
 
783
- nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_v128relaxed_(cov_xx_f64x2), covariance_x_x_compensation = 0.0;
784
- nk_f64_t covariance_x_y = nk_reduce_stable_f64x2_v128relaxed_(cov_xy_f64x2), covariance_x_y_compensation = 0.0;
785
- nk_f64_t covariance_x_z = nk_reduce_stable_f64x2_v128relaxed_(cov_xz_f64x2), covariance_x_z_compensation = 0.0;
786
- nk_f64_t covariance_y_x = nk_reduce_stable_f64x2_v128relaxed_(cov_yx_f64x2), covariance_y_x_compensation = 0.0;
787
- nk_f64_t covariance_y_y = nk_reduce_stable_f64x2_v128relaxed_(cov_yy_f64x2), covariance_y_y_compensation = 0.0;
788
- nk_f64_t covariance_y_z = nk_reduce_stable_f64x2_v128relaxed_(cov_yz_f64x2), covariance_y_z_compensation = 0.0;
789
- nk_f64_t covariance_z_x = nk_reduce_stable_f64x2_v128relaxed_(cov_zx_f64x2), covariance_z_x_compensation = 0.0;
790
- nk_f64_t covariance_z_y = nk_reduce_stable_f64x2_v128relaxed_(cov_zy_f64x2), covariance_z_y_compensation = 0.0;
791
- nk_f64_t covariance_z_z = nk_reduce_stable_f64x2_v128relaxed_(cov_zz_f64x2), covariance_z_z_compensation = 0.0;
669
+ nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_xx_f64x2),
670
+ covariance_x_x_compensation = 0.0;
671
+ nk_f64_t covariance_x_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_xy_f64x2),
672
+ covariance_x_y_compensation = 0.0;
673
+ nk_f64_t covariance_x_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_xz_f64x2),
674
+ covariance_x_z_compensation = 0.0;
675
+ nk_f64_t covariance_y_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_yx_f64x2),
676
+ covariance_y_x_compensation = 0.0;
677
+ nk_f64_t covariance_y_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_yy_f64x2),
678
+ covariance_y_y_compensation = 0.0;
679
+ nk_f64_t covariance_y_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_yz_f64x2),
680
+ covariance_y_z_compensation = 0.0;
681
+ nk_f64_t covariance_z_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_zx_f64x2),
682
+ covariance_z_x_compensation = 0.0;
683
+ nk_f64_t covariance_z_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_zy_f64x2),
684
+ covariance_z_y_compensation = 0.0;
685
+ nk_f64_t covariance_z_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_zz_f64x2),
686
+ covariance_z_z_compensation = 0.0;
687
+ nk_f64_t norm_squared_a_sum = nk_reduce_stable_f64x2_v128relaxed_(norm_squared_a_f64x2),
688
+ norm_squared_a_compensation = 0.0;
689
+ nk_f64_t norm_squared_b_sum = nk_reduce_stable_f64x2_v128relaxed_(norm_squared_b_f64x2),
690
+ norm_squared_b_compensation = 0.0;
792
691
 
793
692
  // Scalar tail
794
693
  for (; i < n; ++i) {
@@ -809,6 +708,12 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
809
708
  nk_accumulate_product_f64_(&covariance_z_x, &covariance_z_x_compensation, az, bx),
810
709
  nk_accumulate_product_f64_(&covariance_z_y, &covariance_z_y_compensation, az, by),
811
710
  nk_accumulate_product_f64_(&covariance_z_z, &covariance_z_z_compensation, az, bz);
711
+ nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, ax),
712
+ nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, ay),
713
+ nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, az);
714
+ nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, bx),
715
+ nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, by),
716
+ nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, bz);
812
717
  }
813
718
 
814
719
  sum_a_x += sum_a_x_compensation, sum_a_y += sum_a_y_compensation, sum_a_z += sum_a_z_compensation;
@@ -819,6 +724,8 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
819
724
  covariance_y_z += covariance_y_z_compensation;
820
725
  covariance_z_x += covariance_z_x_compensation, covariance_z_y += covariance_z_y_compensation,
821
726
  covariance_z_z += covariance_z_z_compensation;
727
+ norm_squared_a_sum += norm_squared_a_compensation;
728
+ norm_squared_b_sum += norm_squared_b_compensation;
822
729
 
823
730
  // Compute centroids
824
731
  nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
@@ -829,6 +736,16 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
829
736
  if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
830
737
  if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
831
738
 
739
+ // Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
740
+ nk_f64_t centered_norm_squared_a = norm_squared_a_sum -
741
+ (nk_f64_t)n * (centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y +
742
+ centroid_a_z * centroid_a_z);
743
+ nk_f64_t centered_norm_squared_b = norm_squared_b_sum -
744
+ (nk_f64_t)n * (centroid_b_x * centroid_b_x + centroid_b_y * centroid_b_y +
745
+ centroid_b_z * centroid_b_z);
746
+ if (centered_norm_squared_a < 0.0) centered_norm_squared_a = 0.0;
747
+ if (centered_norm_squared_b < 0.0) centered_norm_squared_b = 0.0;
748
+
832
749
  // Apply centering correction: H_centered = H - n * centroid_a * centroid_bT
833
750
  covariance_x_x -= n * centroid_a_x * centroid_b_x;
834
751
  covariance_x_y -= n * centroid_a_x * centroid_b_y;
@@ -843,37 +760,64 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
843
760
  // Compute SVD and optimal rotation
844
761
  nk_f64_t cross_covariance[9] = {covariance_x_x, covariance_x_y, covariance_x_z, covariance_y_x, covariance_y_y,
845
762
  covariance_y_z, covariance_z_x, covariance_z_y, covariance_z_z};
846
- nk_f64_t svd_u[9], svd_s[9], svd_v[9];
847
- nk_svd3x3_f64_(cross_covariance, svd_u, svd_s, svd_v);
848
-
849
- nk_f64_t r[9];
850
- nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
851
763
 
852
- // Handle reflection: if det(R) < 0, negate third column of V and recompute R
853
- if (nk_det3x3_f64_(r) < 0) {
854
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
855
- nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
764
+ // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
765
+ nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
766
+ cross_covariance[4] * cross_covariance[4] +
767
+ cross_covariance[8] * cross_covariance[8];
768
+ nk_f64_t covariance_offdiagonal_norm_squared =
769
+ cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
770
+ cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
771
+ cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
772
+ nk_f64_t optimal_rotation[9];
773
+ nk_f64_t trace_rotation_covariance;
774
+ if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
775
+ cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
776
+ optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
777
+ optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
778
+ optimal_rotation[8] = 1;
779
+ trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
780
+ }
781
+ else {
782
+ nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
783
+ nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
784
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
785
+
786
+ // Handle reflection: if det(R) < 0, negate third column of V and recompute R
787
+ if (nk_det3x3_f64_(optimal_rotation) < 0) {
788
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
789
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
790
+ }
791
+
792
+ trace_rotation_covariance =
793
+ optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
794
+ optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
795
+ optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
796
+ optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
797
+ optimal_rotation[8] * cross_covariance[8];
856
798
  }
857
799
 
858
800
  // Output rotation matrix and scale=1.0
859
801
  if (rotation)
860
- for (int j = 0; j < 9; ++j) rotation[j] = r[j];
802
+ for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
861
803
 
862
804
  if (scale) *scale = 1.0;
863
805
 
864
- // Compute RMSD after optimal rotation
865
- nk_f64_t sum_squared = nk_transformed_ssd_f64_v128relaxed_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y,
866
- centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
806
+ // Folded SSD via trace identity: SSD = ‖a-ā‖² + ‖b-b̄‖² − 2·trace(R · H_centered).
807
+ nk_f64_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0 * trace_rotation_covariance;
808
+ if (sum_squared < 0.0) sum_squared = 0.0;
867
809
  *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count);
868
810
  }
869
811
 
870
812
  NK_PUBLIC void nk_umeyama_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
871
813
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
872
- nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z, variance_a;
873
- nk_f64_t h[9];
814
+ nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
815
+ nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
816
+ nk_f64_t cross_covariance[9];
874
817
  nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_( //
875
818
  a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z, //
876
- &centroid_b_x, &centroid_b_y, &centroid_b_z, h, &variance_a);
819
+ &centroid_b_x, &centroid_b_y, &centroid_b_z, cross_covariance, &centered_norm_squared_a,
820
+ &centered_norm_squared_b);
877
821
  if (a_centroid)
878
822
  a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
879
823
  a_centroid[2] = (nk_f32_t)centroid_a_z;
@@ -881,44 +825,73 @@ NK_PUBLIC void nk_umeyama_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b,
881
825
  b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
882
826
  b_centroid[2] = (nk_f32_t)centroid_b_z;
883
827
 
884
- nk_f64_t svd_u[9], svd_s[9], svd_v[9];
885
- nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
886
-
887
- nk_f64_t r[9];
888
- r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
889
- r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
890
- r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
891
- r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
892
- r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
893
- r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
894
- r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
895
- r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
896
- r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
897
-
898
- nk_f64_t det = nk_det3x3_f64_(r);
899
- if (det < 0) {
900
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
901
- r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
902
- r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
903
- r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
904
- r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
905
- r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
906
- r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
907
- r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
908
- r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
909
- r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
828
+ // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
829
+ nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
830
+ cross_covariance[4] * cross_covariance[4] +
831
+ cross_covariance[8] * cross_covariance[8];
832
+ nk_f64_t covariance_offdiagonal_norm_squared =
833
+ cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
834
+ cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
835
+ cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
836
+ nk_f64_t optimal_rotation[9];
837
+ nk_f64_t trace_rotation_covariance;
838
+ nk_f64_t computed_scale;
839
+ if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
840
+ cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
841
+ optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
842
+ optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
843
+ optimal_rotation[8] = 1;
844
+ trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
845
+ computed_scale = centered_norm_squared_a > 0.0 ? trace_rotation_covariance / centered_norm_squared_a : 0.0;
846
+ }
847
+ else {
848
+ nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
849
+ nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
850
+
851
+ optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
852
+ optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
853
+ optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
854
+ optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
855
+ optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
856
+ optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
857
+ optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
858
+ optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
859
+ optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
860
+
861
+ nk_f64_t det = nk_det3x3_f64_(optimal_rotation);
862
+ if (det < 0) {
863
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
864
+ optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
865
+ optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
866
+ optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
867
+ optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
868
+ optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
869
+ optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
870
+ optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
871
+ optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
872
+ optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
873
+ }
874
+
875
+ nk_f64_t trace_signed_singular_values = svd_diagonal[0] + svd_diagonal[4] +
876
+ (det < 0 ? -svd_diagonal[8] : svd_diagonal[8]);
877
+ computed_scale = centered_norm_squared_a > 0.0 ? trace_signed_singular_values / centered_norm_squared_a : 0.0;
878
+
879
+ trace_rotation_covariance =
880
+ optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
881
+ optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
882
+ optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
883
+ optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
884
+ optimal_rotation[8] * cross_covariance[8];
910
885
  }
911
-
912
- nk_f64_t trace_signed_singular_values = svd_s[0] + svd_s[4] + (det < 0 ? -svd_s[8] : svd_s[8]);
913
- nk_f64_t computed_scale = trace_signed_singular_values / ((nk_f64_t)n * variance_a);
914
886
  if (rotation)
915
- for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
887
+ for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)optimal_rotation[j];
916
888
  if (scale) *scale = (nk_f32_t)computed_scale;
917
889
 
918
- *result = nk_f64_sqrt_v128relaxed(nk_transformed_ssd_f32_v128relaxed_(a, b, n, r, computed_scale, centroid_a_x,
919
- centroid_a_y, centroid_a_z, centroid_b_x,
920
- centroid_b_y, centroid_b_z) /
921
- (nk_f64_t)n);
890
+ // Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² 2c·trace(R · H_centered).
891
+ nk_f64_t sum_squared = computed_scale * computed_scale * centered_norm_squared_a + centered_norm_squared_b -
892
+ 2.0 * computed_scale * trace_rotation_covariance;
893
+ if (sum_squared < 0.0) sum_squared = 0.0;
894
+ *result = nk_f64_sqrt_v128relaxed(sum_squared / (nk_f64_t)n);
922
895
  }
923
896
 
924
897
  NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
@@ -929,10 +902,10 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
929
902
  v128_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
930
903
  v128_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
931
904
 
932
- v128_t cov_xx_f64x2 = zeros_f64x2, cov_xy_f64x2 = zeros_f64x2, cov_xz_f64x2 = zeros_f64x2;
933
- v128_t cov_yx_f64x2 = zeros_f64x2, cov_yy_f64x2 = zeros_f64x2, cov_yz_f64x2 = zeros_f64x2;
934
- v128_t cov_zx_f64x2 = zeros_f64x2, cov_zy_f64x2 = zeros_f64x2, cov_zz_f64x2 = zeros_f64x2;
935
- v128_t variance_a_f64x2 = zeros_f64x2;
905
+ v128_t covariance_xx_f64x2 = zeros_f64x2, covariance_xy_f64x2 = zeros_f64x2, covariance_xz_f64x2 = zeros_f64x2;
906
+ v128_t covariance_yx_f64x2 = zeros_f64x2, covariance_yy_f64x2 = zeros_f64x2, covariance_yz_f64x2 = zeros_f64x2;
907
+ v128_t covariance_zx_f64x2 = zeros_f64x2, covariance_zy_f64x2 = zeros_f64x2, covariance_zz_f64x2 = zeros_f64x2;
908
+ v128_t norm_squared_a_f64x2 = zeros_f64x2, norm_squared_b_f64x2 = zeros_f64x2;
936
909
 
937
910
  nk_size_t i = 0;
938
911
  v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
@@ -949,19 +922,22 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
949
922
  sum_b_y_f64x2 = wasm_f64x2_add(sum_b_y_f64x2, b_y_f64x2);
950
923
  sum_b_z_f64x2 = wasm_f64x2_add(sum_b_z_f64x2, b_z_f64x2);
951
924
 
952
- cov_xx_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_x_f64x2, cov_xx_f64x2);
953
- cov_xy_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_y_f64x2, cov_xy_f64x2);
954
- cov_xz_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_z_f64x2, cov_xz_f64x2);
955
- cov_yx_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_x_f64x2, cov_yx_f64x2);
956
- cov_yy_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_y_f64x2, cov_yy_f64x2);
957
- cov_yz_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_z_f64x2, cov_yz_f64x2);
958
- cov_zx_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_x_f64x2, cov_zx_f64x2);
959
- cov_zy_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_y_f64x2, cov_zy_f64x2);
960
- cov_zz_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_z_f64x2, cov_zz_f64x2);
961
-
962
- variance_a_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, a_x_f64x2, variance_a_f64x2);
963
- variance_a_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, a_y_f64x2, variance_a_f64x2);
964
- variance_a_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, a_z_f64x2, variance_a_f64x2);
925
+ covariance_xx_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_x_f64x2, covariance_xx_f64x2);
926
+ covariance_xy_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_y_f64x2, covariance_xy_f64x2);
927
+ covariance_xz_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_z_f64x2, covariance_xz_f64x2);
928
+ covariance_yx_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_x_f64x2, covariance_yx_f64x2);
929
+ covariance_yy_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_y_f64x2, covariance_yy_f64x2);
930
+ covariance_yz_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_z_f64x2, covariance_yz_f64x2);
931
+ covariance_zx_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_x_f64x2, covariance_zx_f64x2);
932
+ covariance_zy_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_y_f64x2, covariance_zy_f64x2);
933
+ covariance_zz_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_z_f64x2, covariance_zz_f64x2);
934
+
935
+ norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, a_x_f64x2, norm_squared_a_f64x2);
936
+ norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, a_y_f64x2, norm_squared_a_f64x2);
937
+ norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, a_z_f64x2, norm_squared_a_f64x2);
938
+ norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_x_f64x2, b_x_f64x2, norm_squared_b_f64x2);
939
+ norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_y_f64x2, b_y_f64x2, norm_squared_b_f64x2);
940
+ norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_z_f64x2, b_z_f64x2, norm_squared_b_f64x2);
965
941
  }
966
942
 
967
943
  // Reduce vector accumulators.
@@ -971,16 +947,28 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
971
947
  nk_f64_t sum_b_x = nk_reduce_stable_f64x2_v128relaxed_(sum_b_x_f64x2), sum_b_x_compensation = 0.0;
972
948
  nk_f64_t sum_b_y = nk_reduce_stable_f64x2_v128relaxed_(sum_b_y_f64x2), sum_b_y_compensation = 0.0;
973
949
  nk_f64_t sum_b_z = nk_reduce_stable_f64x2_v128relaxed_(sum_b_z_f64x2), sum_b_z_compensation = 0.0;
974
- nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_v128relaxed_(cov_xx_f64x2), covariance_x_x_compensation = 0.0;
975
- nk_f64_t covariance_x_y = nk_reduce_stable_f64x2_v128relaxed_(cov_xy_f64x2), covariance_x_y_compensation = 0.0;
976
- nk_f64_t covariance_x_z = nk_reduce_stable_f64x2_v128relaxed_(cov_xz_f64x2), covariance_x_z_compensation = 0.0;
977
- nk_f64_t covariance_y_x = nk_reduce_stable_f64x2_v128relaxed_(cov_yx_f64x2), covariance_y_x_compensation = 0.0;
978
- nk_f64_t covariance_y_y = nk_reduce_stable_f64x2_v128relaxed_(cov_yy_f64x2), covariance_y_y_compensation = 0.0;
979
- nk_f64_t covariance_y_z = nk_reduce_stable_f64x2_v128relaxed_(cov_yz_f64x2), covariance_y_z_compensation = 0.0;
980
- nk_f64_t covariance_z_x = nk_reduce_stable_f64x2_v128relaxed_(cov_zx_f64x2), covariance_z_x_compensation = 0.0;
981
- nk_f64_t covariance_z_y = nk_reduce_stable_f64x2_v128relaxed_(cov_zy_f64x2), covariance_z_y_compensation = 0.0;
982
- nk_f64_t covariance_z_z = nk_reduce_stable_f64x2_v128relaxed_(cov_zz_f64x2), covariance_z_z_compensation = 0.0;
983
- nk_f64_t sum_sq_a = nk_reduce_stable_f64x2_v128relaxed_(variance_a_f64x2), sum_sq_a_compensation = 0.0;
950
+ nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_xx_f64x2),
951
+ covariance_x_x_compensation = 0.0;
952
+ nk_f64_t covariance_x_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_xy_f64x2),
953
+ covariance_x_y_compensation = 0.0;
954
+ nk_f64_t covariance_x_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_xz_f64x2),
955
+ covariance_x_z_compensation = 0.0;
956
+ nk_f64_t covariance_y_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_yx_f64x2),
957
+ covariance_y_x_compensation = 0.0;
958
+ nk_f64_t covariance_y_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_yy_f64x2),
959
+ covariance_y_y_compensation = 0.0;
960
+ nk_f64_t covariance_y_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_yz_f64x2),
961
+ covariance_y_z_compensation = 0.0;
962
+ nk_f64_t covariance_z_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_zx_f64x2),
963
+ covariance_z_x_compensation = 0.0;
964
+ nk_f64_t covariance_z_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_zy_f64x2),
965
+ covariance_z_y_compensation = 0.0;
966
+ nk_f64_t covariance_z_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_zz_f64x2),
967
+ covariance_z_z_compensation = 0.0;
968
+ nk_f64_t norm_squared_a_sum = nk_reduce_stable_f64x2_v128relaxed_(norm_squared_a_f64x2),
969
+ norm_squared_a_compensation = 0.0;
970
+ nk_f64_t norm_squared_b_sum = nk_reduce_stable_f64x2_v128relaxed_(norm_squared_b_f64x2),
971
+ norm_squared_b_compensation = 0.0;
984
972
 
985
973
  // Scalar tail
986
974
  for (; i < n; ++i) {
@@ -1001,9 +989,12 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
1001
989
  nk_accumulate_product_f64_(&covariance_z_x, &covariance_z_x_compensation, az, bx),
1002
990
  nk_accumulate_product_f64_(&covariance_z_y, &covariance_z_y_compensation, az, by),
1003
991
  nk_accumulate_product_f64_(&covariance_z_z, &covariance_z_z_compensation, az, bz);
1004
- nk_accumulate_square_f64_(&sum_sq_a, &sum_sq_a_compensation, ax),
1005
- nk_accumulate_square_f64_(&sum_sq_a, &sum_sq_a_compensation, ay),
1006
- nk_accumulate_square_f64_(&sum_sq_a, &sum_sq_a_compensation, az);
992
+ nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, ax),
993
+ nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, ay),
994
+ nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, az);
995
+ nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, bx),
996
+ nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, by),
997
+ nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, bz);
1007
998
  }
1008
999
 
1009
1000
  sum_a_x += sum_a_x_compensation, sum_a_y += sum_a_y_compensation, sum_a_z += sum_a_z_compensation;
@@ -1014,7 +1005,8 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
1014
1005
  covariance_y_z += covariance_y_z_compensation;
1015
1006
  covariance_z_x += covariance_z_x_compensation, covariance_z_y += covariance_z_y_compensation,
1016
1007
  covariance_z_z += covariance_z_z_compensation;
1017
- sum_sq_a += sum_sq_a_compensation;
1008
+ norm_squared_a_sum += norm_squared_a_compensation;
1009
+ norm_squared_b_sum += norm_squared_b_compensation;
1018
1010
 
1019
1011
  // Compute centroids
1020
1012
  nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
@@ -1025,9 +1017,15 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
1025
1017
  if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
1026
1018
  if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
1027
1019
 
1028
- // Compute variance of A (centered)
1029
- nk_f64_t centroid_sq = centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y + centroid_a_z * centroid_a_z;
1030
- nk_f64_t var_a = sum_sq_a * inv_points_count - centroid_sq;
1020
+ // Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
1021
+ nk_f64_t centered_norm_squared_a = norm_squared_a_sum -
1022
+ (nk_f64_t)n * (centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y +
1023
+ centroid_a_z * centroid_a_z);
1024
+ nk_f64_t centered_norm_squared_b = norm_squared_b_sum -
1025
+ (nk_f64_t)n * (centroid_b_x * centroid_b_x + centroid_b_y * centroid_b_y +
1026
+ centroid_b_z * centroid_b_z);
1027
+ if (centered_norm_squared_a < 0.0) centered_norm_squared_a = 0.0;
1028
+ if (centered_norm_squared_b < 0.0) centered_norm_squared_b = 0.0;
1031
1029
 
1032
1030
  // Apply centering correction: H_centered = H - n * centroid_a * centroid_bT
1033
1031
  covariance_x_x -= n * centroid_a_x * centroid_b_x;
@@ -1043,29 +1041,57 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
1043
1041
  // Compute SVD
1044
1042
  nk_f64_t cross_covariance[9] = {covariance_x_x, covariance_x_y, covariance_x_z, covariance_y_x, covariance_y_y,
1045
1043
  covariance_y_z, covariance_z_x, covariance_z_y, covariance_z_z};
1046
- nk_f64_t svd_u[9], svd_s[9], svd_v[9];
1047
- nk_svd3x3_f64_(cross_covariance, svd_u, svd_s, svd_v);
1048
-
1049
- nk_f64_t r[9];
1050
- nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
1051
1044
 
1052
- // Handle reflection and compute scale
1053
- nk_f64_t det = nk_det3x3_f64_(r);
1054
- nk_f64_t trace_d_s = svd_s[0] + svd_s[4] + (det < 0 ? -svd_s[8] : svd_s[8]);
1055
- nk_f64_t computed_scale = trace_d_s / (n * var_a);
1056
-
1057
- if (det < 0) {
1058
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
1059
- nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
1045
+ // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
1046
+ nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
1047
+ cross_covariance[4] * cross_covariance[4] +
1048
+ cross_covariance[8] * cross_covariance[8];
1049
+ nk_f64_t covariance_offdiagonal_norm_squared =
1050
+ cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
1051
+ cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
1052
+ cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
1053
+ nk_f64_t optimal_rotation[9];
1054
+ nk_f64_t trace_rotation_covariance;
1055
+ nk_f64_t computed_scale;
1056
+ if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
1057
+ cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
1058
+ optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
1059
+ optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
1060
+ optimal_rotation[8] = 1;
1061
+ trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
1062
+ computed_scale = centered_norm_squared_a > 0.0 ? trace_rotation_covariance / centered_norm_squared_a : 0.0;
1063
+ }
1064
+ else {
1065
+ nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
1066
+ nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
1067
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
1068
+
1069
+ // Handle reflection and compute scale
1070
+ nk_f64_t det = nk_det3x3_f64_(optimal_rotation);
1071
+ nk_f64_t trace_d_s = svd_diagonal[0] + svd_diagonal[4] + (det < 0 ? -svd_diagonal[8] : svd_diagonal[8]);
1072
+ computed_scale = centered_norm_squared_a > 0.0 ? trace_d_s / centered_norm_squared_a : 0.0;
1073
+
1074
+ if (det < 0) {
1075
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
1076
+ nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
1077
+ }
1078
+
1079
+ trace_rotation_covariance =
1080
+ optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
1081
+ optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
1082
+ optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
1083
+ optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
1084
+ optimal_rotation[8] * cross_covariance[8];
1060
1085
  }
1061
1086
 
1062
1087
  if (rotation)
1063
- for (int j = 0; j < 9; ++j) rotation[j] = r[j];
1088
+ for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
1064
1089
  if (scale) *scale = computed_scale;
1065
1090
 
1066
- // Compute RMSD after transformation
1067
- nk_f64_t sum_squared = nk_transformed_ssd_f64_v128relaxed_(a, b, n, r, computed_scale, centroid_a_x, centroid_a_y,
1068
- centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
1091
+ // Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² − 2c·trace(R · H_centered).
1092
+ nk_f64_t sum_squared = computed_scale * computed_scale * centered_norm_squared_a + centered_norm_squared_b -
1093
+ 2.0 * computed_scale * trace_rotation_covariance;
1094
+ if (sum_squared < 0.0) sum_squared = 0.0;
1069
1095
  *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count);
1070
1096
  }
1071
1097