npm - numkong - Versions diffs - 7.4.3 → 7.4.5 - Mend

numkong 7.4.3 → 7.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/README.md +49 -49
package/binding.gyp +3 -0
package/include/numkong/capabilities.h +1 -1
package/include/numkong/each/haswell.h +4 -4
package/include/numkong/maxsim/sme.h +65 -27
package/include/numkong/mesh/README.md +13 -27
package/include/numkong/mesh/haswell.h +25 -122
package/include/numkong/mesh/neon.h +21 -110
package/include/numkong/mesh/neonbfdot.h +4 -43
package/include/numkong/mesh/rvv.h +7 -82
package/include/numkong/mesh/serial.h +26 -53
package/include/numkong/mesh/skylake.h +7 -123
package/include/numkong/mesh/v128relaxed.h +9 -93
package/include/numkong/mesh.h +2 -2
package/include/numkong/mesh.hpp +35 -96
package/include/numkong/types.h +15 -9
package/numkong.gypi +3 -0
package/package.json +7 -7
package/wasm/numkong.wasm +0 -0

package/include/numkong/mesh/skylake.h CHANGED Viewed

@@ -644,12 +644,10 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
         rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
         rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
     if (scale) *scale = 1.0f;
+    if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
+    if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
-    // Fused single-pass: centroids + squared differences in f64, using the identity:
-    //   RMSD = √(E[(a-b)²] - (ā - b̄)²)
     __m512d const zeros_f64x8 = _mm512_setzero_pd();
-    __m512d sum_a_x_f64x8 = zeros_f64x8, sum_a_y_f64x8 = zeros_f64x8, sum_a_z_f64x8 = zeros_f64x8;
-    __m512d sum_b_x_f64x8 = zeros_f64x8, sum_b_y_f64x8 = zeros_f64x8, sum_b_z_f64x8 = zeros_f64x8;
     __m512d sum_squared_x_f64x8 = zeros_f64x8, sum_squared_y_f64x8 = zeros_f64x8, sum_squared_z_f64x8 = zeros_f64x8;
     __m512 a_x_f32x16, a_y_f32x16, a_z_f32x16, b_x_f32x16, b_y_f32x16, b_z_f32x16;
     nk_size_t i = 0;
@@ -672,13 +670,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
         __m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
         __m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
-        sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
-        sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
-        sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
-        sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
-        sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
-        sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
         __m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
         __m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
         __m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
@@ -708,13 +699,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
         b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
         b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
-        sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
-        sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
-        sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
-        sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
-        sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
-        sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
         delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
         delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
         delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
@@ -746,13 +730,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
         __m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
         __m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
-        sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
-        sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
-        sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
-        sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
-        sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
-        sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
         __m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
         __m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
         __m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
@@ -796,13 +773,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
         __m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
         __m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
-        sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
-        sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
-        sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
-        sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
-        sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
-        sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
         __m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
         __m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
         __m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
@@ -817,32 +787,10 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
         sum_squared_z_f64x8 = _mm512_fmadd_pd(delta_z_high_f64x8, delta_z_high_f64x8, sum_squared_z_f64x8);
     }
-    // Reduce and compute centroids
-    nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
-    nk_f64_t total_ax = _mm512_reduce_add_pd(sum_a_x_f64x8);
-    nk_f64_t total_ay = _mm512_reduce_add_pd(sum_a_y_f64x8);
-    nk_f64_t total_az = _mm512_reduce_add_pd(sum_a_z_f64x8);
-    nk_f64_t total_bx = _mm512_reduce_add_pd(sum_b_x_f64x8);
-    nk_f64_t total_by = _mm512_reduce_add_pd(sum_b_y_f64x8);
-    nk_f64_t total_bz = _mm512_reduce_add_pd(sum_b_z_f64x8);
     nk_f64_t total_sq_x = _mm512_reduce_add_pd(sum_squared_x_f64x8);
     nk_f64_t total_sq_y = _mm512_reduce_add_pd(sum_squared_y_f64x8);
     nk_f64_t total_sq_z = _mm512_reduce_add_pd(sum_squared_z_f64x8);
-    nk_f64_t centroid_a_x = total_ax * inv_n, centroid_a_y = total_ay * inv_n, centroid_a_z = total_az * inv_n;
-    nk_f64_t centroid_b_x = total_bx * inv_n, centroid_b_y = total_by * inv_n, centroid_b_z = total_bz * inv_n;
-    if (a_centroid)
-        a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
-        a_centroid[2] = (nk_f32_t)centroid_a_z;
-    if (b_centroid)
-        b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
-        b_centroid[2] = (nk_f32_t)centroid_b_z;
-    nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
-             mean_diff_z = centroid_a_z - centroid_b_z;
-    nk_f64_t sum_squared = total_sq_x + total_sq_y + total_sq_z;
-    nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
-    *result = nk_f64_sqrt_haswell(sum_squared * inv_n - mean_diff_sq);
+    *result = nk_f64_sqrt_haswell((total_sq_x + total_sq_y + total_sq_z) / (nk_f64_t)n);
 }
 NK_PUBLIC void nk_kabsch_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
@@ -1008,21 +956,15 @@ NK_PUBLIC void nk_kabsch_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_si
 NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
                                    nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
-    // RMSD uses identity rotation and scale=1.0.
     if (rotation)
         rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
         rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
     if (scale) *scale = 1.0;
-    // Optimized fused single-pass implementation for f64.
-    // Computes centroids and squared differences in one pass using the identity:
-    //   RMSD = √(E[(a-ā) - (b-b̄)]²)
-    //        = √(E[(a-b)²] - (ā - b̄)²)
+    if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
+    if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
     __m512i const gather_idx_i64x8 = _mm512_setr_epi64(0, 3, 6, 9, 12, 15, 18, 21);
     __m512d const zeros_f64x8 = _mm512_setzero_pd();
-    // Accumulators for centroids and squared differences
-    __m512d sum_a_x_f64x8 = zeros_f64x8, sum_a_y_f64x8 = zeros_f64x8, sum_a_z_f64x8 = zeros_f64x8;
-    __m512d sum_b_x_f64x8 = zeros_f64x8, sum_b_y_f64x8 = zeros_f64x8, sum_b_z_f64x8 = zeros_f64x8;
     __m512d sum_squared_x_f64x8 = zeros_f64x8, sum_squared_y_f64x8 = zeros_f64x8, sum_squared_z_f64x8 = zeros_f64x8;
     __m512d a_x_f64x8, a_y_f64x8, a_z_f64x8, b_x_f64x8, b_y_f64x8, b_z_f64x8;
@@ -1034,13 +976,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
         nk_deinterleave_f64x8_skylake_(a + i * 3, &a_x_f64x8, &a_y_f64x8, &a_z_f64x8);
         nk_deinterleave_f64x8_skylake_(b + i * 3, &b_x_f64x8, &b_y_f64x8, &b_z_f64x8);
-        sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
-        sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
-        sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
-        sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
-        sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
-        sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
         __m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
                 delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
                 delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
@@ -1053,13 +988,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
         nk_deinterleave_f64x8_skylake_(a + (i + 8) * 3, &a_x1_f64x8, &a_y1_f64x8, &a_z1_f64x8);
         nk_deinterleave_f64x8_skylake_(b + (i + 8) * 3, &b_x1_f64x8, &b_y1_f64x8, &b_z1_f64x8);
-        sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x1_f64x8),
-        sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y1_f64x8),
-        sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z1_f64x8);
-        sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x1_f64x8),
-        sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y1_f64x8),
-        sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z1_f64x8);
         __m512d delta_x1_f64x8 = _mm512_sub_pd(a_x1_f64x8, b_x1_f64x8),
                 delta_y1_f64x8 = _mm512_sub_pd(a_y1_f64x8, b_y1_f64x8),
                 delta_z1_f64x8 = _mm512_sub_pd(a_z1_f64x8, b_z1_f64x8);
@@ -1073,13 +1001,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
         nk_deinterleave_f64x8_skylake_(a + i * 3, &a_x_f64x8, &a_y_f64x8, &a_z_f64x8);
         nk_deinterleave_f64x8_skylake_(b + i * 3, &b_x_f64x8, &b_y_f64x8, &b_z_f64x8);
-        sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
-        sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
-        sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
-        sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
-        sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
-        sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
         __m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
                 delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
                 delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
@@ -1102,13 +1023,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
         b_y_f64x8 = _mm512_mask_i64gather_pd(zeros_f64x8, mask, gather_idx_i64x8, b_tail + 1, 8);
         b_z_f64x8 = _mm512_mask_i64gather_pd(zeros_f64x8, mask, gather_idx_i64x8, b_tail + 2, 8);
-        sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
-        sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
-        sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
-        sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
-        sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
-        sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
         __m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
                 delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
                 delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
@@ -1118,14 +1032,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
         i = n;
     }
-    // Reduce and compute centroids.
-    nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
-    nk_f64_t total_ax = nk_reduce_stable_f64x8_skylake_(sum_a_x_f64x8), total_ax_compensation = 0.0;
-    nk_f64_t total_ay = nk_reduce_stable_f64x8_skylake_(sum_a_y_f64x8), total_ay_compensation = 0.0;
-    nk_f64_t total_az = nk_reduce_stable_f64x8_skylake_(sum_a_z_f64x8), total_az_compensation = 0.0;
-    nk_f64_t total_bx = nk_reduce_stable_f64x8_skylake_(sum_b_x_f64x8), total_bx_compensation = 0.0;
-    nk_f64_t total_by = nk_reduce_stable_f64x8_skylake_(sum_b_y_f64x8), total_by_compensation = 0.0;
-    nk_f64_t total_bz = nk_reduce_stable_f64x8_skylake_(sum_b_z_f64x8), total_bz_compensation = 0.0;
     nk_f64_t total_squared_x = nk_reduce_stable_f64x8_skylake_(sum_squared_x_f64x8), total_squared_x_compensation = 0.0;
     nk_f64_t total_squared_y = nk_reduce_stable_f64x8_skylake_(sum_squared_y_f64x8), total_squared_y_compensation = 0.0;
     nk_f64_t total_squared_z = nk_reduce_stable_f64x8_skylake_(sum_squared_z_f64x8), total_squared_z_compensation = 0.0;
@@ -1133,37 +1039,15 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
     for (; i < n; ++i) {
         nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
         nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
-        nk_accumulate_sum_f64_(&total_ax, &total_ax_compensation, ax);
-        nk_accumulate_sum_f64_(&total_ay, &total_ay_compensation, ay);
-        nk_accumulate_sum_f64_(&total_az, &total_az_compensation, az);
-        nk_accumulate_sum_f64_(&total_bx, &total_bx_compensation, bx);
-        nk_accumulate_sum_f64_(&total_by, &total_by_compensation, by);
-        nk_accumulate_sum_f64_(&total_bz, &total_bz_compensation, bz);
         nk_f64_t delta_x = ax - bx, delta_y = ay - by, delta_z = az - bz;
         nk_accumulate_square_f64_(&total_squared_x, &total_squared_x_compensation, delta_x);
         nk_accumulate_square_f64_(&total_squared_y, &total_squared_y_compensation, delta_y);
         nk_accumulate_square_f64_(&total_squared_z, &total_squared_z_compensation, delta_z);
     }
-    total_ax += total_ax_compensation, total_ay += total_ay_compensation, total_az += total_az_compensation;
-    total_bx += total_bx_compensation, total_by += total_by_compensation, total_bz += total_bz_compensation;
     total_squared_x += total_squared_x_compensation, total_squared_y += total_squared_y_compensation,
         total_squared_z += total_squared_z_compensation;
-    nk_f64_t centroid_a_x = total_ax * inv_n, centroid_a_y = total_ay * inv_n, centroid_a_z = total_az * inv_n;
-    nk_f64_t centroid_b_x = total_bx * inv_n, centroid_b_y = total_by * inv_n, centroid_b_z = total_bz * inv_n;
-    if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
-    if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
-    // Compute RMSD using the formula:
-    // RMSD = √(E[(a-b)²] - (ā - b̄)²).
-    nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
-             mean_diff_z = centroid_a_z - centroid_b_z;
-    nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
-    nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
-    *result = nk_f64_sqrt_haswell(sum_squared * inv_n - mean_diff_sq);
+    *result = nk_f64_sqrt_haswell((total_squared_x + total_squared_y + total_squared_z) / (nk_f64_t)n);
 }
 NK_PUBLIC void nk_kabsch_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,

package/include/numkong/mesh/v128relaxed.h CHANGED Viewed

@@ -570,16 +570,10 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
         rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
         rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
     if (scale) *scale = 1.0f;
+    if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
+    if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
-    // Fused single-pass: accumulate centroids and squared differences simultaneously.
-    // RMSD = √(E[(a−b)²] − (ā − b̄)²)
     v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
-    v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
-    v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
-    v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
-    v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
-    v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
-    v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
     v128_t sum_sq_x_low_f64x2 = zero_f64x2, sum_sq_x_high_f64x2 = zero_f64x2;
     v128_t sum_sq_y_low_f64x2 = zero_f64x2, sum_sq_y_high_f64x2 = zero_f64x2;
     v128_t sum_sq_z_low_f64x2 = zero_f64x2, sum_sq_z_high_f64x2 = zero_f64x2;
@@ -590,8 +584,7 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
         nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
         nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
-        // Promote lower and upper halves to f64. Deltas computed in f64 to avoid
-        // f32 cancellation in the single-pass formula RMSD = √(E[(a−b)²] − (ā − b̄)²).
+        // Promote lower and upper halves to f64 for precision.
         v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
         v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
         v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
@@ -605,21 +598,7 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
         v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
         v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
-        // Accumulate centroids.
-        sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2);
-        sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
-        sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2);
-        sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
-        sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2);
-        sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
-        sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2);
-        sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
-        sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2);
-        sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
-        sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2);
-        sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
-        // Accumulate squared differences in f64 — deltas computed in f64 for precision.
+        // Accumulate squared differences in f64.
         v128_t dx_low_f64x2 = wasm_f64x2_sub(a_x_low_f64x2, b_x_low_f64x2);
         v128_t dx_high_f64x2 = wasm_f64x2_sub(a_x_high_f64x2, b_x_high_f64x2);
         v128_t dy_low_f64x2 = wasm_f64x2_sub(a_y_low_f64x2, b_y_low_f64x2);
@@ -635,12 +614,6 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
         sum_sq_z_high_f64x2 = wasm_f64x2_relaxed_madd(dz_high_f64x2, dz_high_f64x2, sum_sq_z_high_f64x2);
     }
-    nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
-    nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
-    nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
-    nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
-    nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
-    nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
     nk_f64_t sum_sq_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_x_low_f64x2, sum_sq_x_high_f64x2));
     nk_f64_t sum_sq_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_y_low_f64x2, sum_sq_y_high_f64x2));
     nk_f64_t sum_sq_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_z_low_f64x2, sum_sq_z_high_f64x2));
@@ -649,45 +622,25 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
     for (; index < n; ++index) {
         nk_f64_t ax = a[index * 3 + 0], ay = a[index * 3 + 1], az = a[index * 3 + 2];
         nk_f64_t bx = b[index * 3 + 0], by = b[index * 3 + 1], bz = b[index * 3 + 2];
-        sum_a_x += ax, sum_a_y += ay, sum_a_z += az;
-        sum_b_x += bx, sum_b_y += by, sum_b_z += bz;
         nk_f64_t dx = ax - bx, dy = ay - by, dz = az - bz;
         sum_sq_x += dx * dx, sum_sq_y += dy * dy, sum_sq_z += dz * dz;
     }
-    nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
-    nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
-             centroid_a_z = sum_a_z * inv_points_count;
-    nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
-             centroid_b_z = sum_b_z * inv_points_count;
-    if (a_centroid)
-        a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
-        a_centroid[2] = (nk_f32_t)centroid_a_z;
-    if (b_centroid)
-        b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
-        b_centroid[2] = (nk_f32_t)centroid_b_z;
-    nk_f64_t sum_squared = sum_sq_x + sum_sq_y + sum_sq_z;
-    nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
-    nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
-    nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
-    nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
-    *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
+    *result = nk_f64_sqrt_v128relaxed((sum_sq_x + sum_sq_y + sum_sq_z) / (nk_f64_t)n);
 }
 NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
                                        nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
-    // RMSD uses identity rotation and scale=1.0
     if (rotation)
         rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
         rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
     if (scale) *scale = 1.0;
+    if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
+    if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
     v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
-    // Accumulators for centroids and squared differences
-    v128_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
-    v128_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
+    // Accumulators for squared differences
     v128_t sum_squared_x_f64x2 = zeros_f64x2, sum_squared_y_f64x2 = zeros_f64x2, sum_squared_z_f64x2 = zeros_f64x2;
     v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
@@ -698,13 +651,6 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
         nk_deinterleave_f64x2_v128relaxed_(a + i * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
         nk_deinterleave_f64x2_v128relaxed_(b + i * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
-        sum_a_x_f64x2 = wasm_f64x2_add(sum_a_x_f64x2, a_x_f64x2);
-        sum_a_y_f64x2 = wasm_f64x2_add(sum_a_y_f64x2, a_y_f64x2);
-        sum_a_z_f64x2 = wasm_f64x2_add(sum_a_z_f64x2, a_z_f64x2);
-        sum_b_x_f64x2 = wasm_f64x2_add(sum_b_x_f64x2, b_x_f64x2);
-        sum_b_y_f64x2 = wasm_f64x2_add(sum_b_y_f64x2, b_y_f64x2);
-        sum_b_z_f64x2 = wasm_f64x2_add(sum_b_z_f64x2, b_z_f64x2);
         v128_t delta_x_f64x2 = wasm_f64x2_sub(a_x_f64x2, b_x_f64x2);
         v128_t delta_y_f64x2 = wasm_f64x2_sub(a_y_f64x2, b_y_f64x2);
         v128_t delta_z_f64x2 = wasm_f64x2_sub(a_z_f64x2, b_z_f64x2);
@@ -715,12 +661,6 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
     }
     // Reduce vectors to scalars.
-    nk_f64_t total_ax = nk_reduce_stable_f64x2_v128relaxed_(sum_a_x_f64x2), total_ax_compensation = 0.0;
-    nk_f64_t total_ay = nk_reduce_stable_f64x2_v128relaxed_(sum_a_y_f64x2), total_ay_compensation = 0.0;
-    nk_f64_t total_az = nk_reduce_stable_f64x2_v128relaxed_(sum_a_z_f64x2), total_az_compensation = 0.0;
-    nk_f64_t total_bx = nk_reduce_stable_f64x2_v128relaxed_(sum_b_x_f64x2), total_bx_compensation = 0.0;
-    nk_f64_t total_by = nk_reduce_stable_f64x2_v128relaxed_(sum_b_y_f64x2), total_by_compensation = 0.0;
-    nk_f64_t total_bz = nk_reduce_stable_f64x2_v128relaxed_(sum_b_z_f64x2), total_bz_compensation = 0.0;
     nk_f64_t total_squared_x = nk_reduce_stable_f64x2_v128relaxed_(sum_squared_x_f64x2),
              total_squared_x_compensation = 0.0;
     nk_f64_t total_squared_y = nk_reduce_stable_f64x2_v128relaxed_(sum_squared_y_f64x2),
@@ -732,40 +672,16 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
     for (; i < n; ++i) {
         nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
         nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
-        nk_accumulate_sum_f64_(&total_ax, &total_ax_compensation, ax);
-        nk_accumulate_sum_f64_(&total_ay, &total_ay_compensation, ay);
-        nk_accumulate_sum_f64_(&total_az, &total_az_compensation, az);
-        nk_accumulate_sum_f64_(&total_bx, &total_bx_compensation, bx);
-        nk_accumulate_sum_f64_(&total_by, &total_by_compensation, by);
-        nk_accumulate_sum_f64_(&total_bz, &total_bz_compensation, bz);
         nk_f64_t delta_x = ax - bx, delta_y = ay - by, delta_z = az - bz;
         nk_accumulate_square_f64_(&total_squared_x, &total_squared_x_compensation, delta_x);
         nk_accumulate_square_f64_(&total_squared_y, &total_squared_y_compensation, delta_y);
         nk_accumulate_square_f64_(&total_squared_z, &total_squared_z_compensation, delta_z);
     }
-    total_ax += total_ax_compensation, total_ay += total_ay_compensation, total_az += total_az_compensation;
-    total_bx += total_bx_compensation, total_by += total_by_compensation, total_bz += total_bz_compensation;
     total_squared_x += total_squared_x_compensation, total_squared_y += total_squared_y_compensation,
         total_squared_z += total_squared_z_compensation;
-    // Compute centroids
-    nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
-    nk_f64_t centroid_a_x = total_ax * inv_points_count, centroid_a_y = total_ay * inv_points_count,
-             centroid_a_z = total_az * inv_points_count;
-    nk_f64_t centroid_b_x = total_bx * inv_points_count, centroid_b_y = total_by * inv_points_count,
-             centroid_b_z = total_bz * inv_points_count;
-    if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
-    if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
-    // Compute RMSD
-    nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
-    nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
-    nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
-    nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
-    nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
-    *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
+    *result = nk_f64_sqrt_v128relaxed((total_squared_x + total_squared_y + total_squared_z) / (nk_f64_t)n);
 }
 NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,

package/include/numkong/mesh.h CHANGED Viewed

@@ -6,7 +6,7 @@
  *
  *  Contains:
  *
- *  - Root Mean Square Deviation (RMSD) for rigid body superposition
+ *  - Root Mean Square Deviation (RMSD) of raw point differences
  *  - Kabsch algorithm for optimal rigid body alignment (rotation only)
  *  - Umeyama algorithm for similarity transform (rotation + uniform scaling)
  *
@@ -48,7 +48,7 @@
  *
  *  @section algorithm_overview Algorithm Overview
  *
- *  - RMSD: Simple root mean square deviation without alignment. R = identity, scale = 1.0
+ *  - RMSD: Raw √(Σ‖aᵢ − bᵢ‖² / n) without centering or alignment. R = identity, scale = 1.0, centroids zeroed
  *  - Kabsch: Finds optimal rotation R minimizing ‖R × (a - ā) - (b - b̄)‖. scale = 1.0
  *  - Umeyama: Finds optimal rotation R and scale c minimizing ‖c × R × (a - ā) - (b - b̄)‖
  *

package/include/numkong/mesh.hpp CHANGED Viewed

@@ -354,74 +354,30 @@ void rmsd(                                               //
     else if constexpr (std::is_same_v<in_type_, bf16_t> && simd)
         nk_rmsd_bf16(&a->raw_, &b->raw_, n, &a_centroid->raw_, &b_centroid->raw_, &rotation->raw_,
                      scale ? &scale->raw_ : nullptr, &metric->raw_);
-    // Scalar fallback
+    // Scalar fallback: raw √(Σ‖aᵢ − bᵢ‖² / n), no centering
     else {
-        // Step 1: Compute centroids
-        metric_type_ sum_a_x {}, sum_a_y {}, sum_a_z {};
-        metric_type_ sum_b_x {}, sum_b_y {}, sum_b_z {};
-        metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
-        for (std::size_t i = 0; i < n; i++) {
-            val_a_x = metric_type_(a[i * 3 + 0]);
-            val_a_y = metric_type_(a[i * 3 + 1]);
-            val_a_z = metric_type_(a[i * 3 + 2]);
-            val_b_x = metric_type_(b[i * 3 + 0]);
-            val_b_y = metric_type_(b[i * 3 + 1]);
-            val_b_z = metric_type_(b[i * 3 + 2]);
-            sum_a_x = sum_a_x + val_a_x;
-            sum_a_y = sum_a_y + val_a_y;
-            sum_a_z = sum_a_z + val_a_z;
-            sum_b_x = sum_b_x + val_b_x;
-            sum_b_y = sum_b_y + val_b_y;
-            sum_b_z = sum_b_z + val_b_z;
-        }
-        metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
-        metric_type_ centroid_a_x = sum_a_x * inv_n;
-        metric_type_ centroid_a_y = sum_a_y * inv_n;
-        metric_type_ centroid_a_z = sum_a_z * inv_n;
-        metric_type_ centroid_b_x = sum_b_x * inv_n;
-        metric_type_ centroid_b_y = sum_b_y * inv_n;
-        metric_type_ centroid_b_z = sum_b_z * inv_n;
-        // Step 2: Store centroids if requested
         if (a_centroid)
-            a_centroid[0] = transform_type_(centroid_a_x), a_centroid[1] = transform_type_(centroid_a_y),
-            a_centroid[2] = transform_type_(centroid_a_z);
+            a_centroid[0] = transform_type_(0.0), a_centroid[1] = transform_type_(0.0),
+            a_centroid[2] = transform_type_(0.0);
         if (b_centroid)
-            b_centroid[0] = transform_type_(centroid_b_x), b_centroid[1] = transform_type_(centroid_b_y),
-            b_centroid[2] = transform_type_(centroid_b_z);
-        // Step 3: RMSD uses identity rotation and scale=1.0
+            b_centroid[0] = transform_type_(0.0), b_centroid[1] = transform_type_(0.0),
+            b_centroid[2] = transform_type_(0.0);
         if (rotation) {
-            rotation[0] = transform_type_(1.0);
-            rotation[1] = transform_type_(0.0);
-            rotation[2] = transform_type_(0.0);
-            rotation[3] = transform_type_(0.0);
-            rotation[4] = transform_type_(1.0);
-            rotation[5] = transform_type_(0.0);
-            rotation[6] = transform_type_(0.0);
-            rotation[7] = transform_type_(0.0);
-            rotation[8] = transform_type_(1.0);
+            rotation[0] = transform_type_(1.0), rotation[1] = transform_type_(0.0), rotation[2] = transform_type_(0.0);
+            rotation[3] = transform_type_(0.0), rotation[4] = transform_type_(1.0), rotation[5] = transform_type_(0.0);
+            rotation[6] = transform_type_(0.0), rotation[7] = transform_type_(0.0), rotation[8] = transform_type_(1.0);
         }
         if (scale) *scale = transform_type_(1.0);
-        // Step 4: Compute RMSD between centered point clouds
         metric_type_ sum_squared {};
         for (std::size_t i = 0; i < n; i++) {
-            val_a_x = metric_type_(a[i * 3 + 0]);
-            val_a_y = metric_type_(a[i * 3 + 1]);
-            val_a_z = metric_type_(a[i * 3 + 2]);
-            val_b_x = metric_type_(b[i * 3 + 0]);
-            val_b_y = metric_type_(b[i * 3 + 1]);
-            val_b_z = metric_type_(b[i * 3 + 2]);
-            metric_type_ dx = (val_a_x - centroid_a_x) - (val_b_x - centroid_b_x);
-            metric_type_ dy = (val_a_y - centroid_a_y) - (val_b_y - centroid_b_y);
-            metric_type_ dz = (val_a_z - centroid_a_z) - (val_b_z - centroid_b_z);
+            metric_type_ dx = metric_type_(a[i * 3 + 0]) - metric_type_(b[i * 3 + 0]);
+            metric_type_ dy = metric_type_(a[i * 3 + 1]) - metric_type_(b[i * 3 + 1]);
+            metric_type_ dz = metric_type_(a[i * 3 + 2]) - metric_type_(b[i * 3 + 2]);
             sum_squared = sum_squared + dx * dx + dy * dy + dz * dz;
         }
-        *metric = (sum_squared * inv_n).sqrt();
+        *metric = (sum_squared / metric_type_(static_cast<double>(n))).sqrt();
     }
 }
@@ -470,18 +426,12 @@ void kabsch(                                             //
         metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
         for (std::size_t i = 0; i < n; i++) {
-            val_a_x = metric_type_(a[i * 3 + 0]);
-            val_a_y = metric_type_(a[i * 3 + 1]);
+            val_a_x = metric_type_(a[i * 3 + 0]), val_a_y = metric_type_(a[i * 3 + 1]),
             val_a_z = metric_type_(a[i * 3 + 2]);
-            val_b_x = metric_type_(b[i * 3 + 0]);
-            val_b_y = metric_type_(b[i * 3 + 1]);
+            val_b_x = metric_type_(b[i * 3 + 0]), val_b_y = metric_type_(b[i * 3 + 1]),
             val_b_z = metric_type_(b[i * 3 + 2]);
-            sum_a_x = sum_a_x + val_a_x;
-            sum_a_y = sum_a_y + val_a_y;
-            sum_a_z = sum_a_z + val_a_z;
-            sum_b_x = sum_b_x + val_b_x;
-            sum_b_y = sum_b_y + val_b_y;
-            sum_b_z = sum_b_z + val_b_z;
+            sum_a_x = sum_a_x + val_a_x, sum_a_y = sum_a_y + val_a_y, sum_a_z = sum_a_z + val_a_z;
+            sum_b_x = sum_b_x + val_b_x, sum_b_y = sum_b_y + val_b_y, sum_b_z = sum_b_z + val_b_z;
         }
         metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
@@ -503,11 +453,9 @@ void kabsch(                                             //
         // Step 2: Build 3x3 covariance matrix H = (A - A_bar)^T x (B - B_bar)
         metric_type_ cross_covariance[9] = {};
         for (std::size_t i = 0; i < n; i++) {
-            val_a_x = metric_type_(a[i * 3 + 0]) - centroid_a_x;
-            val_a_y = metric_type_(a[i * 3 + 1]) - centroid_a_y;
+            val_a_x = metric_type_(a[i * 3 + 0]) - centroid_a_x, val_a_y = metric_type_(a[i * 3 + 1]) - centroid_a_y,
             val_a_z = metric_type_(a[i * 3 + 2]) - centroid_a_z;
-            val_b_x = metric_type_(b[i * 3 + 0]) - centroid_b_x;
-            val_b_y = metric_type_(b[i * 3 + 1]) - centroid_b_y;
+            val_b_x = metric_type_(b[i * 3 + 0]) - centroid_b_x, val_b_y = metric_type_(b[i * 3 + 1]) - centroid_b_y,
             val_b_z = metric_type_(b[i * 3 + 2]) - centroid_b_z;
             cross_covariance[0] = cross_covariance[0] + val_a_x * val_b_x;
             cross_covariance[1] = cross_covariance[1] + val_a_x * val_b_y;
@@ -563,11 +511,11 @@ void kabsch(                                             //
         metric_type_ sum_squared {};
         for (std::size_t i = 0; i < n; i++) {
             metric_type_ point_a[3], point_b[3], rotated_point_a[3];
-            point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x;
-            point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y;
+            point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x,
+            point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y,
             point_a[2] = metric_type_(a[i * 3 + 2]) - centroid_a_z;
-            point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x;
-            point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y;
+            point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x,
+            point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y,
             point_b[2] = metric_type_(b[i * 3 + 2]) - centroid_b_z;
             rotated_point_a[0] = rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] +
                                  rotation_matrix[2] * point_a[2];
@@ -628,18 +576,12 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
         metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
         for (std::size_t i = 0; i < n; i++) {
-            val_a_x = metric_type_(a[i * 3 + 0]);
-            val_a_y = metric_type_(a[i * 3 + 1]);
+            val_a_x = metric_type_(a[i * 3 + 0]), val_a_y = metric_type_(a[i * 3 + 1]),
             val_a_z = metric_type_(a[i * 3 + 2]);
-            val_b_x = metric_type_(b[i * 3 + 0]);
-            val_b_y = metric_type_(b[i * 3 + 1]);
+            val_b_x = metric_type_(b[i * 3 + 0]), val_b_y = metric_type_(b[i * 3 + 1]),
             val_b_z = metric_type_(b[i * 3 + 2]);
-            sum_a_x = sum_a_x + val_a_x;
-            sum_a_y = sum_a_y + val_a_y;
-            sum_a_z = sum_a_z + val_a_z;
-            sum_b_x = sum_b_x + val_b_x;
-            sum_b_y = sum_b_y + val_b_y;
-            sum_b_z = sum_b_z + val_b_z;
+            sum_a_x = sum_a_x + val_a_x, sum_a_y = sum_a_y + val_a_y, sum_a_z = sum_a_z + val_a_z;
+            sum_b_x = sum_b_x + val_b_x, sum_b_y = sum_b_y + val_b_y, sum_b_z = sum_b_z + val_b_z;
         }
         metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
@@ -650,16 +592,13 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
         metric_type_ centroid_b_y = sum_b_y * inv_n;
         metric_type_ centroid_b_z = sum_b_z * inv_n;
-        if (a_centroid) {
-            a_centroid[0] = transform_type_(centroid_a_x);
-            a_centroid[1] = transform_type_(centroid_a_y);
+        if (a_centroid)
+            a_centroid[0] = transform_type_(centroid_a_x), a_centroid[1] = transform_type_(centroid_a_y),
             a_centroid[2] = transform_type_(centroid_a_z);
-        }
-        if (b_centroid) {
-            b_centroid[0] = transform_type_(centroid_b_x);
-            b_centroid[1] = transform_type_(centroid_b_y);
+        if (b_centroid)
+            b_centroid[0] = transform_type_(centroid_b_x), b_centroid[1] = transform_type_(centroid_b_y),
             b_centroid[2] = transform_type_(centroid_b_z);
-        }
         // Step 2: Build covariance matrix H and compute variance of A
         metric_type_ cross_covariance[9] = {};
@@ -733,11 +672,11 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
         metric_type_ sum_squared {};
         for (std::size_t i = 0; i < n; i++) {
             metric_type_ point_a[3], point_b[3], rotated_point_a[3];
-            point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x;
-            point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y;
+            point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x,
+            point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y,
             point_a[2] = metric_type_(a[i * 3 + 2]) - centroid_a_z;
-            point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x;
-            point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y;
+            point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x,
+            point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y,
             point_b[2] = metric_type_(b[i * 3 + 2]) - centroid_b_z;
             rotated_point_a[0] = scale_factor * (rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] +
                                                  rotation_matrix[2] * point_a[2]);