numkong 7.4.3 → 7.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -644,12 +644,10 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
644
644
  rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
645
645
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
646
646
  if (scale) *scale = 1.0f;
647
+ if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
648
+ if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
647
649
 
648
- // Fused single-pass: centroids + squared differences in f64, using the identity:
649
- // RMSD = √(E[(a-b)²] - (ā - b̄)²)
650
650
  __m512d const zeros_f64x8 = _mm512_setzero_pd();
651
- __m512d sum_a_x_f64x8 = zeros_f64x8, sum_a_y_f64x8 = zeros_f64x8, sum_a_z_f64x8 = zeros_f64x8;
652
- __m512d sum_b_x_f64x8 = zeros_f64x8, sum_b_y_f64x8 = zeros_f64x8, sum_b_z_f64x8 = zeros_f64x8;
653
651
  __m512d sum_squared_x_f64x8 = zeros_f64x8, sum_squared_y_f64x8 = zeros_f64x8, sum_squared_z_f64x8 = zeros_f64x8;
654
652
  __m512 a_x_f32x16, a_y_f32x16, a_z_f32x16, b_x_f32x16, b_y_f32x16, b_z_f32x16;
655
653
  nk_size_t i = 0;
@@ -672,13 +670,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
672
670
  __m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
673
671
  __m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
674
672
 
675
- sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
676
- sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
677
- sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
678
- sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
679
- sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
680
- sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
681
-
682
673
  __m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
683
674
  __m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
684
675
  __m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
@@ -708,13 +699,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
708
699
  b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
709
700
  b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
710
701
 
711
- sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
712
- sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
713
- sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
714
- sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
715
- sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
716
- sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
717
-
718
702
  delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
719
703
  delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
720
704
  delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
@@ -746,13 +730,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
746
730
  __m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
747
731
  __m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
748
732
 
749
- sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
750
- sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
751
- sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
752
- sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
753
- sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
754
- sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
755
-
756
733
  __m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
757
734
  __m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
758
735
  __m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
@@ -796,13 +773,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
796
773
  __m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
797
774
  __m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
798
775
 
799
- sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
800
- sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
801
- sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
802
- sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
803
- sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
804
- sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
805
-
806
776
  __m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
807
777
  __m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
808
778
  __m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
@@ -817,32 +787,10 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
817
787
  sum_squared_z_f64x8 = _mm512_fmadd_pd(delta_z_high_f64x8, delta_z_high_f64x8, sum_squared_z_f64x8);
818
788
  }
819
789
 
820
- // Reduce and compute centroids
821
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
822
- nk_f64_t total_ax = _mm512_reduce_add_pd(sum_a_x_f64x8);
823
- nk_f64_t total_ay = _mm512_reduce_add_pd(sum_a_y_f64x8);
824
- nk_f64_t total_az = _mm512_reduce_add_pd(sum_a_z_f64x8);
825
- nk_f64_t total_bx = _mm512_reduce_add_pd(sum_b_x_f64x8);
826
- nk_f64_t total_by = _mm512_reduce_add_pd(sum_b_y_f64x8);
827
- nk_f64_t total_bz = _mm512_reduce_add_pd(sum_b_z_f64x8);
828
790
  nk_f64_t total_sq_x = _mm512_reduce_add_pd(sum_squared_x_f64x8);
829
791
  nk_f64_t total_sq_y = _mm512_reduce_add_pd(sum_squared_y_f64x8);
830
792
  nk_f64_t total_sq_z = _mm512_reduce_add_pd(sum_squared_z_f64x8);
831
-
832
- nk_f64_t centroid_a_x = total_ax * inv_n, centroid_a_y = total_ay * inv_n, centroid_a_z = total_az * inv_n;
833
- nk_f64_t centroid_b_x = total_bx * inv_n, centroid_b_y = total_by * inv_n, centroid_b_z = total_bz * inv_n;
834
- if (a_centroid)
835
- a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
836
- a_centroid[2] = (nk_f32_t)centroid_a_z;
837
- if (b_centroid)
838
- b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
839
- b_centroid[2] = (nk_f32_t)centroid_b_z;
840
-
841
- nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
842
- mean_diff_z = centroid_a_z - centroid_b_z;
843
- nk_f64_t sum_squared = total_sq_x + total_sq_y + total_sq_z;
844
- nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
845
- *result = nk_f64_sqrt_haswell(sum_squared * inv_n - mean_diff_sq);
793
+ *result = nk_f64_sqrt_haswell((total_sq_x + total_sq_y + total_sq_z) / (nk_f64_t)n);
846
794
  }
847
795
 
848
796
  NK_PUBLIC void nk_kabsch_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
@@ -1008,21 +956,15 @@ NK_PUBLIC void nk_kabsch_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_si
1008
956
 
1009
957
  NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
1010
958
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
1011
- // RMSD uses identity rotation and scale=1.0.
1012
959
  if (rotation)
1013
960
  rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
1014
961
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
1015
962
  if (scale) *scale = 1.0;
1016
- // Optimized fused single-pass implementation for f64.
1017
- // Computes centroids and squared differences in one pass using the identity:
1018
- // RMSD = √(E[(a-ā) - (b-b̄)]²)
1019
- // = √(E[(a-b)²] - (ā - b̄)²)
963
+ if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
964
+ if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
965
+
1020
966
  __m512i const gather_idx_i64x8 = _mm512_setr_epi64(0, 3, 6, 9, 12, 15, 18, 21);
1021
967
  __m512d const zeros_f64x8 = _mm512_setzero_pd();
1022
-
1023
- // Accumulators for centroids and squared differences
1024
- __m512d sum_a_x_f64x8 = zeros_f64x8, sum_a_y_f64x8 = zeros_f64x8, sum_a_z_f64x8 = zeros_f64x8;
1025
- __m512d sum_b_x_f64x8 = zeros_f64x8, sum_b_y_f64x8 = zeros_f64x8, sum_b_z_f64x8 = zeros_f64x8;
1026
968
  __m512d sum_squared_x_f64x8 = zeros_f64x8, sum_squared_y_f64x8 = zeros_f64x8, sum_squared_z_f64x8 = zeros_f64x8;
1027
969
 
1028
970
  __m512d a_x_f64x8, a_y_f64x8, a_z_f64x8, b_x_f64x8, b_y_f64x8, b_z_f64x8;
@@ -1034,13 +976,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
1034
976
  nk_deinterleave_f64x8_skylake_(a + i * 3, &a_x_f64x8, &a_y_f64x8, &a_z_f64x8);
1035
977
  nk_deinterleave_f64x8_skylake_(b + i * 3, &b_x_f64x8, &b_y_f64x8, &b_z_f64x8);
1036
978
 
1037
- sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
1038
- sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
1039
- sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
1040
- sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
1041
- sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
1042
- sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
1043
-
1044
979
  __m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
1045
980
  delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
1046
981
  delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
@@ -1053,13 +988,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
1053
988
  nk_deinterleave_f64x8_skylake_(a + (i + 8) * 3, &a_x1_f64x8, &a_y1_f64x8, &a_z1_f64x8);
1054
989
  nk_deinterleave_f64x8_skylake_(b + (i + 8) * 3, &b_x1_f64x8, &b_y1_f64x8, &b_z1_f64x8);
1055
990
 
1056
- sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x1_f64x8),
1057
- sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y1_f64x8),
1058
- sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z1_f64x8);
1059
- sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x1_f64x8),
1060
- sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y1_f64x8),
1061
- sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z1_f64x8);
1062
-
1063
991
  __m512d delta_x1_f64x8 = _mm512_sub_pd(a_x1_f64x8, b_x1_f64x8),
1064
992
  delta_y1_f64x8 = _mm512_sub_pd(a_y1_f64x8, b_y1_f64x8),
1065
993
  delta_z1_f64x8 = _mm512_sub_pd(a_z1_f64x8, b_z1_f64x8);
@@ -1073,13 +1001,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
1073
1001
  nk_deinterleave_f64x8_skylake_(a + i * 3, &a_x_f64x8, &a_y_f64x8, &a_z_f64x8);
1074
1002
  nk_deinterleave_f64x8_skylake_(b + i * 3, &b_x_f64x8, &b_y_f64x8, &b_z_f64x8);
1075
1003
 
1076
- sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
1077
- sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
1078
- sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
1079
- sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
1080
- sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
1081
- sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
1082
-
1083
1004
  __m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
1084
1005
  delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
1085
1006
  delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
@@ -1102,13 +1023,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
1102
1023
  b_y_f64x8 = _mm512_mask_i64gather_pd(zeros_f64x8, mask, gather_idx_i64x8, b_tail + 1, 8);
1103
1024
  b_z_f64x8 = _mm512_mask_i64gather_pd(zeros_f64x8, mask, gather_idx_i64x8, b_tail + 2, 8);
1104
1025
 
1105
- sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
1106
- sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
1107
- sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
1108
- sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
1109
- sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
1110
- sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
1111
-
1112
1026
  __m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
1113
1027
  delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
1114
1028
  delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
@@ -1118,14 +1032,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
1118
1032
  i = n;
1119
1033
  }
1120
1034
 
1121
- // Reduce and compute centroids.
1122
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
1123
- nk_f64_t total_ax = nk_reduce_stable_f64x8_skylake_(sum_a_x_f64x8), total_ax_compensation = 0.0;
1124
- nk_f64_t total_ay = nk_reduce_stable_f64x8_skylake_(sum_a_y_f64x8), total_ay_compensation = 0.0;
1125
- nk_f64_t total_az = nk_reduce_stable_f64x8_skylake_(sum_a_z_f64x8), total_az_compensation = 0.0;
1126
- nk_f64_t total_bx = nk_reduce_stable_f64x8_skylake_(sum_b_x_f64x8), total_bx_compensation = 0.0;
1127
- nk_f64_t total_by = nk_reduce_stable_f64x8_skylake_(sum_b_y_f64x8), total_by_compensation = 0.0;
1128
- nk_f64_t total_bz = nk_reduce_stable_f64x8_skylake_(sum_b_z_f64x8), total_bz_compensation = 0.0;
1129
1035
  nk_f64_t total_squared_x = nk_reduce_stable_f64x8_skylake_(sum_squared_x_f64x8), total_squared_x_compensation = 0.0;
1130
1036
  nk_f64_t total_squared_y = nk_reduce_stable_f64x8_skylake_(sum_squared_y_f64x8), total_squared_y_compensation = 0.0;
1131
1037
  nk_f64_t total_squared_z = nk_reduce_stable_f64x8_skylake_(sum_squared_z_f64x8), total_squared_z_compensation = 0.0;
@@ -1133,37 +1039,15 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
1133
1039
  for (; i < n; ++i) {
1134
1040
  nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
1135
1041
  nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
1136
- nk_accumulate_sum_f64_(&total_ax, &total_ax_compensation, ax);
1137
- nk_accumulate_sum_f64_(&total_ay, &total_ay_compensation, ay);
1138
- nk_accumulate_sum_f64_(&total_az, &total_az_compensation, az);
1139
- nk_accumulate_sum_f64_(&total_bx, &total_bx_compensation, bx);
1140
- nk_accumulate_sum_f64_(&total_by, &total_by_compensation, by);
1141
- nk_accumulate_sum_f64_(&total_bz, &total_bz_compensation, bz);
1142
1042
  nk_f64_t delta_x = ax - bx, delta_y = ay - by, delta_z = az - bz;
1143
1043
  nk_accumulate_square_f64_(&total_squared_x, &total_squared_x_compensation, delta_x);
1144
1044
  nk_accumulate_square_f64_(&total_squared_y, &total_squared_y_compensation, delta_y);
1145
1045
  nk_accumulate_square_f64_(&total_squared_z, &total_squared_z_compensation, delta_z);
1146
1046
  }
1147
1047
 
1148
- total_ax += total_ax_compensation, total_ay += total_ay_compensation, total_az += total_az_compensation;
1149
- total_bx += total_bx_compensation, total_by += total_by_compensation, total_bz += total_bz_compensation;
1150
1048
  total_squared_x += total_squared_x_compensation, total_squared_y += total_squared_y_compensation,
1151
1049
  total_squared_z += total_squared_z_compensation;
1152
-
1153
- nk_f64_t centroid_a_x = total_ax * inv_n, centroid_a_y = total_ay * inv_n, centroid_a_z = total_az * inv_n;
1154
- nk_f64_t centroid_b_x = total_bx * inv_n, centroid_b_y = total_by * inv_n, centroid_b_z = total_bz * inv_n;
1155
-
1156
- if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
1157
- if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
1158
-
1159
- // Compute RMSD using the formula:
1160
- // RMSD = √(E[(a-b)²] - (ā - b̄)²).
1161
- nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
1162
- mean_diff_z = centroid_a_z - centroid_b_z;
1163
- nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
1164
- nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
1165
-
1166
- *result = nk_f64_sqrt_haswell(sum_squared * inv_n - mean_diff_sq);
1050
+ *result = nk_f64_sqrt_haswell((total_squared_x + total_squared_y + total_squared_z) / (nk_f64_t)n);
1167
1051
  }
1168
1052
 
1169
1053
  NK_PUBLIC void nk_kabsch_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
@@ -570,16 +570,10 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
570
570
  rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
571
571
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
572
572
  if (scale) *scale = 1.0f;
573
+ if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
574
+ if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
573
575
 
574
- // Fused single-pass: accumulate centroids and squared differences simultaneously.
575
- // RMSD = √(E[(a−b)²] − (ā − b̄)²)
576
576
  v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
577
- v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
578
- v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
579
- v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
580
- v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
581
- v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
582
- v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
583
577
  v128_t sum_sq_x_low_f64x2 = zero_f64x2, sum_sq_x_high_f64x2 = zero_f64x2;
584
578
  v128_t sum_sq_y_low_f64x2 = zero_f64x2, sum_sq_y_high_f64x2 = zero_f64x2;
585
579
  v128_t sum_sq_z_low_f64x2 = zero_f64x2, sum_sq_z_high_f64x2 = zero_f64x2;
@@ -590,8 +584,7 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
590
584
  nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
591
585
  nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
592
586
 
593
- // Promote lower and upper halves to f64. Deltas computed in f64 to avoid
594
- // f32 cancellation in the single-pass formula RMSD = √(E[(a−b)²] − (ā − b̄)²).
587
+ // Promote lower and upper halves to f64 for precision.
595
588
  v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
596
589
  v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
597
590
  v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
@@ -605,21 +598,7 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
605
598
  v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
606
599
  v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
607
600
 
608
- // Accumulate centroids.
609
- sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2);
610
- sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
611
- sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2);
612
- sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
613
- sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2);
614
- sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
615
- sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2);
616
- sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
617
- sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2);
618
- sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
619
- sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2);
620
- sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
621
-
622
- // Accumulate squared differences in f64 — deltas computed in f64 for precision.
601
+ // Accumulate squared differences in f64.
623
602
  v128_t dx_low_f64x2 = wasm_f64x2_sub(a_x_low_f64x2, b_x_low_f64x2);
624
603
  v128_t dx_high_f64x2 = wasm_f64x2_sub(a_x_high_f64x2, b_x_high_f64x2);
625
604
  v128_t dy_low_f64x2 = wasm_f64x2_sub(a_y_low_f64x2, b_y_low_f64x2);
@@ -635,12 +614,6 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
635
614
  sum_sq_z_high_f64x2 = wasm_f64x2_relaxed_madd(dz_high_f64x2, dz_high_f64x2, sum_sq_z_high_f64x2);
636
615
  }
637
616
 
638
- nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
639
- nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
640
- nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
641
- nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
642
- nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
643
- nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
644
617
  nk_f64_t sum_sq_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_x_low_f64x2, sum_sq_x_high_f64x2));
645
618
  nk_f64_t sum_sq_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_y_low_f64x2, sum_sq_y_high_f64x2));
646
619
  nk_f64_t sum_sq_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_z_low_f64x2, sum_sq_z_high_f64x2));
@@ -649,45 +622,25 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
649
622
  for (; index < n; ++index) {
650
623
  nk_f64_t ax = a[index * 3 + 0], ay = a[index * 3 + 1], az = a[index * 3 + 2];
651
624
  nk_f64_t bx = b[index * 3 + 0], by = b[index * 3 + 1], bz = b[index * 3 + 2];
652
- sum_a_x += ax, sum_a_y += ay, sum_a_z += az;
653
- sum_b_x += bx, sum_b_y += by, sum_b_z += bz;
654
625
  nk_f64_t dx = ax - bx, dy = ay - by, dz = az - bz;
655
626
  sum_sq_x += dx * dx, sum_sq_y += dy * dy, sum_sq_z += dz * dz;
656
627
  }
657
628
 
658
- nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
659
- nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
660
- centroid_a_z = sum_a_z * inv_points_count;
661
- nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
662
- centroid_b_z = sum_b_z * inv_points_count;
663
- if (a_centroid)
664
- a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
665
- a_centroid[2] = (nk_f32_t)centroid_a_z;
666
- if (b_centroid)
667
- b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
668
- b_centroid[2] = (nk_f32_t)centroid_b_z;
669
-
670
- nk_f64_t sum_squared = sum_sq_x + sum_sq_y + sum_sq_z;
671
- nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
672
- nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
673
- nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
674
- nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
675
- *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
629
+ *result = nk_f64_sqrt_v128relaxed((sum_sq_x + sum_sq_y + sum_sq_z) / (nk_f64_t)n);
676
630
  }
677
631
 
678
632
  NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
679
633
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
680
- // RMSD uses identity rotation and scale=1.0
681
634
  if (rotation)
682
635
  rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
683
636
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
684
637
  if (scale) *scale = 1.0;
638
+ if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
639
+ if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
685
640
 
686
641
  v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
687
642
 
688
- // Accumulators for centroids and squared differences
689
- v128_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
690
- v128_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
643
+ // Accumulators for squared differences
691
644
  v128_t sum_squared_x_f64x2 = zeros_f64x2, sum_squared_y_f64x2 = zeros_f64x2, sum_squared_z_f64x2 = zeros_f64x2;
692
645
 
693
646
  v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
@@ -698,13 +651,6 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
698
651
  nk_deinterleave_f64x2_v128relaxed_(a + i * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
699
652
  nk_deinterleave_f64x2_v128relaxed_(b + i * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
700
653
 
701
- sum_a_x_f64x2 = wasm_f64x2_add(sum_a_x_f64x2, a_x_f64x2);
702
- sum_a_y_f64x2 = wasm_f64x2_add(sum_a_y_f64x2, a_y_f64x2);
703
- sum_a_z_f64x2 = wasm_f64x2_add(sum_a_z_f64x2, a_z_f64x2);
704
- sum_b_x_f64x2 = wasm_f64x2_add(sum_b_x_f64x2, b_x_f64x2);
705
- sum_b_y_f64x2 = wasm_f64x2_add(sum_b_y_f64x2, b_y_f64x2);
706
- sum_b_z_f64x2 = wasm_f64x2_add(sum_b_z_f64x2, b_z_f64x2);
707
-
708
654
  v128_t delta_x_f64x2 = wasm_f64x2_sub(a_x_f64x2, b_x_f64x2);
709
655
  v128_t delta_y_f64x2 = wasm_f64x2_sub(a_y_f64x2, b_y_f64x2);
710
656
  v128_t delta_z_f64x2 = wasm_f64x2_sub(a_z_f64x2, b_z_f64x2);
@@ -715,12 +661,6 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
715
661
  }
716
662
 
717
663
  // Reduce vectors to scalars.
718
- nk_f64_t total_ax = nk_reduce_stable_f64x2_v128relaxed_(sum_a_x_f64x2), total_ax_compensation = 0.0;
719
- nk_f64_t total_ay = nk_reduce_stable_f64x2_v128relaxed_(sum_a_y_f64x2), total_ay_compensation = 0.0;
720
- nk_f64_t total_az = nk_reduce_stable_f64x2_v128relaxed_(sum_a_z_f64x2), total_az_compensation = 0.0;
721
- nk_f64_t total_bx = nk_reduce_stable_f64x2_v128relaxed_(sum_b_x_f64x2), total_bx_compensation = 0.0;
722
- nk_f64_t total_by = nk_reduce_stable_f64x2_v128relaxed_(sum_b_y_f64x2), total_by_compensation = 0.0;
723
- nk_f64_t total_bz = nk_reduce_stable_f64x2_v128relaxed_(sum_b_z_f64x2), total_bz_compensation = 0.0;
724
664
  nk_f64_t total_squared_x = nk_reduce_stable_f64x2_v128relaxed_(sum_squared_x_f64x2),
725
665
  total_squared_x_compensation = 0.0;
726
666
  nk_f64_t total_squared_y = nk_reduce_stable_f64x2_v128relaxed_(sum_squared_y_f64x2),
@@ -732,40 +672,16 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
732
672
  for (; i < n; ++i) {
733
673
  nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
734
674
  nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
735
- nk_accumulate_sum_f64_(&total_ax, &total_ax_compensation, ax);
736
- nk_accumulate_sum_f64_(&total_ay, &total_ay_compensation, ay);
737
- nk_accumulate_sum_f64_(&total_az, &total_az_compensation, az);
738
- nk_accumulate_sum_f64_(&total_bx, &total_bx_compensation, bx);
739
- nk_accumulate_sum_f64_(&total_by, &total_by_compensation, by);
740
- nk_accumulate_sum_f64_(&total_bz, &total_bz_compensation, bz);
741
675
  nk_f64_t delta_x = ax - bx, delta_y = ay - by, delta_z = az - bz;
742
676
  nk_accumulate_square_f64_(&total_squared_x, &total_squared_x_compensation, delta_x);
743
677
  nk_accumulate_square_f64_(&total_squared_y, &total_squared_y_compensation, delta_y);
744
678
  nk_accumulate_square_f64_(&total_squared_z, &total_squared_z_compensation, delta_z);
745
679
  }
746
680
 
747
- total_ax += total_ax_compensation, total_ay += total_ay_compensation, total_az += total_az_compensation;
748
- total_bx += total_bx_compensation, total_by += total_by_compensation, total_bz += total_bz_compensation;
749
681
  total_squared_x += total_squared_x_compensation, total_squared_y += total_squared_y_compensation,
750
682
  total_squared_z += total_squared_z_compensation;
751
683
 
752
- // Compute centroids
753
- nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
754
- nk_f64_t centroid_a_x = total_ax * inv_points_count, centroid_a_y = total_ay * inv_points_count,
755
- centroid_a_z = total_az * inv_points_count;
756
- nk_f64_t centroid_b_x = total_bx * inv_points_count, centroid_b_y = total_by * inv_points_count,
757
- centroid_b_z = total_bz * inv_points_count;
758
- if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
759
- if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
760
-
761
- // Compute RMSD
762
- nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
763
- nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
764
- nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
765
- nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
766
- nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
767
-
768
- *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
684
+ *result = nk_f64_sqrt_v128relaxed((total_squared_x + total_squared_y + total_squared_z) / (nk_f64_t)n);
769
685
  }
770
686
 
771
687
  NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
@@ -6,7 +6,7 @@
6
6
  *
7
7
  * Contains:
8
8
  *
9
- * - Root Mean Square Deviation (RMSD) for rigid body superposition
9
+ * - Root Mean Square Deviation (RMSD) of raw point differences
10
10
  * - Kabsch algorithm for optimal rigid body alignment (rotation only)
11
11
  * - Umeyama algorithm for similarity transform (rotation + uniform scaling)
12
12
  *
@@ -48,7 +48,7 @@
48
48
  *
49
49
  * @section algorithm_overview Algorithm Overview
50
50
  *
51
- * - RMSD: Simple root mean square deviation without alignment. R = identity, scale = 1.0
51
+ * - RMSD: Raw √(Σ‖aᵢ − bᵢ‖² / n) without centering or alignment. R = identity, scale = 1.0, centroids zeroed
52
52
  * - Kabsch: Finds optimal rotation R minimizing ‖R × (a - ā) - (b - b̄)‖. scale = 1.0
53
53
  * - Umeyama: Finds optimal rotation R and scale c minimizing ‖c × R × (a - ā) - (b - b̄)‖
54
54
  *
@@ -354,74 +354,30 @@ void rmsd( //
354
354
  else if constexpr (std::is_same_v<in_type_, bf16_t> && simd)
355
355
  nk_rmsd_bf16(&a->raw_, &b->raw_, n, &a_centroid->raw_, &b_centroid->raw_, &rotation->raw_,
356
356
  scale ? &scale->raw_ : nullptr, &metric->raw_);
357
- // Scalar fallback
357
+ // Scalar fallback: raw √(Σ‖aᵢ − bᵢ‖² / n), no centering
358
358
  else {
359
- // Step 1: Compute centroids
360
- metric_type_ sum_a_x {}, sum_a_y {}, sum_a_z {};
361
- metric_type_ sum_b_x {}, sum_b_y {}, sum_b_z {};
362
- metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
363
-
364
- for (std::size_t i = 0; i < n; i++) {
365
- val_a_x = metric_type_(a[i * 3 + 0]);
366
- val_a_y = metric_type_(a[i * 3 + 1]);
367
- val_a_z = metric_type_(a[i * 3 + 2]);
368
- val_b_x = metric_type_(b[i * 3 + 0]);
369
- val_b_y = metric_type_(b[i * 3 + 1]);
370
- val_b_z = metric_type_(b[i * 3 + 2]);
371
- sum_a_x = sum_a_x + val_a_x;
372
- sum_a_y = sum_a_y + val_a_y;
373
- sum_a_z = sum_a_z + val_a_z;
374
- sum_b_x = sum_b_x + val_b_x;
375
- sum_b_y = sum_b_y + val_b_y;
376
- sum_b_z = sum_b_z + val_b_z;
377
- }
378
-
379
- metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
380
- metric_type_ centroid_a_x = sum_a_x * inv_n;
381
- metric_type_ centroid_a_y = sum_a_y * inv_n;
382
- metric_type_ centroid_a_z = sum_a_z * inv_n;
383
- metric_type_ centroid_b_x = sum_b_x * inv_n;
384
- metric_type_ centroid_b_y = sum_b_y * inv_n;
385
- metric_type_ centroid_b_z = sum_b_z * inv_n;
386
-
387
- // Step 2: Store centroids if requested
388
359
  if (a_centroid)
389
- a_centroid[0] = transform_type_(centroid_a_x), a_centroid[1] = transform_type_(centroid_a_y),
390
- a_centroid[2] = transform_type_(centroid_a_z);
360
+ a_centroid[0] = transform_type_(0.0), a_centroid[1] = transform_type_(0.0),
361
+ a_centroid[2] = transform_type_(0.0);
391
362
  if (b_centroid)
392
- b_centroid[0] = transform_type_(centroid_b_x), b_centroid[1] = transform_type_(centroid_b_y),
393
- b_centroid[2] = transform_type_(centroid_b_z);
394
-
395
- // Step 3: RMSD uses identity rotation and scale=1.0
363
+ b_centroid[0] = transform_type_(0.0), b_centroid[1] = transform_type_(0.0),
364
+ b_centroid[2] = transform_type_(0.0);
396
365
  if (rotation) {
397
- rotation[0] = transform_type_(1.0);
398
- rotation[1] = transform_type_(0.0);
399
- rotation[2] = transform_type_(0.0);
400
- rotation[3] = transform_type_(0.0);
401
- rotation[4] = transform_type_(1.0);
402
- rotation[5] = transform_type_(0.0);
403
- rotation[6] = transform_type_(0.0);
404
- rotation[7] = transform_type_(0.0);
405
- rotation[8] = transform_type_(1.0);
366
+ rotation[0] = transform_type_(1.0), rotation[1] = transform_type_(0.0), rotation[2] = transform_type_(0.0);
367
+ rotation[3] = transform_type_(0.0), rotation[4] = transform_type_(1.0), rotation[5] = transform_type_(0.0);
368
+ rotation[6] = transform_type_(0.0), rotation[7] = transform_type_(0.0), rotation[8] = transform_type_(1.0);
406
369
  }
407
370
  if (scale) *scale = transform_type_(1.0);
408
371
 
409
- // Step 4: Compute RMSD between centered point clouds
410
372
  metric_type_ sum_squared {};
411
373
  for (std::size_t i = 0; i < n; i++) {
412
- val_a_x = metric_type_(a[i * 3 + 0]);
413
- val_a_y = metric_type_(a[i * 3 + 1]);
414
- val_a_z = metric_type_(a[i * 3 + 2]);
415
- val_b_x = metric_type_(b[i * 3 + 0]);
416
- val_b_y = metric_type_(b[i * 3 + 1]);
417
- val_b_z = metric_type_(b[i * 3 + 2]);
418
- metric_type_ dx = (val_a_x - centroid_a_x) - (val_b_x - centroid_b_x);
419
- metric_type_ dy = (val_a_y - centroid_a_y) - (val_b_y - centroid_b_y);
420
- metric_type_ dz = (val_a_z - centroid_a_z) - (val_b_z - centroid_b_z);
374
+ metric_type_ dx = metric_type_(a[i * 3 + 0]) - metric_type_(b[i * 3 + 0]);
375
+ metric_type_ dy = metric_type_(a[i * 3 + 1]) - metric_type_(b[i * 3 + 1]);
376
+ metric_type_ dz = metric_type_(a[i * 3 + 2]) - metric_type_(b[i * 3 + 2]);
421
377
  sum_squared = sum_squared + dx * dx + dy * dy + dz * dz;
422
378
  }
423
379
 
424
- *metric = (sum_squared * inv_n).sqrt();
380
+ *metric = (sum_squared / metric_type_(static_cast<double>(n))).sqrt();
425
381
  }
426
382
  }
427
383
 
@@ -470,18 +426,12 @@ void kabsch( //
470
426
  metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
471
427
 
472
428
  for (std::size_t i = 0; i < n; i++) {
473
- val_a_x = metric_type_(a[i * 3 + 0]);
474
- val_a_y = metric_type_(a[i * 3 + 1]);
429
+ val_a_x = metric_type_(a[i * 3 + 0]), val_a_y = metric_type_(a[i * 3 + 1]),
475
430
  val_a_z = metric_type_(a[i * 3 + 2]);
476
- val_b_x = metric_type_(b[i * 3 + 0]);
477
- val_b_y = metric_type_(b[i * 3 + 1]);
431
+ val_b_x = metric_type_(b[i * 3 + 0]), val_b_y = metric_type_(b[i * 3 + 1]),
478
432
  val_b_z = metric_type_(b[i * 3 + 2]);
479
- sum_a_x = sum_a_x + val_a_x;
480
- sum_a_y = sum_a_y + val_a_y;
481
- sum_a_z = sum_a_z + val_a_z;
482
- sum_b_x = sum_b_x + val_b_x;
483
- sum_b_y = sum_b_y + val_b_y;
484
- sum_b_z = sum_b_z + val_b_z;
433
+ sum_a_x = sum_a_x + val_a_x, sum_a_y = sum_a_y + val_a_y, sum_a_z = sum_a_z + val_a_z;
434
+ sum_b_x = sum_b_x + val_b_x, sum_b_y = sum_b_y + val_b_y, sum_b_z = sum_b_z + val_b_z;
485
435
  }
486
436
 
487
437
  metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
@@ -503,11 +453,9 @@ void kabsch( //
503
453
  // Step 2: Build 3x3 covariance matrix H = (A - A_bar)^T x (B - B_bar)
504
454
  metric_type_ cross_covariance[9] = {};
505
455
  for (std::size_t i = 0; i < n; i++) {
506
- val_a_x = metric_type_(a[i * 3 + 0]) - centroid_a_x;
507
- val_a_y = metric_type_(a[i * 3 + 1]) - centroid_a_y;
456
+ val_a_x = metric_type_(a[i * 3 + 0]) - centroid_a_x, val_a_y = metric_type_(a[i * 3 + 1]) - centroid_a_y,
508
457
  val_a_z = metric_type_(a[i * 3 + 2]) - centroid_a_z;
509
- val_b_x = metric_type_(b[i * 3 + 0]) - centroid_b_x;
510
- val_b_y = metric_type_(b[i * 3 + 1]) - centroid_b_y;
458
+ val_b_x = metric_type_(b[i * 3 + 0]) - centroid_b_x, val_b_y = metric_type_(b[i * 3 + 1]) - centroid_b_y,
511
459
  val_b_z = metric_type_(b[i * 3 + 2]) - centroid_b_z;
512
460
  cross_covariance[0] = cross_covariance[0] + val_a_x * val_b_x;
513
461
  cross_covariance[1] = cross_covariance[1] + val_a_x * val_b_y;
@@ -563,11 +511,11 @@ void kabsch( //
563
511
  metric_type_ sum_squared {};
564
512
  for (std::size_t i = 0; i < n; i++) {
565
513
  metric_type_ point_a[3], point_b[3], rotated_point_a[3];
566
- point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x;
567
- point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y;
514
+ point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x,
515
+ point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y,
568
516
  point_a[2] = metric_type_(a[i * 3 + 2]) - centroid_a_z;
569
- point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x;
570
- point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y;
517
+ point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x,
518
+ point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y,
571
519
  point_b[2] = metric_type_(b[i * 3 + 2]) - centroid_b_z;
572
520
  rotated_point_a[0] = rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] +
573
521
  rotation_matrix[2] * point_a[2];
@@ -628,18 +576,12 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
628
576
  metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
629
577
 
630
578
  for (std::size_t i = 0; i < n; i++) {
631
- val_a_x = metric_type_(a[i * 3 + 0]);
632
- val_a_y = metric_type_(a[i * 3 + 1]);
579
+ val_a_x = metric_type_(a[i * 3 + 0]), val_a_y = metric_type_(a[i * 3 + 1]),
633
580
  val_a_z = metric_type_(a[i * 3 + 2]);
634
- val_b_x = metric_type_(b[i * 3 + 0]);
635
- val_b_y = metric_type_(b[i * 3 + 1]);
581
+ val_b_x = metric_type_(b[i * 3 + 0]), val_b_y = metric_type_(b[i * 3 + 1]),
636
582
  val_b_z = metric_type_(b[i * 3 + 2]);
637
- sum_a_x = sum_a_x + val_a_x;
638
- sum_a_y = sum_a_y + val_a_y;
639
- sum_a_z = sum_a_z + val_a_z;
640
- sum_b_x = sum_b_x + val_b_x;
641
- sum_b_y = sum_b_y + val_b_y;
642
- sum_b_z = sum_b_z + val_b_z;
583
+ sum_a_x = sum_a_x + val_a_x, sum_a_y = sum_a_y + val_a_y, sum_a_z = sum_a_z + val_a_z;
584
+ sum_b_x = sum_b_x + val_b_x, sum_b_y = sum_b_y + val_b_y, sum_b_z = sum_b_z + val_b_z;
643
585
  }
644
586
 
645
587
  metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
@@ -650,16 +592,13 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
650
592
  metric_type_ centroid_b_y = sum_b_y * inv_n;
651
593
  metric_type_ centroid_b_z = sum_b_z * inv_n;
652
594
 
653
- if (a_centroid) {
654
- a_centroid[0] = transform_type_(centroid_a_x);
655
- a_centroid[1] = transform_type_(centroid_a_y);
595
+ if (a_centroid)
596
+ a_centroid[0] = transform_type_(centroid_a_x), a_centroid[1] = transform_type_(centroid_a_y),
656
597
  a_centroid[2] = transform_type_(centroid_a_z);
657
- }
658
- if (b_centroid) {
659
- b_centroid[0] = transform_type_(centroid_b_x);
660
- b_centroid[1] = transform_type_(centroid_b_y);
598
+
599
+ if (b_centroid)
600
+ b_centroid[0] = transform_type_(centroid_b_x), b_centroid[1] = transform_type_(centroid_b_y),
661
601
  b_centroid[2] = transform_type_(centroid_b_z);
662
- }
663
602
 
664
603
  // Step 2: Build covariance matrix H and compute variance of A
665
604
  metric_type_ cross_covariance[9] = {};
@@ -733,11 +672,11 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
733
672
  metric_type_ sum_squared {};
734
673
  for (std::size_t i = 0; i < n; i++) {
735
674
  metric_type_ point_a[3], point_b[3], rotated_point_a[3];
736
- point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x;
737
- point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y;
675
+ point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x,
676
+ point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y,
738
677
  point_a[2] = metric_type_(a[i * 3 + 2]) - centroid_a_z;
739
- point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x;
740
- point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y;
678
+ point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x,
679
+ point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y,
741
680
  point_b[2] = metric_type_(b[i * 3 + 2]) - centroid_b_z;
742
681
  rotated_point_a[0] = scale_factor * (rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] +
743
682
  rotation_matrix[2] * point_a[2]);