numkong 7.4.3 → 7.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -49
- package/binding.gyp +3 -0
- package/include/numkong/capabilities.h +1 -1
- package/include/numkong/each/haswell.h +4 -4
- package/include/numkong/maxsim/sme.h +65 -27
- package/include/numkong/mesh/README.md +13 -27
- package/include/numkong/mesh/haswell.h +25 -122
- package/include/numkong/mesh/neon.h +21 -110
- package/include/numkong/mesh/neonbfdot.h +4 -43
- package/include/numkong/mesh/rvv.h +7 -82
- package/include/numkong/mesh/serial.h +26 -53
- package/include/numkong/mesh/skylake.h +7 -123
- package/include/numkong/mesh/v128relaxed.h +9 -93
- package/include/numkong/mesh.h +2 -2
- package/include/numkong/mesh.hpp +35 -96
- package/include/numkong/types.h +15 -9
- package/numkong.gypi +3 -0
- package/package.json +7 -7
- package/wasm/numkong.wasm +0 -0
|
@@ -644,12 +644,10 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
|
|
|
644
644
|
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
|
|
645
645
|
rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
|
|
646
646
|
if (scale) *scale = 1.0f;
|
|
647
|
+
if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
|
|
648
|
+
if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
|
|
647
649
|
|
|
648
|
-
// Fused single-pass: centroids + squared differences in f64, using the identity:
|
|
649
|
-
// RMSD = √(E[(a-b)²] - (ā - b̄)²)
|
|
650
650
|
__m512d const zeros_f64x8 = _mm512_setzero_pd();
|
|
651
|
-
__m512d sum_a_x_f64x8 = zeros_f64x8, sum_a_y_f64x8 = zeros_f64x8, sum_a_z_f64x8 = zeros_f64x8;
|
|
652
|
-
__m512d sum_b_x_f64x8 = zeros_f64x8, sum_b_y_f64x8 = zeros_f64x8, sum_b_z_f64x8 = zeros_f64x8;
|
|
653
651
|
__m512d sum_squared_x_f64x8 = zeros_f64x8, sum_squared_y_f64x8 = zeros_f64x8, sum_squared_z_f64x8 = zeros_f64x8;
|
|
654
652
|
__m512 a_x_f32x16, a_y_f32x16, a_z_f32x16, b_x_f32x16, b_y_f32x16, b_z_f32x16;
|
|
655
653
|
nk_size_t i = 0;
|
|
@@ -672,13 +670,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
|
|
|
672
670
|
__m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
|
|
673
671
|
__m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
|
|
674
672
|
|
|
675
|
-
sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
|
|
676
|
-
sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
|
|
677
|
-
sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
|
|
678
|
-
sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
|
|
679
|
-
sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
|
|
680
|
-
sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
|
|
681
|
-
|
|
682
673
|
__m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
|
|
683
674
|
__m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
|
|
684
675
|
__m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
|
|
@@ -708,13 +699,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
|
|
|
708
699
|
b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
|
|
709
700
|
b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
|
|
710
701
|
|
|
711
|
-
sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
|
|
712
|
-
sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
|
|
713
|
-
sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
|
|
714
|
-
sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
|
|
715
|
-
sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
|
|
716
|
-
sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
|
|
717
|
-
|
|
718
702
|
delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
|
|
719
703
|
delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
|
|
720
704
|
delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
|
|
@@ -746,13 +730,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
|
|
|
746
730
|
__m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
|
|
747
731
|
__m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
|
|
748
732
|
|
|
749
|
-
sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
|
|
750
|
-
sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
|
|
751
|
-
sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
|
|
752
|
-
sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
|
|
753
|
-
sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
|
|
754
|
-
sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
|
|
755
|
-
|
|
756
733
|
__m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
|
|
757
734
|
__m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
|
|
758
735
|
__m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
|
|
@@ -796,13 +773,6 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
|
|
|
796
773
|
__m512d b_z_low_f64x8 = _mm512_cvtps_pd(_mm512_castps512_ps256(b_z_f32x16));
|
|
797
774
|
__m512d b_z_high_f64x8 = _mm512_cvtps_pd(_mm512_extractf32x8_ps(b_z_f32x16, 1));
|
|
798
775
|
|
|
799
|
-
sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, _mm512_add_pd(a_x_low_f64x8, a_x_high_f64x8));
|
|
800
|
-
sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, _mm512_add_pd(a_y_low_f64x8, a_y_high_f64x8));
|
|
801
|
-
sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, _mm512_add_pd(a_z_low_f64x8, a_z_high_f64x8));
|
|
802
|
-
sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, _mm512_add_pd(b_x_low_f64x8, b_x_high_f64x8));
|
|
803
|
-
sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, _mm512_add_pd(b_y_low_f64x8, b_y_high_f64x8));
|
|
804
|
-
sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, _mm512_add_pd(b_z_low_f64x8, b_z_high_f64x8));
|
|
805
|
-
|
|
806
776
|
__m512d delta_x_low_f64x8 = _mm512_sub_pd(a_x_low_f64x8, b_x_low_f64x8);
|
|
807
777
|
__m512d delta_x_high_f64x8 = _mm512_sub_pd(a_x_high_f64x8, b_x_high_f64x8);
|
|
808
778
|
__m512d delta_y_low_f64x8 = _mm512_sub_pd(a_y_low_f64x8, b_y_low_f64x8);
|
|
@@ -817,32 +787,10 @@ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size
|
|
|
817
787
|
sum_squared_z_f64x8 = _mm512_fmadd_pd(delta_z_high_f64x8, delta_z_high_f64x8, sum_squared_z_f64x8);
|
|
818
788
|
}
|
|
819
789
|
|
|
820
|
-
// Reduce and compute centroids
|
|
821
|
-
nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
|
|
822
|
-
nk_f64_t total_ax = _mm512_reduce_add_pd(sum_a_x_f64x8);
|
|
823
|
-
nk_f64_t total_ay = _mm512_reduce_add_pd(sum_a_y_f64x8);
|
|
824
|
-
nk_f64_t total_az = _mm512_reduce_add_pd(sum_a_z_f64x8);
|
|
825
|
-
nk_f64_t total_bx = _mm512_reduce_add_pd(sum_b_x_f64x8);
|
|
826
|
-
nk_f64_t total_by = _mm512_reduce_add_pd(sum_b_y_f64x8);
|
|
827
|
-
nk_f64_t total_bz = _mm512_reduce_add_pd(sum_b_z_f64x8);
|
|
828
790
|
nk_f64_t total_sq_x = _mm512_reduce_add_pd(sum_squared_x_f64x8);
|
|
829
791
|
nk_f64_t total_sq_y = _mm512_reduce_add_pd(sum_squared_y_f64x8);
|
|
830
792
|
nk_f64_t total_sq_z = _mm512_reduce_add_pd(sum_squared_z_f64x8);
|
|
831
|
-
|
|
832
|
-
nk_f64_t centroid_a_x = total_ax * inv_n, centroid_a_y = total_ay * inv_n, centroid_a_z = total_az * inv_n;
|
|
833
|
-
nk_f64_t centroid_b_x = total_bx * inv_n, centroid_b_y = total_by * inv_n, centroid_b_z = total_bz * inv_n;
|
|
834
|
-
if (a_centroid)
|
|
835
|
-
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
836
|
-
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
837
|
-
if (b_centroid)
|
|
838
|
-
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
839
|
-
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
840
|
-
|
|
841
|
-
nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
|
|
842
|
-
mean_diff_z = centroid_a_z - centroid_b_z;
|
|
843
|
-
nk_f64_t sum_squared = total_sq_x + total_sq_y + total_sq_z;
|
|
844
|
-
nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
|
|
845
|
-
*result = nk_f64_sqrt_haswell(sum_squared * inv_n - mean_diff_sq);
|
|
793
|
+
*result = nk_f64_sqrt_haswell((total_sq_x + total_sq_y + total_sq_z) / (nk_f64_t)n);
|
|
846
794
|
}
|
|
847
795
|
|
|
848
796
|
NK_PUBLIC void nk_kabsch_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
|
|
@@ -1008,21 +956,15 @@ NK_PUBLIC void nk_kabsch_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_si
|
|
|
1008
956
|
|
|
1009
957
|
NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
|
|
1010
958
|
nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
|
|
1011
|
-
// RMSD uses identity rotation and scale=1.0.
|
|
1012
959
|
if (rotation)
|
|
1013
960
|
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
|
|
1014
961
|
rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
|
|
1015
962
|
if (scale) *scale = 1.0;
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
// = √(E[(a-b)²] - (ā - b̄)²)
|
|
963
|
+
if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
|
|
964
|
+
if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
|
|
965
|
+
|
|
1020
966
|
__m512i const gather_idx_i64x8 = _mm512_setr_epi64(0, 3, 6, 9, 12, 15, 18, 21);
|
|
1021
967
|
__m512d const zeros_f64x8 = _mm512_setzero_pd();
|
|
1022
|
-
|
|
1023
|
-
// Accumulators for centroids and squared differences
|
|
1024
|
-
__m512d sum_a_x_f64x8 = zeros_f64x8, sum_a_y_f64x8 = zeros_f64x8, sum_a_z_f64x8 = zeros_f64x8;
|
|
1025
|
-
__m512d sum_b_x_f64x8 = zeros_f64x8, sum_b_y_f64x8 = zeros_f64x8, sum_b_z_f64x8 = zeros_f64x8;
|
|
1026
968
|
__m512d sum_squared_x_f64x8 = zeros_f64x8, sum_squared_y_f64x8 = zeros_f64x8, sum_squared_z_f64x8 = zeros_f64x8;
|
|
1027
969
|
|
|
1028
970
|
__m512d a_x_f64x8, a_y_f64x8, a_z_f64x8, b_x_f64x8, b_y_f64x8, b_z_f64x8;
|
|
@@ -1034,13 +976,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
|
|
|
1034
976
|
nk_deinterleave_f64x8_skylake_(a + i * 3, &a_x_f64x8, &a_y_f64x8, &a_z_f64x8);
|
|
1035
977
|
nk_deinterleave_f64x8_skylake_(b + i * 3, &b_x_f64x8, &b_y_f64x8, &b_z_f64x8);
|
|
1036
978
|
|
|
1037
|
-
sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
|
|
1038
|
-
sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
|
|
1039
|
-
sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
|
|
1040
|
-
sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
|
|
1041
|
-
sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
|
|
1042
|
-
sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
|
|
1043
|
-
|
|
1044
979
|
__m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
|
|
1045
980
|
delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
|
|
1046
981
|
delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
|
|
@@ -1053,13 +988,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
|
|
|
1053
988
|
nk_deinterleave_f64x8_skylake_(a + (i + 8) * 3, &a_x1_f64x8, &a_y1_f64x8, &a_z1_f64x8);
|
|
1054
989
|
nk_deinterleave_f64x8_skylake_(b + (i + 8) * 3, &b_x1_f64x8, &b_y1_f64x8, &b_z1_f64x8);
|
|
1055
990
|
|
|
1056
|
-
sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x1_f64x8),
|
|
1057
|
-
sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y1_f64x8),
|
|
1058
|
-
sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z1_f64x8);
|
|
1059
|
-
sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x1_f64x8),
|
|
1060
|
-
sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y1_f64x8),
|
|
1061
|
-
sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z1_f64x8);
|
|
1062
|
-
|
|
1063
991
|
__m512d delta_x1_f64x8 = _mm512_sub_pd(a_x1_f64x8, b_x1_f64x8),
|
|
1064
992
|
delta_y1_f64x8 = _mm512_sub_pd(a_y1_f64x8, b_y1_f64x8),
|
|
1065
993
|
delta_z1_f64x8 = _mm512_sub_pd(a_z1_f64x8, b_z1_f64x8);
|
|
@@ -1073,13 +1001,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
|
|
|
1073
1001
|
nk_deinterleave_f64x8_skylake_(a + i * 3, &a_x_f64x8, &a_y_f64x8, &a_z_f64x8);
|
|
1074
1002
|
nk_deinterleave_f64x8_skylake_(b + i * 3, &b_x_f64x8, &b_y_f64x8, &b_z_f64x8);
|
|
1075
1003
|
|
|
1076
|
-
sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
|
|
1077
|
-
sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
|
|
1078
|
-
sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
|
|
1079
|
-
sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
|
|
1080
|
-
sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
|
|
1081
|
-
sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
|
|
1082
|
-
|
|
1083
1004
|
__m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
|
|
1084
1005
|
delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
|
|
1085
1006
|
delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
|
|
@@ -1102,13 +1023,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
|
|
|
1102
1023
|
b_y_f64x8 = _mm512_mask_i64gather_pd(zeros_f64x8, mask, gather_idx_i64x8, b_tail + 1, 8);
|
|
1103
1024
|
b_z_f64x8 = _mm512_mask_i64gather_pd(zeros_f64x8, mask, gather_idx_i64x8, b_tail + 2, 8);
|
|
1104
1025
|
|
|
1105
|
-
sum_a_x_f64x8 = _mm512_add_pd(sum_a_x_f64x8, a_x_f64x8),
|
|
1106
|
-
sum_a_y_f64x8 = _mm512_add_pd(sum_a_y_f64x8, a_y_f64x8),
|
|
1107
|
-
sum_a_z_f64x8 = _mm512_add_pd(sum_a_z_f64x8, a_z_f64x8);
|
|
1108
|
-
sum_b_x_f64x8 = _mm512_add_pd(sum_b_x_f64x8, b_x_f64x8),
|
|
1109
|
-
sum_b_y_f64x8 = _mm512_add_pd(sum_b_y_f64x8, b_y_f64x8),
|
|
1110
|
-
sum_b_z_f64x8 = _mm512_add_pd(sum_b_z_f64x8, b_z_f64x8);
|
|
1111
|
-
|
|
1112
1026
|
__m512d delta_x_f64x8 = _mm512_sub_pd(a_x_f64x8, b_x_f64x8),
|
|
1113
1027
|
delta_y_f64x8 = _mm512_sub_pd(a_y_f64x8, b_y_f64x8),
|
|
1114
1028
|
delta_z_f64x8 = _mm512_sub_pd(a_z_f64x8, b_z_f64x8);
|
|
@@ -1118,14 +1032,6 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
|
|
|
1118
1032
|
i = n;
|
|
1119
1033
|
}
|
|
1120
1034
|
|
|
1121
|
-
// Reduce and compute centroids.
|
|
1122
|
-
nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
|
|
1123
|
-
nk_f64_t total_ax = nk_reduce_stable_f64x8_skylake_(sum_a_x_f64x8), total_ax_compensation = 0.0;
|
|
1124
|
-
nk_f64_t total_ay = nk_reduce_stable_f64x8_skylake_(sum_a_y_f64x8), total_ay_compensation = 0.0;
|
|
1125
|
-
nk_f64_t total_az = nk_reduce_stable_f64x8_skylake_(sum_a_z_f64x8), total_az_compensation = 0.0;
|
|
1126
|
-
nk_f64_t total_bx = nk_reduce_stable_f64x8_skylake_(sum_b_x_f64x8), total_bx_compensation = 0.0;
|
|
1127
|
-
nk_f64_t total_by = nk_reduce_stable_f64x8_skylake_(sum_b_y_f64x8), total_by_compensation = 0.0;
|
|
1128
|
-
nk_f64_t total_bz = nk_reduce_stable_f64x8_skylake_(sum_b_z_f64x8), total_bz_compensation = 0.0;
|
|
1129
1035
|
nk_f64_t total_squared_x = nk_reduce_stable_f64x8_skylake_(sum_squared_x_f64x8), total_squared_x_compensation = 0.0;
|
|
1130
1036
|
nk_f64_t total_squared_y = nk_reduce_stable_f64x8_skylake_(sum_squared_y_f64x8), total_squared_y_compensation = 0.0;
|
|
1131
1037
|
nk_f64_t total_squared_z = nk_reduce_stable_f64x8_skylake_(sum_squared_z_f64x8), total_squared_z_compensation = 0.0;
|
|
@@ -1133,37 +1039,15 @@ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size
|
|
|
1133
1039
|
for (; i < n; ++i) {
|
|
1134
1040
|
nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
|
|
1135
1041
|
nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
|
|
1136
|
-
nk_accumulate_sum_f64_(&total_ax, &total_ax_compensation, ax);
|
|
1137
|
-
nk_accumulate_sum_f64_(&total_ay, &total_ay_compensation, ay);
|
|
1138
|
-
nk_accumulate_sum_f64_(&total_az, &total_az_compensation, az);
|
|
1139
|
-
nk_accumulate_sum_f64_(&total_bx, &total_bx_compensation, bx);
|
|
1140
|
-
nk_accumulate_sum_f64_(&total_by, &total_by_compensation, by);
|
|
1141
|
-
nk_accumulate_sum_f64_(&total_bz, &total_bz_compensation, bz);
|
|
1142
1042
|
nk_f64_t delta_x = ax - bx, delta_y = ay - by, delta_z = az - bz;
|
|
1143
1043
|
nk_accumulate_square_f64_(&total_squared_x, &total_squared_x_compensation, delta_x);
|
|
1144
1044
|
nk_accumulate_square_f64_(&total_squared_y, &total_squared_y_compensation, delta_y);
|
|
1145
1045
|
nk_accumulate_square_f64_(&total_squared_z, &total_squared_z_compensation, delta_z);
|
|
1146
1046
|
}
|
|
1147
1047
|
|
|
1148
|
-
total_ax += total_ax_compensation, total_ay += total_ay_compensation, total_az += total_az_compensation;
|
|
1149
|
-
total_bx += total_bx_compensation, total_by += total_by_compensation, total_bz += total_bz_compensation;
|
|
1150
1048
|
total_squared_x += total_squared_x_compensation, total_squared_y += total_squared_y_compensation,
|
|
1151
1049
|
total_squared_z += total_squared_z_compensation;
|
|
1152
|
-
|
|
1153
|
-
nk_f64_t centroid_a_x = total_ax * inv_n, centroid_a_y = total_ay * inv_n, centroid_a_z = total_az * inv_n;
|
|
1154
|
-
nk_f64_t centroid_b_x = total_bx * inv_n, centroid_b_y = total_by * inv_n, centroid_b_z = total_bz * inv_n;
|
|
1155
|
-
|
|
1156
|
-
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
1157
|
-
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
1158
|
-
|
|
1159
|
-
// Compute RMSD using the formula:
|
|
1160
|
-
// RMSD = √(E[(a-b)²] - (ā - b̄)²).
|
|
1161
|
-
nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
|
|
1162
|
-
mean_diff_z = centroid_a_z - centroid_b_z;
|
|
1163
|
-
nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
|
|
1164
|
-
nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
|
|
1165
|
-
|
|
1166
|
-
*result = nk_f64_sqrt_haswell(sum_squared * inv_n - mean_diff_sq);
|
|
1050
|
+
*result = nk_f64_sqrt_haswell((total_squared_x + total_squared_y + total_squared_z) / (nk_f64_t)n);
|
|
1167
1051
|
}
|
|
1168
1052
|
|
|
1169
1053
|
NK_PUBLIC void nk_kabsch_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
|
|
@@ -570,16 +570,10 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
|
|
|
570
570
|
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
|
|
571
571
|
rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
|
|
572
572
|
if (scale) *scale = 1.0f;
|
|
573
|
+
if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
|
|
574
|
+
if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
|
|
573
575
|
|
|
574
|
-
// Fused single-pass: accumulate centroids and squared differences simultaneously.
|
|
575
|
-
// RMSD = √(E[(a−b)²] − (ā − b̄)²)
|
|
576
576
|
v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
|
|
577
|
-
v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
|
|
578
|
-
v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
|
|
579
|
-
v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
|
|
580
|
-
v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
|
|
581
|
-
v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
|
|
582
|
-
v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
|
|
583
577
|
v128_t sum_sq_x_low_f64x2 = zero_f64x2, sum_sq_x_high_f64x2 = zero_f64x2;
|
|
584
578
|
v128_t sum_sq_y_low_f64x2 = zero_f64x2, sum_sq_y_high_f64x2 = zero_f64x2;
|
|
585
579
|
v128_t sum_sq_z_low_f64x2 = zero_f64x2, sum_sq_z_high_f64x2 = zero_f64x2;
|
|
@@ -590,8 +584,7 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
|
|
|
590
584
|
nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
|
|
591
585
|
nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
|
|
592
586
|
|
|
593
|
-
// Promote lower and upper halves to f64
|
|
594
|
-
// f32 cancellation in the single-pass formula RMSD = √(E[(a−b)²] − (ā − b̄)²).
|
|
587
|
+
// Promote lower and upper halves to f64 for precision.
|
|
595
588
|
v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
|
|
596
589
|
v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
|
|
597
590
|
v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
|
|
@@ -605,21 +598,7 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
|
|
|
605
598
|
v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
|
|
606
599
|
v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
|
|
607
600
|
|
|
608
|
-
// Accumulate
|
|
609
|
-
sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2);
|
|
610
|
-
sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
|
|
611
|
-
sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2);
|
|
612
|
-
sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
|
|
613
|
-
sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2);
|
|
614
|
-
sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
|
|
615
|
-
sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2);
|
|
616
|
-
sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
|
|
617
|
-
sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2);
|
|
618
|
-
sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
|
|
619
|
-
sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2);
|
|
620
|
-
sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
|
|
621
|
-
|
|
622
|
-
// Accumulate squared differences in f64 — deltas computed in f64 for precision.
|
|
601
|
+
// Accumulate squared differences in f64.
|
|
623
602
|
v128_t dx_low_f64x2 = wasm_f64x2_sub(a_x_low_f64x2, b_x_low_f64x2);
|
|
624
603
|
v128_t dx_high_f64x2 = wasm_f64x2_sub(a_x_high_f64x2, b_x_high_f64x2);
|
|
625
604
|
v128_t dy_low_f64x2 = wasm_f64x2_sub(a_y_low_f64x2, b_y_low_f64x2);
|
|
@@ -635,12 +614,6 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
|
|
|
635
614
|
sum_sq_z_high_f64x2 = wasm_f64x2_relaxed_madd(dz_high_f64x2, dz_high_f64x2, sum_sq_z_high_f64x2);
|
|
636
615
|
}
|
|
637
616
|
|
|
638
|
-
nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
|
|
639
|
-
nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
|
|
640
|
-
nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
|
|
641
|
-
nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
|
|
642
|
-
nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
|
|
643
|
-
nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
|
|
644
617
|
nk_f64_t sum_sq_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_x_low_f64x2, sum_sq_x_high_f64x2));
|
|
645
618
|
nk_f64_t sum_sq_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_y_low_f64x2, sum_sq_y_high_f64x2));
|
|
646
619
|
nk_f64_t sum_sq_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_z_low_f64x2, sum_sq_z_high_f64x2));
|
|
@@ -649,45 +622,25 @@ NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_
|
|
|
649
622
|
for (; index < n; ++index) {
|
|
650
623
|
nk_f64_t ax = a[index * 3 + 0], ay = a[index * 3 + 1], az = a[index * 3 + 2];
|
|
651
624
|
nk_f64_t bx = b[index * 3 + 0], by = b[index * 3 + 1], bz = b[index * 3 + 2];
|
|
652
|
-
sum_a_x += ax, sum_a_y += ay, sum_a_z += az;
|
|
653
|
-
sum_b_x += bx, sum_b_y += by, sum_b_z += bz;
|
|
654
625
|
nk_f64_t dx = ax - bx, dy = ay - by, dz = az - bz;
|
|
655
626
|
sum_sq_x += dx * dx, sum_sq_y += dy * dy, sum_sq_z += dz * dz;
|
|
656
627
|
}
|
|
657
628
|
|
|
658
|
-
|
|
659
|
-
nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
|
|
660
|
-
centroid_a_z = sum_a_z * inv_points_count;
|
|
661
|
-
nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
|
|
662
|
-
centroid_b_z = sum_b_z * inv_points_count;
|
|
663
|
-
if (a_centroid)
|
|
664
|
-
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
665
|
-
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
666
|
-
if (b_centroid)
|
|
667
|
-
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
668
|
-
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
669
|
-
|
|
670
|
-
nk_f64_t sum_squared = sum_sq_x + sum_sq_y + sum_sq_z;
|
|
671
|
-
nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
|
|
672
|
-
nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
|
|
673
|
-
nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
|
|
674
|
-
nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
|
|
675
|
-
*result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
|
|
629
|
+
*result = nk_f64_sqrt_v128relaxed((sum_sq_x + sum_sq_y + sum_sq_z) / (nk_f64_t)n);
|
|
676
630
|
}
|
|
677
631
|
|
|
678
632
|
NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
|
|
679
633
|
nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
|
|
680
|
-
// RMSD uses identity rotation and scale=1.0
|
|
681
634
|
if (rotation)
|
|
682
635
|
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
|
|
683
636
|
rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
|
|
684
637
|
if (scale) *scale = 1.0;
|
|
638
|
+
if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
|
|
639
|
+
if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
|
|
685
640
|
|
|
686
641
|
v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
|
|
687
642
|
|
|
688
|
-
// Accumulators for
|
|
689
|
-
v128_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
|
|
690
|
-
v128_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
|
|
643
|
+
// Accumulators for squared differences
|
|
691
644
|
v128_t sum_squared_x_f64x2 = zeros_f64x2, sum_squared_y_f64x2 = zeros_f64x2, sum_squared_z_f64x2 = zeros_f64x2;
|
|
692
645
|
|
|
693
646
|
v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
|
|
@@ -698,13 +651,6 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
|
|
|
698
651
|
nk_deinterleave_f64x2_v128relaxed_(a + i * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
|
|
699
652
|
nk_deinterleave_f64x2_v128relaxed_(b + i * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
|
|
700
653
|
|
|
701
|
-
sum_a_x_f64x2 = wasm_f64x2_add(sum_a_x_f64x2, a_x_f64x2);
|
|
702
|
-
sum_a_y_f64x2 = wasm_f64x2_add(sum_a_y_f64x2, a_y_f64x2);
|
|
703
|
-
sum_a_z_f64x2 = wasm_f64x2_add(sum_a_z_f64x2, a_z_f64x2);
|
|
704
|
-
sum_b_x_f64x2 = wasm_f64x2_add(sum_b_x_f64x2, b_x_f64x2);
|
|
705
|
-
sum_b_y_f64x2 = wasm_f64x2_add(sum_b_y_f64x2, b_y_f64x2);
|
|
706
|
-
sum_b_z_f64x2 = wasm_f64x2_add(sum_b_z_f64x2, b_z_f64x2);
|
|
707
|
-
|
|
708
654
|
v128_t delta_x_f64x2 = wasm_f64x2_sub(a_x_f64x2, b_x_f64x2);
|
|
709
655
|
v128_t delta_y_f64x2 = wasm_f64x2_sub(a_y_f64x2, b_y_f64x2);
|
|
710
656
|
v128_t delta_z_f64x2 = wasm_f64x2_sub(a_z_f64x2, b_z_f64x2);
|
|
@@ -715,12 +661,6 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
|
|
|
715
661
|
}
|
|
716
662
|
|
|
717
663
|
// Reduce vectors to scalars.
|
|
718
|
-
nk_f64_t total_ax = nk_reduce_stable_f64x2_v128relaxed_(sum_a_x_f64x2), total_ax_compensation = 0.0;
|
|
719
|
-
nk_f64_t total_ay = nk_reduce_stable_f64x2_v128relaxed_(sum_a_y_f64x2), total_ay_compensation = 0.0;
|
|
720
|
-
nk_f64_t total_az = nk_reduce_stable_f64x2_v128relaxed_(sum_a_z_f64x2), total_az_compensation = 0.0;
|
|
721
|
-
nk_f64_t total_bx = nk_reduce_stable_f64x2_v128relaxed_(sum_b_x_f64x2), total_bx_compensation = 0.0;
|
|
722
|
-
nk_f64_t total_by = nk_reduce_stable_f64x2_v128relaxed_(sum_b_y_f64x2), total_by_compensation = 0.0;
|
|
723
|
-
nk_f64_t total_bz = nk_reduce_stable_f64x2_v128relaxed_(sum_b_z_f64x2), total_bz_compensation = 0.0;
|
|
724
664
|
nk_f64_t total_squared_x = nk_reduce_stable_f64x2_v128relaxed_(sum_squared_x_f64x2),
|
|
725
665
|
total_squared_x_compensation = 0.0;
|
|
726
666
|
nk_f64_t total_squared_y = nk_reduce_stable_f64x2_v128relaxed_(sum_squared_y_f64x2),
|
|
@@ -732,40 +672,16 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
|
|
|
732
672
|
for (; i < n; ++i) {
|
|
733
673
|
nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
|
|
734
674
|
nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
|
|
735
|
-
nk_accumulate_sum_f64_(&total_ax, &total_ax_compensation, ax);
|
|
736
|
-
nk_accumulate_sum_f64_(&total_ay, &total_ay_compensation, ay);
|
|
737
|
-
nk_accumulate_sum_f64_(&total_az, &total_az_compensation, az);
|
|
738
|
-
nk_accumulate_sum_f64_(&total_bx, &total_bx_compensation, bx);
|
|
739
|
-
nk_accumulate_sum_f64_(&total_by, &total_by_compensation, by);
|
|
740
|
-
nk_accumulate_sum_f64_(&total_bz, &total_bz_compensation, bz);
|
|
741
675
|
nk_f64_t delta_x = ax - bx, delta_y = ay - by, delta_z = az - bz;
|
|
742
676
|
nk_accumulate_square_f64_(&total_squared_x, &total_squared_x_compensation, delta_x);
|
|
743
677
|
nk_accumulate_square_f64_(&total_squared_y, &total_squared_y_compensation, delta_y);
|
|
744
678
|
nk_accumulate_square_f64_(&total_squared_z, &total_squared_z_compensation, delta_z);
|
|
745
679
|
}
|
|
746
680
|
|
|
747
|
-
total_ax += total_ax_compensation, total_ay += total_ay_compensation, total_az += total_az_compensation;
|
|
748
|
-
total_bx += total_bx_compensation, total_by += total_by_compensation, total_bz += total_bz_compensation;
|
|
749
681
|
total_squared_x += total_squared_x_compensation, total_squared_y += total_squared_y_compensation,
|
|
750
682
|
total_squared_z += total_squared_z_compensation;
|
|
751
683
|
|
|
752
|
-
|
|
753
|
-
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
754
|
-
nk_f64_t centroid_a_x = total_ax * inv_points_count, centroid_a_y = total_ay * inv_points_count,
|
|
755
|
-
centroid_a_z = total_az * inv_points_count;
|
|
756
|
-
nk_f64_t centroid_b_x = total_bx * inv_points_count, centroid_b_y = total_by * inv_points_count,
|
|
757
|
-
centroid_b_z = total_bz * inv_points_count;
|
|
758
|
-
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
759
|
-
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
760
|
-
|
|
761
|
-
// Compute RMSD
|
|
762
|
-
nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
|
|
763
|
-
nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
|
|
764
|
-
nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
|
|
765
|
-
nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
|
|
766
|
-
nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
|
|
767
|
-
|
|
768
|
-
*result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
|
|
684
|
+
*result = nk_f64_sqrt_v128relaxed((total_squared_x + total_squared_y + total_squared_z) / (nk_f64_t)n);
|
|
769
685
|
}
|
|
770
686
|
|
|
771
687
|
NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
|
package/include/numkong/mesh.h
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Contains:
|
|
8
8
|
*
|
|
9
|
-
* - Root Mean Square Deviation (RMSD)
|
|
9
|
+
* - Root Mean Square Deviation (RMSD) of raw point differences
|
|
10
10
|
* - Kabsch algorithm for optimal rigid body alignment (rotation only)
|
|
11
11
|
* - Umeyama algorithm for similarity transform (rotation + uniform scaling)
|
|
12
12
|
*
|
|
@@ -48,7 +48,7 @@
|
|
|
48
48
|
*
|
|
49
49
|
* @section algorithm_overview Algorithm Overview
|
|
50
50
|
*
|
|
51
|
-
* - RMSD:
|
|
51
|
+
* - RMSD: Raw √(Σ‖aᵢ − bᵢ‖² / n) without centering or alignment. R = identity, scale = 1.0, centroids zeroed
|
|
52
52
|
* - Kabsch: Finds optimal rotation R minimizing ‖R × (a - ā) - (b - b̄)‖. scale = 1.0
|
|
53
53
|
* - Umeyama: Finds optimal rotation R and scale c minimizing ‖c × R × (a - ā) - (b - b̄)‖
|
|
54
54
|
*
|
package/include/numkong/mesh.hpp
CHANGED
|
@@ -354,74 +354,30 @@ void rmsd( //
|
|
|
354
354
|
else if constexpr (std::is_same_v<in_type_, bf16_t> && simd)
|
|
355
355
|
nk_rmsd_bf16(&a->raw_, &b->raw_, n, &a_centroid->raw_, &b_centroid->raw_, &rotation->raw_,
|
|
356
356
|
scale ? &scale->raw_ : nullptr, &metric->raw_);
|
|
357
|
-
// Scalar fallback
|
|
357
|
+
// Scalar fallback: raw √(Σ‖aᵢ − bᵢ‖² / n), no centering
|
|
358
358
|
else {
|
|
359
|
-
// Step 1: Compute centroids
|
|
360
|
-
metric_type_ sum_a_x {}, sum_a_y {}, sum_a_z {};
|
|
361
|
-
metric_type_ sum_b_x {}, sum_b_y {}, sum_b_z {};
|
|
362
|
-
metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
|
|
363
|
-
|
|
364
|
-
for (std::size_t i = 0; i < n; i++) {
|
|
365
|
-
val_a_x = metric_type_(a[i * 3 + 0]);
|
|
366
|
-
val_a_y = metric_type_(a[i * 3 + 1]);
|
|
367
|
-
val_a_z = metric_type_(a[i * 3 + 2]);
|
|
368
|
-
val_b_x = metric_type_(b[i * 3 + 0]);
|
|
369
|
-
val_b_y = metric_type_(b[i * 3 + 1]);
|
|
370
|
-
val_b_z = metric_type_(b[i * 3 + 2]);
|
|
371
|
-
sum_a_x = sum_a_x + val_a_x;
|
|
372
|
-
sum_a_y = sum_a_y + val_a_y;
|
|
373
|
-
sum_a_z = sum_a_z + val_a_z;
|
|
374
|
-
sum_b_x = sum_b_x + val_b_x;
|
|
375
|
-
sum_b_y = sum_b_y + val_b_y;
|
|
376
|
-
sum_b_z = sum_b_z + val_b_z;
|
|
377
|
-
}
|
|
378
|
-
|
|
379
|
-
metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
|
|
380
|
-
metric_type_ centroid_a_x = sum_a_x * inv_n;
|
|
381
|
-
metric_type_ centroid_a_y = sum_a_y * inv_n;
|
|
382
|
-
metric_type_ centroid_a_z = sum_a_z * inv_n;
|
|
383
|
-
metric_type_ centroid_b_x = sum_b_x * inv_n;
|
|
384
|
-
metric_type_ centroid_b_y = sum_b_y * inv_n;
|
|
385
|
-
metric_type_ centroid_b_z = sum_b_z * inv_n;
|
|
386
|
-
|
|
387
|
-
// Step 2: Store centroids if requested
|
|
388
359
|
if (a_centroid)
|
|
389
|
-
a_centroid[0] = transform_type_(centroid_a_x), a_centroid[1] = transform_type_(centroid_a_y),
|
|
390
|
-
a_centroid[2] = transform_type_(centroid_a_z);
|
|
360
|
+
a_centroid[0] = transform_type_(0.0), a_centroid[1] = transform_type_(0.0),
|
|
361
|
+
a_centroid[2] = transform_type_(0.0);
|
|
391
362
|
if (b_centroid)
|
|
392
|
-
b_centroid[0] = transform_type_(centroid_b_x), b_centroid[1] = transform_type_(centroid_b_y),
|
|
393
|
-
b_centroid[2] = transform_type_(centroid_b_z);
|
|
394
|
-
|
|
395
|
-
// Step 3: RMSD uses identity rotation and scale=1.0
|
|
363
|
+
b_centroid[0] = transform_type_(0.0), b_centroid[1] = transform_type_(0.0),
|
|
364
|
+
b_centroid[2] = transform_type_(0.0);
|
|
396
365
|
if (rotation) {
|
|
397
|
-
rotation[0] = transform_type_(1.0);
|
|
398
|
-
rotation[1] = transform_type_(0.0);
|
|
399
|
-
rotation[2] = transform_type_(0.0);
|
|
400
|
-
rotation[3] = transform_type_(0.0);
|
|
401
|
-
rotation[4] = transform_type_(1.0);
|
|
402
|
-
rotation[5] = transform_type_(0.0);
|
|
403
|
-
rotation[6] = transform_type_(0.0);
|
|
404
|
-
rotation[7] = transform_type_(0.0);
|
|
405
|
-
rotation[8] = transform_type_(1.0);
|
|
366
|
+
rotation[0] = transform_type_(1.0), rotation[1] = transform_type_(0.0), rotation[2] = transform_type_(0.0);
|
|
367
|
+
rotation[3] = transform_type_(0.0), rotation[4] = transform_type_(1.0), rotation[5] = transform_type_(0.0);
|
|
368
|
+
rotation[6] = transform_type_(0.0), rotation[7] = transform_type_(0.0), rotation[8] = transform_type_(1.0);
|
|
406
369
|
}
|
|
407
370
|
if (scale) *scale = transform_type_(1.0);
|
|
408
371
|
|
|
409
|
-
// Step 4: Compute RMSD between centered point clouds
|
|
410
372
|
metric_type_ sum_squared {};
|
|
411
373
|
for (std::size_t i = 0; i < n; i++) {
|
|
412
|
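-
val_a_x = metric_type_(a[i * 3 + 0]);
|
|
413
|
-
val_a_y = metric_type_(a[i * 3 + 1]);
|
|
414
|
-
val_a_z = metric_type_(a[i * 3 + 2]);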
|
|
415
|
-
val_b_x = metric_type_(b[i * 3 + 0]);
|
|
416
|
-
val_b_y = metric_type_(b[i * 3 + 1]);
|
|
417
|
-
val_b_z = metric_type_(b[i * 3 + 2]);
|
|
418
|
-
metric_type_ dx = (val_a_x - centroid_a_x) - (val_b_x - centroid_b_x);
|
|
419
|
-
metric_type_ dy = (val_a_y - centroid_a_y) - (val_b_y - centroid_b_y);
|
|
420
|
-
metric_type_ dz = (val_a_z - centroid_a_z) - (val_b_z - centroid_b_z);
|
|
374
|
+
metric_type_ dx = metric_type_(a[i * 3 + 0]) - metric_type_(b[i * 3 + 0]);
|
|
375
|
+
metric_type_ dy = metric_type_(a[i * 3 + 1]) - metric_type_(b[i * 3 + 1]);
|
|
376
|
+
metric_type_ dz = metric_type_(a[i * 3 + 2]) - metric_type_(b[i * 3 + 2]);
|
|
421
377
|
sum_squared = sum_squared + dx * dx + dy * dy + dz * dz;
|
|
422
378
|
}
|
|
423
379
|
|
|
424
|
-
*metric = (sum_squared * inv_n).sqrt();
|
|
380
|
+
*metric = (sum_squared / metric_type_(static_cast<double>(n))).sqrt();
|
|
425
381
|
}
|
|
426
382
|
}
|
|
427
383
|
|
|
@@ -470,18 +426,12 @@ void kabsch( //
|
|
|
470
426
|
metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
|
|
471
427
|
|
|
472
428
|
for (std::size_t i = 0; i < n; i++) {
|
|
473
|
-
val_a_x = metric_type_(a[i * 3 + 0]);
|
|
474
|
-
val_a_y = metric_type_(a[i * 3 + 1]);
|
|
429
|
+
val_a_x = metric_type_(a[i * 3 + 0]), val_a_y = metric_type_(a[i * 3 + 1]),
|
|
475
430
|
val_a_z = metric_type_(a[i * 3 + 2]);
|
|
476
|
-
val_b_x = metric_type_(b[i * 3 + 0]);
|
|
477
|
-
val_b_y = metric_type_(b[i * 3 + 1]);
|
|
431
|
+
val_b_x = metric_type_(b[i * 3 + 0]), val_b_y = metric_type_(b[i * 3 + 1]),
|
|
478
432
|
val_b_z = metric_type_(b[i * 3 + 2]);
|
|
479
|
-
sum_a_x = sum_a_x + val_a_x;
|
|
480
|
-
sum_a_y = sum_a_y + val_a_y;
|
|
481
|
-
sum_a_z = sum_a_z + val_a_z;
|
|
482
|
-
sum_b_x = sum_b_x + val_b_x;
|
|
483
|
-
sum_b_y = sum_b_y + val_b_y;
|
|
484
|
-
sum_b_z = sum_b_z + val_b_z;
|
|
433
|
+
sum_a_x = sum_a_x + val_a_x, sum_a_y = sum_a_y + val_a_y, sum_a_z = sum_a_z + val_a_z;
|
|
434
|
+
sum_b_x = sum_b_x + val_b_x, sum_b_y = sum_b_y + val_b_y, sum_b_z = sum_b_z + val_b_z;
|
|
485
435
|
}
|
|
486
436
|
|
|
487
437
|
metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
|
|
@@ -503,11 +453,9 @@ void kabsch( //
|
|
|
503
453
|
// Step 2: Build 3x3 covariance matrix H = (A - A_bar)^T x (B - B_bar)
|
|
504
454
|
metric_type_ cross_covariance[9] = {};
|
|
505
455
|
for (std::size_t i = 0; i < n; i++) {
|
|
506
|
-
val_a_x = metric_type_(a[i * 3 + 0]) - centroid_a_x;
|
|
507
|
-
val_a_y = metric_type_(a[i * 3 + 1]) - centroid_a_y;
|
|
456
|
+
val_a_x = metric_type_(a[i * 3 + 0]) - centroid_a_x, val_a_y = metric_type_(a[i * 3 + 1]) - centroid_a_y,
|
|
508
457
|
val_a_z = metric_type_(a[i * 3 + 2]) - centroid_a_z;
|
|
509
|
-
val_b_x = metric_type_(b[i * 3 + 0]) - centroid_b_x;
|
|
510
|
-
val_b_y = metric_type_(b[i * 3 + 1]) - centroid_b_y;
|
|
458
|
+
val_b_x = metric_type_(b[i * 3 + 0]) - centroid_b_x, val_b_y = metric_type_(b[i * 3 + 1]) - centroid_b_y,
|
|
511
459
|
val_b_z = metric_type_(b[i * 3 + 2]) - centroid_b_z;
|
|
512
460
|
cross_covariance[0] = cross_covariance[0] + val_a_x * val_b_x;
|
|
513
461
|
cross_covariance[1] = cross_covariance[1] + val_a_x * val_b_y;
|
|
@@ -563,11 +511,11 @@ void kabsch( //
|
|
|
563
511
|
metric_type_ sum_squared {};
|
|
564
512
|
for (std::size_t i = 0; i < n; i++) {
|
|
565
513
|
metric_type_ point_a[3], point_b[3], rotated_point_a[3];
|
|
566
|
-
point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x;
|
|
567
|
-
point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y;
|
|
514
|
+
point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x,
|
|
515
|
+
point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y,
|
|
568
516
|
point_a[2] = metric_type_(a[i * 3 + 2]) - centroid_a_z;
|
|
569
|
-
point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x;
|
|
570
|
-
point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y;
|
|
517
|
+
point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x,
|
|
518
|
+
point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y,
|
|
571
519
|
point_b[2] = metric_type_(b[i * 3 + 2]) - centroid_b_z;
|
|
572
520
|
rotated_point_a[0] = rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] +
|
|
573
521
|
rotation_matrix[2] * point_a[2];
|
|
@@ -628,18 +576,12 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
|
|
|
628
576
|
metric_type_ val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z;
|
|
629
577
|
|
|
630
578
|
for (std::size_t i = 0; i < n; i++) {
|
|
631
|
-
val_a_x = metric_type_(a[i * 3 + 0]);
|
|
632
|
-
val_a_y = metric_type_(a[i * 3 + 1]);
|
|
579
|
+
val_a_x = metric_type_(a[i * 3 + 0]), val_a_y = metric_type_(a[i * 3 + 1]),
|
|
633
580
|
val_a_z = metric_type_(a[i * 3 + 2]);
|
|
634
|
-
val_b_x = metric_type_(b[i * 3 + 0]);
|
|
635
|
-
val_b_y = metric_type_(b[i * 3 + 1]);
|
|
581
|
+
val_b_x = metric_type_(b[i * 3 + 0]), val_b_y = metric_type_(b[i * 3 + 1]),
|
|
636
582
|
val_b_z = metric_type_(b[i * 3 + 2]);
|
|
637
|
-
sum_a_x = sum_a_x + val_a_x;
|
|
638
|
-
sum_a_y = sum_a_y + val_a_y;
|
|
639
|
-
sum_a_z = sum_a_z + val_a_z;
|
|
640
|
-
sum_b_x = sum_b_x + val_b_x;
|
|
641
|
-
sum_b_y = sum_b_y + val_b_y;
|
|
642
|
-
sum_b_z = sum_b_z + val_b_z;
|
|
583
|
+
sum_a_x = sum_a_x + val_a_x, sum_a_y = sum_a_y + val_a_y, sum_a_z = sum_a_z + val_a_z;
|
|
584
|
+
sum_b_x = sum_b_x + val_b_x, sum_b_y = sum_b_y + val_b_y, sum_b_z = sum_b_z + val_b_z;
|
|
643
585
|
}
|
|
644
586
|
|
|
645
587
|
metric_type_ inv_n = metric_type_(1.0) / metric_type_(static_cast<double>(n));
|
|
@@ -650,16 +592,13 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
|
|
|
650
592
|
metric_type_ centroid_b_y = sum_b_y * inv_n;
|
|
651
593
|
metric_type_ centroid_b_z = sum_b_z * inv_n;
|
|
652
594
|
|
|
653
|
-
if (a_centroid)
|
|
654
|
-
a_centroid[0] = transform_type_(centroid_a_x);
|
|
655
|
-
a_centroid[1] = transform_type_(centroid_a_y);
|
|
595
|
+
if (a_centroid)
|
|
596
|
+
a_centroid[0] = transform_type_(centroid_a_x), a_centroid[1] = transform_type_(centroid_a_y),
|
|
656
597
|
a_centroid[2] = transform_type_(centroid_a_z);
|
|
657
|
-
|
|
658
|
-
if (b_centroid)
|
|
659
|
-
b_centroid[0] = transform_type_(centroid_b_x);
|
|
660
|
-
b_centroid[1] = transform_type_(centroid_b_y);
|
|
598
|
+
|
|
599
|
+
if (b_centroid)
|
|
600
|
+
b_centroid[0] = transform_type_(centroid_b_x), b_centroid[1] = transform_type_(centroid_b_y),
|
|
661
601
|
b_centroid[2] = transform_type_(centroid_b_z);
|
|
662
|
-
}
|
|
663
602
|
|
|
664
603
|
// Step 2: Build covariance matrix H and compute variance of A
|
|
665
604
|
metric_type_ cross_covariance[9] = {};
|
|
@@ -733,11 +672,11 @@ void umeyama(in_type_ const *a, in_type_ const *b, std::size_t n, transform_type
|
|
|
733
672
|
metric_type_ sum_squared {};
|
|
734
673
|
for (std::size_t i = 0; i < n; i++) {
|
|
735
674
|
metric_type_ point_a[3], point_b[3], rotated_point_a[3];
|
|
736
|
-
point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x;
|
|
737
|
-
point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y;
|
|
675
|
+
point_a[0] = metric_type_(a[i * 3 + 0]) - centroid_a_x,
|
|
676
|
+
point_a[1] = metric_type_(a[i * 3 + 1]) - centroid_a_y,
|
|
738
677
|
point_a[2] = metric_type_(a[i * 3 + 2]) - centroid_a_z;
|
|
739
|
-
point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x;
|
|
740
|
-
point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y;
|
|
678
|
+
point_b[0] = metric_type_(b[i * 3 + 0]) - centroid_b_x,
|
|
679
|
+
point_b[1] = metric_type_(b[i * 3 + 1]) - centroid_b_y,
|
|
741
680
|
point_b[2] = metric_type_(b[i * 3 + 2]) - centroid_b_z;
|
|
742
681
|
rotated_point_a[0] = scale_factor * (rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] +
|
|
743
682
|
rotation_matrix[2] * point_a[2]);
|