numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -56,14 +56,14 @@ NK_INTERNAL void nk_deinterleave_f32x4_v128relaxed_(nk_f32_t const *ptr, v128_t
|
|
|
56
56
|
v128_t v1_f32x4 = wasm_v128_load(ptr + 4); // y1 z1 x2 y2
|
|
57
57
|
v128_t v2_f32x4 = wasm_v128_load(ptr + 8); // z2 x3 y3 z3
|
|
58
58
|
// x0 x1 x2 x3
|
|
59
|
-
v128_t
|
|
60
|
-
*xs_f32x4 = wasm_i32x4_shuffle(
|
|
59
|
+
v128_t x_partial_f32x4 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 0, 3, 6, 0); // x0 x1 x2 _
|
|
60
|
+
*xs_f32x4 = wasm_i32x4_shuffle(x_partial_f32x4, v2_f32x4, 0, 1, 2, 5); // x0 x1 x2 x3
|
|
61
61
|
// y0 y1 y2 y3
|
|
62
|
-
v128_t
|
|
63
|
-
*ys_f32x4 = wasm_i32x4_shuffle(
|
|
62
|
+
v128_t y_partial_f32x4 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 1, 4, 7, 0); // y0 y1 y2 _
|
|
63
|
+
*ys_f32x4 = wasm_i32x4_shuffle(y_partial_f32x4, v2_f32x4, 0, 1, 2, 6); // y0 y1 y2 y3
|
|
64
64
|
// z0 z1 z2 z3
|
|
65
|
-
v128_t
|
|
66
|
-
*zs_f32x4 = wasm_i32x4_shuffle(
|
|
65
|
+
v128_t z_partial_f32x4 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 2, 5, 0, 0); // z0 z1 _ _
|
|
66
|
+
*zs_f32x4 = wasm_i32x4_shuffle(z_partial_f32x4, v2_f32x4, 0, 1, 4, 7); // z0 z1 z2 z3
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
/* Deinterleave 6 contiguous f64 values (2 XYZ triplets) into separate x, y, z vectors.
|
|
@@ -120,89 +120,27 @@ NK_INTERNAL void nk_accumulate_square_f64x2_v128relaxed_(v128_t *sum_f64x2, v128
|
|
|
120
120
|
*compensation_f64x2 = wasm_f64x2_add(*compensation_f64x2, wasm_f64x2_add(sum_error_f64x2, product_error_f64x2));
|
|
121
121
|
}
|
|
122
122
|
|
|
123
|
-
NK_INTERNAL void
|
|
124
|
-
nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
|
|
125
|
-
nk_f64_t *
|
|
126
|
-
nk_f64_t *
|
|
127
|
-
v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
|
|
128
|
-
v128_t sum_a_x_lower_f64x2 = zero_f64x2, sum_a_x_upper_f64x2 = zero_f64x2;
|
|
129
|
-
v128_t sum_a_y_lower_f64x2 = zero_f64x2, sum_a_y_upper_f64x2 = zero_f64x2;
|
|
130
|
-
v128_t sum_a_z_lower_f64x2 = zero_f64x2, sum_a_z_upper_f64x2 = zero_f64x2;
|
|
131
|
-
v128_t sum_b_x_lower_f64x2 = zero_f64x2, sum_b_x_upper_f64x2 = zero_f64x2;
|
|
132
|
-
v128_t sum_b_y_lower_f64x2 = zero_f64x2, sum_b_y_upper_f64x2 = zero_f64x2;
|
|
133
|
-
v128_t sum_b_z_lower_f64x2 = zero_f64x2, sum_b_z_upper_f64x2 = zero_f64x2;
|
|
134
|
-
nk_size_t index = 0;
|
|
135
|
-
|
|
136
|
-
for (; index + 4 <= n; index += 4) {
|
|
137
|
-
v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
|
|
138
|
-
nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
|
|
139
|
-
nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
|
|
140
|
-
|
|
141
|
-
v128_t a_x_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
|
|
142
|
-
v128_t a_x_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
|
|
143
|
-
v128_t a_y_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
|
|
144
|
-
v128_t a_y_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
|
|
145
|
-
v128_t a_z_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
|
|
146
|
-
v128_t a_z_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
|
|
147
|
-
v128_t b_x_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
|
|
148
|
-
v128_t b_x_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
|
|
149
|
-
v128_t b_y_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
|
|
150
|
-
v128_t b_y_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
|
|
151
|
-
v128_t b_z_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
|
|
152
|
-
v128_t b_z_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
|
|
153
|
-
|
|
154
|
-
sum_a_x_lower_f64x2 = wasm_f64x2_add(sum_a_x_lower_f64x2, a_x_lower_f64x2),
|
|
155
|
-
sum_a_x_upper_f64x2 = wasm_f64x2_add(sum_a_x_upper_f64x2, a_x_upper_f64x2);
|
|
156
|
-
sum_a_y_lower_f64x2 = wasm_f64x2_add(sum_a_y_lower_f64x2, a_y_lower_f64x2),
|
|
157
|
-
sum_a_y_upper_f64x2 = wasm_f64x2_add(sum_a_y_upper_f64x2, a_y_upper_f64x2);
|
|
158
|
-
sum_a_z_lower_f64x2 = wasm_f64x2_add(sum_a_z_lower_f64x2, a_z_lower_f64x2),
|
|
159
|
-
sum_a_z_upper_f64x2 = wasm_f64x2_add(sum_a_z_upper_f64x2, a_z_upper_f64x2);
|
|
160
|
-
sum_b_x_lower_f64x2 = wasm_f64x2_add(sum_b_x_lower_f64x2, b_x_lower_f64x2),
|
|
161
|
-
sum_b_x_upper_f64x2 = wasm_f64x2_add(sum_b_x_upper_f64x2, b_x_upper_f64x2);
|
|
162
|
-
sum_b_y_lower_f64x2 = wasm_f64x2_add(sum_b_y_lower_f64x2, b_y_lower_f64x2),
|
|
163
|
-
sum_b_y_upper_f64x2 = wasm_f64x2_add(sum_b_y_upper_f64x2, b_y_upper_f64x2);
|
|
164
|
-
sum_b_z_lower_f64x2 = wasm_f64x2_add(sum_b_z_lower_f64x2, b_z_lower_f64x2),
|
|
165
|
-
sum_b_z_upper_f64x2 = wasm_f64x2_add(sum_b_z_upper_f64x2, b_z_upper_f64x2);
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_lower_f64x2, sum_a_x_upper_f64x2));
|
|
169
|
-
nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_lower_f64x2, sum_a_y_upper_f64x2));
|
|
170
|
-
nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_lower_f64x2, sum_a_z_upper_f64x2));
|
|
171
|
-
nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_lower_f64x2, sum_b_x_upper_f64x2));
|
|
172
|
-
nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_lower_f64x2, sum_b_y_upper_f64x2));
|
|
173
|
-
nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_lower_f64x2, sum_b_z_upper_f64x2));
|
|
174
|
-
|
|
175
|
-
for (; index < n; ++index) {
|
|
176
|
-
sum_a_x += a[index * 3 + 0], sum_a_y += a[index * 3 + 1], sum_a_z += a[index * 3 + 2];
|
|
177
|
-
sum_b_x += b[index * 3 + 0], sum_b_y += b[index * 3 + 1], sum_b_z += b[index * 3 + 2];
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
|
|
181
|
-
*ca_x = sum_a_x * inv_n, *ca_y = sum_a_y * inv_n, *ca_z = sum_a_z * inv_n;
|
|
182
|
-
*cb_x = sum_b_x * inv_n, *cb_y = sum_b_y * inv_n, *cb_z = sum_b_z * inv_n;
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
186
|
-
nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
|
|
187
|
-
nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
|
|
188
|
-
nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z, //
|
|
123
|
+
NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
124
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
|
|
125
|
+
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
126
|
+
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
189
127
|
nk_f64_t h[9]) {
|
|
190
128
|
v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
|
|
191
|
-
v128_t
|
|
192
|
-
v128_t
|
|
193
|
-
v128_t
|
|
194
|
-
v128_t
|
|
195
|
-
v128_t
|
|
196
|
-
v128_t
|
|
197
|
-
v128_t
|
|
198
|
-
v128_t
|
|
199
|
-
v128_t
|
|
200
|
-
v128_t
|
|
201
|
-
v128_t
|
|
202
|
-
v128_t
|
|
203
|
-
v128_t
|
|
204
|
-
v128_t
|
|
205
|
-
v128_t
|
|
129
|
+
v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
|
|
130
|
+
v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
|
|
131
|
+
v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
|
|
132
|
+
v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
|
|
133
|
+
v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
|
|
134
|
+
v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
|
|
135
|
+
v128_t cross_00_low_f64x2 = zero_f64x2, cross_00_high_f64x2 = zero_f64x2;
|
|
136
|
+
v128_t cross_01_low_f64x2 = zero_f64x2, cross_01_high_f64x2 = zero_f64x2;
|
|
137
|
+
v128_t cross_02_low_f64x2 = zero_f64x2, cross_02_high_f64x2 = zero_f64x2;
|
|
138
|
+
v128_t cross_10_low_f64x2 = zero_f64x2, cross_10_high_f64x2 = zero_f64x2;
|
|
139
|
+
v128_t cross_11_low_f64x2 = zero_f64x2, cross_11_high_f64x2 = zero_f64x2;
|
|
140
|
+
v128_t cross_12_low_f64x2 = zero_f64x2, cross_12_high_f64x2 = zero_f64x2;
|
|
141
|
+
v128_t cross_20_low_f64x2 = zero_f64x2, cross_20_high_f64x2 = zero_f64x2;
|
|
142
|
+
v128_t cross_21_low_f64x2 = zero_f64x2, cross_21_high_f64x2 = zero_f64x2;
|
|
143
|
+
v128_t cross_22_low_f64x2 = zero_f64x2, cross_22_high_f64x2 = zero_f64x2;
|
|
206
144
|
nk_size_t index = 0;
|
|
207
145
|
|
|
208
146
|
for (; index + 4 <= n; index += 4) {
|
|
@@ -210,67 +148,67 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
|
210
148
|
nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
|
|
211
149
|
nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
|
|
212
150
|
|
|
213
|
-
v128_t
|
|
214
|
-
v128_t
|
|
215
|
-
v128_t
|
|
216
|
-
v128_t
|
|
217
|
-
v128_t
|
|
218
|
-
v128_t
|
|
219
|
-
v128_t
|
|
220
|
-
v128_t
|
|
221
|
-
v128_t
|
|
222
|
-
v128_t
|
|
223
|
-
v128_t
|
|
224
|
-
v128_t
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
151
|
+
v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
|
|
152
|
+
v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
|
|
153
|
+
v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
|
|
154
|
+
v128_t a_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
|
|
155
|
+
v128_t a_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
|
|
156
|
+
v128_t a_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
|
|
157
|
+
v128_t b_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
|
|
158
|
+
v128_t b_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
|
|
159
|
+
v128_t b_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
|
|
160
|
+
v128_t b_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
|
|
161
|
+
v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
|
|
162
|
+
v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
|
|
163
|
+
|
|
164
|
+
sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2),
|
|
165
|
+
sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
|
|
166
|
+
sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2),
|
|
167
|
+
sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
|
|
168
|
+
sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2),
|
|
169
|
+
sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
|
|
170
|
+
sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2),
|
|
171
|
+
sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
|
|
172
|
+
sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2),
|
|
173
|
+
sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
|
|
174
|
+
sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2),
|
|
175
|
+
sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
|
|
176
|
+
|
|
177
|
+
cross_00_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_x_low_f64x2, cross_00_low_f64x2),
|
|
178
|
+
cross_00_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_x_high_f64x2, cross_00_high_f64x2);
|
|
179
|
+
cross_01_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_y_low_f64x2, cross_01_low_f64x2),
|
|
180
|
+
cross_01_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_y_high_f64x2, cross_01_high_f64x2);
|
|
181
|
+
cross_02_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_z_low_f64x2, cross_02_low_f64x2),
|
|
182
|
+
cross_02_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_z_high_f64x2, cross_02_high_f64x2);
|
|
183
|
+
cross_10_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_x_low_f64x2, cross_10_low_f64x2),
|
|
184
|
+
cross_10_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_x_high_f64x2, cross_10_high_f64x2);
|
|
185
|
+
cross_11_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_y_low_f64x2, cross_11_low_f64x2),
|
|
186
|
+
cross_11_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_y_high_f64x2, cross_11_high_f64x2);
|
|
187
|
+
cross_12_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_z_low_f64x2, cross_12_low_f64x2),
|
|
188
|
+
cross_12_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_z_high_f64x2, cross_12_high_f64x2);
|
|
189
|
+
cross_20_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_x_low_f64x2, cross_20_low_f64x2),
|
|
190
|
+
cross_20_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_x_high_f64x2, cross_20_high_f64x2);
|
|
191
|
+
cross_21_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_y_low_f64x2, cross_21_low_f64x2),
|
|
192
|
+
cross_21_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_y_high_f64x2, cross_21_high_f64x2);
|
|
193
|
+
cross_22_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_z_low_f64x2, cross_22_low_f64x2),
|
|
194
|
+
cross_22_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_z_high_f64x2, cross_22_high_f64x2);
|
|
257
195
|
}
|
|
258
196
|
|
|
259
|
-
nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
260
|
-
nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
261
|
-
nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
262
|
-
nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
263
|
-
nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
264
|
-
nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
265
|
-
nk_f64_t cross_00 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
266
|
-
nk_f64_t cross_01 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
267
|
-
nk_f64_t cross_02 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
268
|
-
nk_f64_t cross_10 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
269
|
-
nk_f64_t cross_11 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
270
|
-
nk_f64_t cross_12 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
271
|
-
nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
272
|
-
nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
273
|
-
nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
197
|
+
nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
|
|
198
|
+
nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
|
|
199
|
+
nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
|
|
200
|
+
nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
|
|
201
|
+
nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
|
|
202
|
+
nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
|
|
203
|
+
nk_f64_t cross_00 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_00_low_f64x2, cross_00_high_f64x2));
|
|
204
|
+
nk_f64_t cross_01 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_01_low_f64x2, cross_01_high_f64x2));
|
|
205
|
+
nk_f64_t cross_02 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_02_low_f64x2, cross_02_high_f64x2));
|
|
206
|
+
nk_f64_t cross_10 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_10_low_f64x2, cross_10_high_f64x2));
|
|
207
|
+
nk_f64_t cross_11 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_11_low_f64x2, cross_11_high_f64x2));
|
|
208
|
+
nk_f64_t cross_12 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_12_low_f64x2, cross_12_high_f64x2));
|
|
209
|
+
nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_low_f64x2, cross_20_high_f64x2));
|
|
210
|
+
nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_low_f64x2, cross_21_high_f64x2));
|
|
211
|
+
nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_low_f64x2, cross_22_high_f64x2));
|
|
274
212
|
|
|
275
213
|
for (; index < n; ++index) {
|
|
276
214
|
nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
|
|
@@ -282,59 +220,157 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
|
282
220
|
cross_20 += a_z * b_x, cross_21 += a_z * b_y, cross_22 += a_z * b_z;
|
|
283
221
|
}
|
|
284
222
|
|
|
285
|
-
nk_f64_t
|
|
286
|
-
*
|
|
287
|
-
*
|
|
223
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
224
|
+
*centroid_a_x = sum_a_x * inv_points_count, *centroid_a_y = sum_a_y * inv_points_count,
|
|
225
|
+
*centroid_a_z = sum_a_z * inv_points_count;
|
|
226
|
+
*centroid_b_x = sum_b_x * inv_points_count, *centroid_b_y = sum_b_y * inv_points_count,
|
|
227
|
+
*centroid_b_z = sum_b_z * inv_points_count;
|
|
288
228
|
|
|
289
229
|
nk_f64_t n_f64 = (nk_f64_t)n;
|
|
290
|
-
h[0] = cross_00 - n_f64 * (*
|
|
291
|
-
h[
|
|
292
|
-
h[
|
|
293
|
-
h[
|
|
294
|
-
h[
|
|
295
|
-
h[
|
|
230
|
+
h[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
|
|
231
|
+
h[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
|
|
232
|
+
h[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
|
|
233
|
+
h[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
|
|
234
|
+
h[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
|
|
235
|
+
h[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
|
|
236
|
+
h[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
|
|
237
|
+
h[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
|
|
238
|
+
h[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
|
|
296
239
|
}
|
|
297
240
|
|
|
298
241
|
NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_( //
|
|
299
242
|
nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
|
|
300
|
-
nk_f64_t *
|
|
301
|
-
nk_f64_t *
|
|
243
|
+
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
244
|
+
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
302
245
|
nk_f64_t h[9], nk_f64_t *variance_a) {
|
|
303
246
|
v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
|
|
304
|
-
v128_t
|
|
305
|
-
|
|
306
|
-
|
|
247
|
+
v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
|
|
248
|
+
v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
|
|
249
|
+
v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
|
|
250
|
+
v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
|
|
251
|
+
v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
|
|
252
|
+
v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
|
|
253
|
+
v128_t cross_00_low_f64x2 = zero_f64x2, cross_00_high_f64x2 = zero_f64x2;
|
|
254
|
+
v128_t cross_01_low_f64x2 = zero_f64x2, cross_01_high_f64x2 = zero_f64x2;
|
|
255
|
+
v128_t cross_02_low_f64x2 = zero_f64x2, cross_02_high_f64x2 = zero_f64x2;
|
|
256
|
+
v128_t cross_10_low_f64x2 = zero_f64x2, cross_10_high_f64x2 = zero_f64x2;
|
|
257
|
+
v128_t cross_11_low_f64x2 = zero_f64x2, cross_11_high_f64x2 = zero_f64x2;
|
|
258
|
+
v128_t cross_12_low_f64x2 = zero_f64x2, cross_12_high_f64x2 = zero_f64x2;
|
|
259
|
+
v128_t cross_20_low_f64x2 = zero_f64x2, cross_20_high_f64x2 = zero_f64x2;
|
|
260
|
+
v128_t cross_21_low_f64x2 = zero_f64x2, cross_21_high_f64x2 = zero_f64x2;
|
|
261
|
+
v128_t cross_22_low_f64x2 = zero_f64x2, cross_22_high_f64x2 = zero_f64x2;
|
|
262
|
+
v128_t sum_norm_squared_low_f64x2 = zero_f64x2, sum_norm_squared_high_f64x2 = zero_f64x2;
|
|
307
263
|
nk_size_t index = 0;
|
|
264
|
+
|
|
308
265
|
for (; index + 4 <= n; index += 4) {
|
|
309
|
-
v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4;
|
|
266
|
+
v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
|
|
310
267
|
nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
|
|
268
|
+
nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
|
|
311
269
|
|
|
312
|
-
v128_t
|
|
313
|
-
v128_t
|
|
314
|
-
v128_t
|
|
315
|
-
v128_t
|
|
316
|
-
v128_t
|
|
317
|
-
v128_t
|
|
318
|
-
|
|
319
|
-
v128_t
|
|
320
|
-
|
|
321
|
-
v128_t
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
270
|
+
v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
|
|
271
|
+
v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
|
|
272
|
+
v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
|
|
273
|
+
v128_t a_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
|
|
274
|
+
v128_t a_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
|
|
275
|
+
v128_t a_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
|
|
276
|
+
v128_t b_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
|
|
277
|
+
v128_t b_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
|
|
278
|
+
v128_t b_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
|
|
279
|
+
v128_t b_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
|
|
280
|
+
v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
|
|
281
|
+
v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
|
|
282
|
+
|
|
283
|
+
sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2),
|
|
284
|
+
sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
|
|
285
|
+
sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2),
|
|
286
|
+
sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
|
|
287
|
+
sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2),
|
|
288
|
+
sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
|
|
289
|
+
sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2),
|
|
290
|
+
sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
|
|
291
|
+
sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2),
|
|
292
|
+
sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
|
|
293
|
+
sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2),
|
|
294
|
+
sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
|
|
295
|
+
|
|
296
|
+
cross_00_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_x_low_f64x2, cross_00_low_f64x2),
|
|
297
|
+
cross_00_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_x_high_f64x2, cross_00_high_f64x2);
|
|
298
|
+
cross_01_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_y_low_f64x2, cross_01_low_f64x2),
|
|
299
|
+
cross_01_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_y_high_f64x2, cross_01_high_f64x2);
|
|
300
|
+
cross_02_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_z_low_f64x2, cross_02_low_f64x2),
|
|
301
|
+
cross_02_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_z_high_f64x2, cross_02_high_f64x2);
|
|
302
|
+
cross_10_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_x_low_f64x2, cross_10_low_f64x2),
|
|
303
|
+
cross_10_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_x_high_f64x2, cross_10_high_f64x2);
|
|
304
|
+
cross_11_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_y_low_f64x2, cross_11_low_f64x2),
|
|
305
|
+
cross_11_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_y_high_f64x2, cross_11_high_f64x2);
|
|
306
|
+
cross_12_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_z_low_f64x2, cross_12_low_f64x2),
|
|
307
|
+
cross_12_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_z_high_f64x2, cross_12_high_f64x2);
|
|
308
|
+
cross_20_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_x_low_f64x2, cross_20_low_f64x2),
|
|
309
|
+
cross_20_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_x_high_f64x2, cross_20_high_f64x2);
|
|
310
|
+
cross_21_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_y_low_f64x2, cross_21_low_f64x2),
|
|
311
|
+
cross_21_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_y_high_f64x2, cross_21_high_f64x2);
|
|
312
|
+
cross_22_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_z_low_f64x2, cross_22_low_f64x2),
|
|
313
|
+
cross_22_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_z_high_f64x2, cross_22_high_f64x2);
|
|
314
|
+
|
|
315
|
+
// Variance: accumulate ||a||^2.
|
|
316
|
+
v128_t norm_squared_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, a_y_low_f64x2,
|
|
317
|
+
wasm_f64x2_mul(a_x_low_f64x2, a_x_low_f64x2));
|
|
318
|
+
v128_t norm_squared_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, a_y_high_f64x2,
|
|
319
|
+
wasm_f64x2_mul(a_x_high_f64x2, a_x_high_f64x2));
|
|
320
|
+
norm_squared_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, a_z_low_f64x2, norm_squared_low_f64x2);
|
|
321
|
+
norm_squared_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, a_z_high_f64x2, norm_squared_high_f64x2);
|
|
322
|
+
sum_norm_squared_low_f64x2 = wasm_f64x2_add(sum_norm_squared_low_f64x2, norm_squared_low_f64x2);
|
|
323
|
+
sum_norm_squared_high_f64x2 = wasm_f64x2_add(sum_norm_squared_high_f64x2, norm_squared_high_f64x2);
|
|
327
324
|
}
|
|
328
325
|
|
|
326
|
+
nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
|
|
327
|
+
nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
|
|
328
|
+
nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
|
|
329
|
+
nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
|
|
330
|
+
nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
|
|
331
|
+
nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
|
|
332
|
+
nk_f64_t cross_00 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_00_low_f64x2, cross_00_high_f64x2));
|
|
333
|
+
nk_f64_t cross_01 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_01_low_f64x2, cross_01_high_f64x2));
|
|
334
|
+
nk_f64_t cross_02 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_02_low_f64x2, cross_02_high_f64x2));
|
|
335
|
+
nk_f64_t cross_10 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_10_low_f64x2, cross_10_high_f64x2));
|
|
336
|
+
nk_f64_t cross_11 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_11_low_f64x2, cross_11_high_f64x2));
|
|
337
|
+
nk_f64_t cross_12 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_12_low_f64x2, cross_12_high_f64x2));
|
|
338
|
+
nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_low_f64x2, cross_20_high_f64x2));
|
|
339
|
+
nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_low_f64x2, cross_21_high_f64x2));
|
|
340
|
+
nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_low_f64x2, cross_22_high_f64x2));
|
|
329
341
|
nk_f64_t sum_norm_squared = nk_hsum_f64x2_v128relaxed_(
|
|
330
|
-
wasm_f64x2_add(
|
|
342
|
+
wasm_f64x2_add(sum_norm_squared_low_f64x2, sum_norm_squared_high_f64x2));
|
|
343
|
+
|
|
331
344
|
for (; index < n; ++index) {
|
|
332
345
|
nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
|
|
346
|
+
nk_f64_t b_x = b[index * 3 + 0], b_y = b[index * 3 + 1], b_z = b[index * 3 + 2];
|
|
347
|
+
sum_a_x += a_x, sum_a_y += a_y, sum_a_z += a_z;
|
|
348
|
+
sum_b_x += b_x, sum_b_y += b_y, sum_b_z += b_z;
|
|
349
|
+
cross_00 += a_x * b_x, cross_01 += a_x * b_y, cross_02 += a_x * b_z;
|
|
350
|
+
cross_10 += a_y * b_x, cross_11 += a_y * b_y, cross_12 += a_y * b_z;
|
|
351
|
+
cross_20 += a_z * b_x, cross_21 += a_z * b_y, cross_22 += a_z * b_z;
|
|
333
352
|
sum_norm_squared += a_x * a_x + a_y * a_y + a_z * a_z;
|
|
334
353
|
}
|
|
335
354
|
|
|
336
|
-
nk_f64_t
|
|
337
|
-
*
|
|
355
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
356
|
+
*centroid_a_x = sum_a_x * inv_points_count, *centroid_a_y = sum_a_y * inv_points_count,
|
|
357
|
+
*centroid_a_z = sum_a_z * inv_points_count;
|
|
358
|
+
*centroid_b_x = sum_b_x * inv_points_count, *centroid_b_y = sum_b_y * inv_points_count,
|
|
359
|
+
*centroid_b_z = sum_b_z * inv_points_count;
|
|
360
|
+
|
|
361
|
+
nk_f64_t n_f64 = (nk_f64_t)n;
|
|
362
|
+
h[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
|
|
363
|
+
h[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
|
|
364
|
+
h[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
|
|
365
|
+
h[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
|
|
366
|
+
h[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
|
|
367
|
+
h[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
|
|
368
|
+
h[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
|
|
369
|
+
h[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
|
|
370
|
+
h[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
|
|
371
|
+
*variance_a = sum_norm_squared * inv_points_count -
|
|
372
|
+
((*centroid_a_x) * (*centroid_a_x) + (*centroid_a_y) * (*centroid_a_y) +
|
|
373
|
+
(*centroid_a_z) * (*centroid_a_z));
|
|
338
374
|
}
|
|
339
375
|
|
|
340
376
|
NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_v128relaxed_( //
|
|
@@ -352,7 +388,7 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_v128relaxed_( //
|
|
|
352
388
|
v128_t centroid_a_x_f64x2 = wasm_f64x2_splat(centroid_a_x), centroid_a_y_f64x2 = wasm_f64x2_splat(centroid_a_y);
|
|
353
389
|
v128_t centroid_a_z_f64x2 = wasm_f64x2_splat(centroid_a_z), centroid_b_x_f64x2 = wasm_f64x2_splat(centroid_b_x);
|
|
354
390
|
v128_t centroid_b_y_f64x2 = wasm_f64x2_splat(centroid_b_y), centroid_b_z_f64x2 = wasm_f64x2_splat(centroid_b_z);
|
|
355
|
-
v128_t
|
|
391
|
+
v128_t sum_squared_low_f64x2 = wasm_f64x2_splat(0.0), sum_squared_high_f64x2 = wasm_f64x2_splat(0.0);
|
|
356
392
|
nk_size_t index = 0;
|
|
357
393
|
|
|
358
394
|
for (; index + 4 <= n; index += 4) {
|
|
@@ -360,82 +396,79 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_v128relaxed_( //
|
|
|
360
396
|
nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
|
|
361
397
|
nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
|
|
362
398
|
|
|
363
|
-
v128_t
|
|
364
|
-
v128_t
|
|
399
|
+
v128_t centered_a_x_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_x_f32x4), centroid_a_x_f64x2);
|
|
400
|
+
v128_t centered_a_x_high_f64x2 = wasm_f64x2_sub(
|
|
365
401
|
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1)), centroid_a_x_f64x2);
|
|
366
|
-
v128_t
|
|
367
|
-
v128_t
|
|
402
|
+
v128_t centered_a_y_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_y_f32x4), centroid_a_y_f64x2);
|
|
403
|
+
v128_t centered_a_y_high_f64x2 = wasm_f64x2_sub(
|
|
368
404
|
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1)), centroid_a_y_f64x2);
|
|
369
|
-
v128_t
|
|
370
|
-
v128_t
|
|
405
|
+
v128_t centered_a_z_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_z_f32x4), centroid_a_z_f64x2);
|
|
406
|
+
v128_t centered_a_z_high_f64x2 = wasm_f64x2_sub(
|
|
371
407
|
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1)), centroid_a_z_f64x2);
|
|
372
|
-
v128_t
|
|
373
|
-
v128_t
|
|
408
|
+
v128_t centered_b_x_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_x_f32x4), centroid_b_x_f64x2);
|
|
409
|
+
v128_t centered_b_x_high_f64x2 = wasm_f64x2_sub(
|
|
374
410
|
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1)), centroid_b_x_f64x2);
|
|
375
|
-
v128_t
|
|
376
|
-
v128_t
|
|
411
|
+
v128_t centered_b_y_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_y_f32x4), centroid_b_y_f64x2);
|
|
412
|
+
v128_t centered_b_y_high_f64x2 = wasm_f64x2_sub(
|
|
377
413
|
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1)), centroid_b_y_f64x2);
|
|
378
|
-
v128_t
|
|
379
|
-
v128_t
|
|
414
|
+
v128_t centered_b_z_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_z_f32x4), centroid_b_z_f64x2);
|
|
415
|
+
v128_t centered_b_z_high_f64x2 = wasm_f64x2_sub(
|
|
380
416
|
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1)), centroid_b_z_f64x2);
|
|
381
417
|
|
|
382
|
-
v128_t
|
|
383
|
-
scaled_rotation_x_z_f64x2,
|
|
384
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2,
|
|
385
|
-
wasm_f64x2_mul(scaled_rotation_x_x_f64x2,
|
|
386
|
-
v128_t
|
|
387
|
-
scaled_rotation_x_z_f64x2,
|
|
388
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2,
|
|
389
|
-
wasm_f64x2_mul(scaled_rotation_x_x_f64x2,
|
|
390
|
-
v128_t
|
|
391
|
-
scaled_rotation_y_z_f64x2,
|
|
392
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2,
|
|
393
|
-
wasm_f64x2_mul(scaled_rotation_y_x_f64x2,
|
|
394
|
-
v128_t
|
|
395
|
-
scaled_rotation_y_z_f64x2,
|
|
396
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2,
|
|
397
|
-
wasm_f64x2_mul(scaled_rotation_y_x_f64x2,
|
|
398
|
-
v128_t
|
|
399
|
-
scaled_rotation_z_z_f64x2,
|
|
400
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2,
|
|
401
|
-
wasm_f64x2_mul(scaled_rotation_z_x_f64x2,
|
|
402
|
-
v128_t
|
|
403
|
-
scaled_rotation_z_z_f64x2,
|
|
404
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2,
|
|
405
|
-
wasm_f64x2_mul(scaled_rotation_z_x_f64x2,
|
|
406
|
-
|
|
407
|
-
v128_t
|
|
408
|
-
v128_t
|
|
409
|
-
v128_t
|
|
410
|
-
v128_t
|
|
411
|
-
v128_t
|
|
412
|
-
v128_t
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
sum_squared_lower_f64x2);
|
|
424
|
-
sum_squared_upper_f64x2 = wasm_f64x2_relaxed_madd(delta_z_upper_f64x2, delta_z_upper_f64x2,
|
|
425
|
-
sum_squared_upper_f64x2);
|
|
418
|
+
v128_t rotated_a_x_low_f64x2 = wasm_f64x2_relaxed_madd(
|
|
419
|
+
scaled_rotation_x_z_f64x2, centered_a_z_low_f64x2,
|
|
420
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_low_f64x2,
|
|
421
|
+
wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_low_f64x2)));
|
|
422
|
+
v128_t rotated_a_x_high_f64x2 = wasm_f64x2_relaxed_madd(
|
|
423
|
+
scaled_rotation_x_z_f64x2, centered_a_z_high_f64x2,
|
|
424
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_high_f64x2,
|
|
425
|
+
wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_high_f64x2)));
|
|
426
|
+
v128_t rotated_a_y_low_f64x2 = wasm_f64x2_relaxed_madd(
|
|
427
|
+
scaled_rotation_y_z_f64x2, centered_a_z_low_f64x2,
|
|
428
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_low_f64x2,
|
|
429
|
+
wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_low_f64x2)));
|
|
430
|
+
v128_t rotated_a_y_high_f64x2 = wasm_f64x2_relaxed_madd(
|
|
431
|
+
scaled_rotation_y_z_f64x2, centered_a_z_high_f64x2,
|
|
432
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_high_f64x2,
|
|
433
|
+
wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_high_f64x2)));
|
|
434
|
+
v128_t rotated_a_z_low_f64x2 = wasm_f64x2_relaxed_madd(
|
|
435
|
+
scaled_rotation_z_z_f64x2, centered_a_z_low_f64x2,
|
|
436
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_low_f64x2,
|
|
437
|
+
wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_low_f64x2)));
|
|
438
|
+
v128_t rotated_a_z_high_f64x2 = wasm_f64x2_relaxed_madd(
|
|
439
|
+
scaled_rotation_z_z_f64x2, centered_a_z_high_f64x2,
|
|
440
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_high_f64x2,
|
|
441
|
+
wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_high_f64x2)));
|
|
442
|
+
|
|
443
|
+
v128_t delta_x_low_f64x2 = wasm_f64x2_sub(rotated_a_x_low_f64x2, centered_b_x_low_f64x2);
|
|
444
|
+
v128_t delta_x_high_f64x2 = wasm_f64x2_sub(rotated_a_x_high_f64x2, centered_b_x_high_f64x2);
|
|
445
|
+
v128_t delta_y_low_f64x2 = wasm_f64x2_sub(rotated_a_y_low_f64x2, centered_b_y_low_f64x2);
|
|
446
|
+
v128_t delta_y_high_f64x2 = wasm_f64x2_sub(rotated_a_y_high_f64x2, centered_b_y_high_f64x2);
|
|
447
|
+
v128_t delta_z_low_f64x2 = wasm_f64x2_sub(rotated_a_z_low_f64x2, centered_b_z_low_f64x2);
|
|
448
|
+
v128_t delta_z_high_f64x2 = wasm_f64x2_sub(rotated_a_z_high_f64x2, centered_b_z_high_f64x2);
|
|
449
|
+
|
|
450
|
+
sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_x_low_f64x2, delta_x_low_f64x2, sum_squared_low_f64x2);
|
|
451
|
+
sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_x_high_f64x2, delta_x_high_f64x2,
|
|
452
|
+
sum_squared_high_f64x2);
|
|
453
|
+
sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_y_low_f64x2, delta_y_low_f64x2, sum_squared_low_f64x2);
|
|
454
|
+
sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_y_high_f64x2, delta_y_high_f64x2,
|
|
455
|
+
sum_squared_high_f64x2);
|
|
456
|
+
sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_z_low_f64x2, delta_z_low_f64x2, sum_squared_low_f64x2);
|
|
457
|
+
sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_z_high_f64x2, delta_z_high_f64x2,
|
|
458
|
+
sum_squared_high_f64x2);
|
|
426
459
|
}
|
|
427
460
|
|
|
428
|
-
nk_f64_t sum_squared = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(
|
|
461
|
+
nk_f64_t sum_squared = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_squared_low_f64x2, sum_squared_high_f64x2));
|
|
429
462
|
for (; index < n; ++index) {
|
|
430
|
-
nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z)
|
|
437
|
-
|
|
438
|
-
|
|
463
|
+
nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x,
|
|
464
|
+
centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y,
|
|
465
|
+
centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
|
|
466
|
+
nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x,
|
|
467
|
+
centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y,
|
|
468
|
+
centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
|
|
469
|
+
nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z),
|
|
470
|
+
rotated_a_y = scale * (r[3] * centered_a_x + r[4] * centered_a_y + r[5] * centered_a_z),
|
|
471
|
+
rotated_a_z = scale * (r[6] * centered_a_x + r[7] * centered_a_y + r[8] * centered_a_z);
|
|
439
472
|
nk_f64_t delta_x = rotated_a_x - centered_b_x, delta_y = rotated_a_y - centered_b_y,
|
|
440
473
|
delta_z = rotated_a_z - centered_b_z;
|
|
441
474
|
sum_squared += delta_x * delta_x + delta_y * delta_y + delta_z * delta_z;
|
|
@@ -474,35 +507,38 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_v128relaxed_(nk_f64_t const *a, nk_f
|
|
|
474
507
|
|
|
475
508
|
// Main loop: process 2 points per iteration
|
|
476
509
|
for (; j + 2 <= n; j += 2) {
|
|
477
|
-
v128_t
|
|
478
|
-
nk_deinterleave_f64x2_v128relaxed_(a + j * 3, &
|
|
479
|
-
nk_deinterleave_f64x2_v128relaxed_(b + j * 3, &
|
|
510
|
+
v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
|
|
511
|
+
nk_deinterleave_f64x2_v128relaxed_(a + j * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
|
|
512
|
+
nk_deinterleave_f64x2_v128relaxed_(b + j * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
|
|
480
513
|
|
|
481
|
-
v128_t
|
|
482
|
-
v128_t
|
|
483
|
-
v128_t
|
|
484
|
-
v128_t
|
|
485
|
-
v128_t
|
|
486
|
-
v128_t
|
|
514
|
+
v128_t centered_a_x_f64x2 = wasm_f64x2_sub(a_x_f64x2, centroid_a_x_f64x2);
|
|
515
|
+
v128_t centered_a_y_f64x2 = wasm_f64x2_sub(a_y_f64x2, centroid_a_y_f64x2);
|
|
516
|
+
v128_t centered_a_z_f64x2 = wasm_f64x2_sub(a_z_f64x2, centroid_a_z_f64x2);
|
|
517
|
+
v128_t centered_b_x_f64x2 = wasm_f64x2_sub(b_x_f64x2, centroid_b_x_f64x2);
|
|
518
|
+
v128_t centered_b_y_f64x2 = wasm_f64x2_sub(b_y_f64x2, centroid_b_y_f64x2);
|
|
519
|
+
v128_t centered_b_z_f64x2 = wasm_f64x2_sub(b_z_f64x2, centroid_b_z_f64x2);
|
|
487
520
|
|
|
488
521
|
// Rotate and scale: ra = scale * R * pa
|
|
489
|
-
v128_t
|
|
490
|
-
scaled_rotation_x_z_f64x2,
|
|
491
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2,
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
522
|
+
v128_t rotated_a_x_f64x2 = wasm_f64x2_relaxed_madd(
|
|
523
|
+
scaled_rotation_x_z_f64x2, centered_a_z_f64x2,
|
|
524
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_f64x2,
|
|
525
|
+
wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_f64x2)));
|
|
526
|
+
v128_t rotated_a_y_f64x2 = wasm_f64x2_relaxed_madd(
|
|
527
|
+
scaled_rotation_y_z_f64x2, centered_a_z_f64x2,
|
|
528
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_f64x2,
|
|
529
|
+
wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_f64x2)));
|
|
530
|
+
v128_t rotated_a_z_f64x2 = wasm_f64x2_relaxed_madd(
|
|
531
|
+
scaled_rotation_z_z_f64x2, centered_a_z_f64x2,
|
|
532
|
+
wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_f64x2,
|
|
533
|
+
wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_f64x2)));
|
|
534
|
+
|
|
535
|
+
v128_t delta_x_f64x2 = wasm_f64x2_sub(rotated_a_x_f64x2, centered_b_x_f64x2);
|
|
536
|
+
v128_t delta_y_f64x2 = wasm_f64x2_sub(rotated_a_y_f64x2, centered_b_y_f64x2);
|
|
537
|
+
v128_t delta_z_f64x2 = wasm_f64x2_sub(rotated_a_z_f64x2, centered_b_z_f64x2);
|
|
538
|
+
|
|
539
|
+
nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_x_f64x2);
|
|
540
|
+
nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_y_f64x2);
|
|
541
|
+
nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_z_f64x2);
|
|
506
542
|
}
|
|
507
543
|
|
|
508
544
|
nk_f64_t sum_squared = nk_dot_stable_sum_f64x2_v128relaxed_(sum_squared_f64x2, sum_squared_compensation_f64x2);
|
|
@@ -510,20 +546,16 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_v128relaxed_(nk_f64_t const *a, nk_f
|
|
|
510
546
|
|
|
511
547
|
// Scalar tail
|
|
512
548
|
for (; j < n; ++j) {
|
|
513
|
-
nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x
|
|
514
|
-
|
|
515
|
-
nk_f64_t
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
nk_f64_t
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
nk_f64_t
|
|
523
|
-
|
|
524
|
-
nk_f64_t delta_x = ra_x - pb_x;
|
|
525
|
-
nk_f64_t delta_y = ra_y - pb_y;
|
|
526
|
-
nk_f64_t delta_z = ra_z - pb_z;
|
|
549
|
+
nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x, pa_y = a[j * 3 + 1] - centroid_a_y,
|
|
550
|
+
pa_z = a[j * 3 + 2] - centroid_a_z;
|
|
551
|
+
nk_f64_t pb_x = b[j * 3 + 0] - centroid_b_x, pb_y = b[j * 3 + 1] - centroid_b_y,
|
|
552
|
+
pb_z = b[j * 3 + 2] - centroid_b_z;
|
|
553
|
+
|
|
554
|
+
nk_f64_t ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z),
|
|
555
|
+
ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z),
|
|
556
|
+
ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);
|
|
557
|
+
|
|
558
|
+
nk_f64_t delta_x = ra_x - pb_x, delta_y = ra_y - pb_y, delta_z = ra_z - pb_z;
|
|
527
559
|
nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_x);
|
|
528
560
|
nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_y);
|
|
529
561
|
nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_z);
|
|
@@ -534,37 +566,121 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_v128relaxed_(nk_f64_t const *a, nk_f
|
|
|
534
566
|
|
|
535
567
|
NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
|
|
536
568
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0;
|
|
540
|
-
rotation[3] = 0, rotation[4] = 1, rotation[5] = 0;
|
|
569
|
+
if (rotation)
|
|
570
|
+
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
|
|
541
571
|
rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
|
|
542
|
-
}
|
|
543
572
|
if (scale) *scale = 1.0f;
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
573
|
+
|
|
574
|
+
// Fused single-pass: accumulate centroids and squared differences simultaneously.
|
|
575
|
+
// RMSD = √(E[(a−b)²] − (ā − b̄)²)
|
|
576
|
+
v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
|
|
577
|
+
v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
|
|
578
|
+
v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
|
|
579
|
+
v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
|
|
580
|
+
v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
|
|
581
|
+
v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
|
|
582
|
+
v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
|
|
583
|
+
v128_t sum_sq_x_low_f64x2 = zero_f64x2, sum_sq_x_high_f64x2 = zero_f64x2;
|
|
584
|
+
v128_t sum_sq_y_low_f64x2 = zero_f64x2, sum_sq_y_high_f64x2 = zero_f64x2;
|
|
585
|
+
v128_t sum_sq_z_low_f64x2 = zero_f64x2, sum_sq_z_high_f64x2 = zero_f64x2;
|
|
586
|
+
nk_size_t index = 0;
|
|
587
|
+
|
|
588
|
+
for (; index + 4 <= n; index += 4) {
|
|
589
|
+
v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
|
|
590
|
+
nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
|
|
591
|
+
nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
|
|
592
|
+
|
|
593
|
+
// Promote lower and upper halves to f64. Deltas computed in f64 to avoid
|
|
594
|
+
// f32 cancellation in the single-pass formula RMSD = √(E[(a−b)²] − (ā − b̄)²).
|
|
595
|
+
v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
|
|
596
|
+
v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
|
|
597
|
+
v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
|
|
598
|
+
v128_t a_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
|
|
599
|
+
v128_t a_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
|
|
600
|
+
v128_t a_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
|
|
601
|
+
v128_t b_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
|
|
602
|
+
v128_t b_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
|
|
603
|
+
v128_t b_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
|
|
604
|
+
v128_t b_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
|
|
605
|
+
v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
|
|
606
|
+
v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
|
|
607
|
+
|
|
608
|
+
// Accumulate centroids.
|
|
609
|
+
sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2);
|
|
610
|
+
sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
|
|
611
|
+
sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2);
|
|
612
|
+
sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
|
|
613
|
+
sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2);
|
|
614
|
+
sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
|
|
615
|
+
sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2);
|
|
616
|
+
sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
|
|
617
|
+
sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2);
|
|
618
|
+
sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
|
|
619
|
+
sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2);
|
|
620
|
+
sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
|
|
621
|
+
|
|
622
|
+
// Accumulate squared differences in f64 — deltas computed in f64 for precision.
|
|
623
|
+
v128_t dx_low_f64x2 = wasm_f64x2_sub(a_x_low_f64x2, b_x_low_f64x2);
|
|
624
|
+
v128_t dx_high_f64x2 = wasm_f64x2_sub(a_x_high_f64x2, b_x_high_f64x2);
|
|
625
|
+
v128_t dy_low_f64x2 = wasm_f64x2_sub(a_y_low_f64x2, b_y_low_f64x2);
|
|
626
|
+
v128_t dy_high_f64x2 = wasm_f64x2_sub(a_y_high_f64x2, b_y_high_f64x2);
|
|
627
|
+
v128_t dz_low_f64x2 = wasm_f64x2_sub(a_z_low_f64x2, b_z_low_f64x2);
|
|
628
|
+
v128_t dz_high_f64x2 = wasm_f64x2_sub(a_z_high_f64x2, b_z_high_f64x2);
|
|
629
|
+
|
|
630
|
+
sum_sq_x_low_f64x2 = wasm_f64x2_relaxed_madd(dx_low_f64x2, dx_low_f64x2, sum_sq_x_low_f64x2);
|
|
631
|
+
sum_sq_x_high_f64x2 = wasm_f64x2_relaxed_madd(dx_high_f64x2, dx_high_f64x2, sum_sq_x_high_f64x2);
|
|
632
|
+
sum_sq_y_low_f64x2 = wasm_f64x2_relaxed_madd(dy_low_f64x2, dy_low_f64x2, sum_sq_y_low_f64x2);
|
|
633
|
+
sum_sq_y_high_f64x2 = wasm_f64x2_relaxed_madd(dy_high_f64x2, dy_high_f64x2, sum_sq_y_high_f64x2);
|
|
634
|
+
sum_sq_z_low_f64x2 = wasm_f64x2_relaxed_madd(dz_low_f64x2, dz_low_f64x2, sum_sq_z_low_f64x2);
|
|
635
|
+
sum_sq_z_high_f64x2 = wasm_f64x2_relaxed_madd(dz_high_f64x2, dz_high_f64x2, sum_sq_z_high_f64x2);
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
|
|
639
|
+
nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
|
|
640
|
+
nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
|
|
641
|
+
nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
|
|
642
|
+
nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
|
|
643
|
+
nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
|
|
644
|
+
nk_f64_t sum_sq_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_x_low_f64x2, sum_sq_x_high_f64x2));
|
|
645
|
+
nk_f64_t sum_sq_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_y_low_f64x2, sum_sq_y_high_f64x2));
|
|
646
|
+
nk_f64_t sum_sq_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_z_low_f64x2, sum_sq_z_high_f64x2));
|
|
647
|
+
|
|
648
|
+
// Scalar tail.
|
|
649
|
+
for (; index < n; ++index) {
|
|
650
|
+
nk_f64_t ax = a[index * 3 + 0], ay = a[index * 3 + 1], az = a[index * 3 + 2];
|
|
651
|
+
nk_f64_t bx = b[index * 3 + 0], by = b[index * 3 + 1], bz = b[index * 3 + 2];
|
|
652
|
+
sum_a_x += ax, sum_a_y += ay, sum_a_z += az;
|
|
653
|
+
sum_b_x += bx, sum_b_y += by, sum_b_z += bz;
|
|
654
|
+
nk_f64_t dx = ax - bx, dy = ay - by, dz = az - bz;
|
|
655
|
+
sum_sq_x += dx * dx, sum_sq_y += dy * dy, sum_sq_z += dz * dz;
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
659
|
+
nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
|
|
660
|
+
centroid_a_z = sum_a_z * inv_points_count;
|
|
661
|
+
nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
|
|
662
|
+
centroid_b_z = sum_b_z * inv_points_count;
|
|
548
663
|
if (a_centroid)
|
|
549
664
|
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
550
665
|
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
551
666
|
if (b_centroid)
|
|
552
667
|
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
553
668
|
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
669
|
+
|
|
670
|
+
nk_f64_t sum_squared = sum_sq_x + sum_sq_y + sum_sq_z;
|
|
671
|
+
nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
|
|
672
|
+
nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
|
|
673
|
+
nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
|
|
674
|
+
nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
|
|
675
|
+
*result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
|
|
558
676
|
}
|
|
559
677
|
|
|
560
678
|
NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
|
|
561
679
|
nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
|
|
562
680
|
// RMSD uses identity rotation and scale=1.0
|
|
563
|
-
if (rotation)
|
|
564
|
-
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0
|
|
565
|
-
rotation[3] = 0, rotation[4] = 1, rotation[5] = 0;
|
|
681
|
+
if (rotation)
|
|
682
|
+
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
|
|
566
683
|
rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
|
|
567
|
-
}
|
|
568
684
|
if (scale) *scale = 1.0;
|
|
569
685
|
|
|
570
686
|
v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
|
|
@@ -634,9 +750,11 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
|
|
|
634
750
|
total_squared_z += total_squared_z_compensation;
|
|
635
751
|
|
|
636
752
|
// Compute centroids
|
|
637
|
-
nk_f64_t
|
|
638
|
-
nk_f64_t centroid_a_x = total_ax *
|
|
639
|
-
|
|
753
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
754
|
+
nk_f64_t centroid_a_x = total_ax * inv_points_count, centroid_a_y = total_ay * inv_points_count,
|
|
755
|
+
centroid_a_z = total_az * inv_points_count;
|
|
756
|
+
nk_f64_t centroid_b_x = total_bx * inv_points_count, centroid_b_y = total_by * inv_points_count,
|
|
757
|
+
centroid_b_z = total_bz * inv_points_count;
|
|
640
758
|
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
641
759
|
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
642
760
|
|
|
@@ -647,7 +765,7 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
|
|
|
647
765
|
nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
|
|
648
766
|
nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
|
|
649
767
|
|
|
650
|
-
*result = nk_f64_sqrt_v128relaxed(sum_squared *
|
|
768
|
+
*result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
|
|
651
769
|
}
|
|
652
770
|
|
|
653
771
|
NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
|
|
@@ -678,9 +796,7 @@ NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, n
|
|
|
678
796
|
|
|
679
797
|
// Handle reflection: if det(R) < 0, negate third column of V and recompute R.
|
|
680
798
|
if (nk_det3x3_f64_(r) < 0) {
|
|
681
|
-
svd_v[2] = -svd_v[2];
|
|
682
|
-
svd_v[5] = -svd_v[5];
|
|
683
|
-
svd_v[8] = -svd_v[8];
|
|
799
|
+
svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
|
|
684
800
|
r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
|
|
685
801
|
r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
|
|
686
802
|
r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
|
|
@@ -692,9 +808,8 @@ NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, n
|
|
|
692
808
|
r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
|
|
693
809
|
}
|
|
694
810
|
|
|
695
|
-
if (rotation)
|
|
811
|
+
if (rotation)
|
|
696
812
|
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
|
|
697
|
-
}
|
|
698
813
|
if (scale) *scale = 1.0f;
|
|
699
814
|
|
|
700
815
|
*result = nk_f64_sqrt_v128relaxed(nk_transformed_ssd_f32_v128relaxed_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y,
|
|
@@ -790,9 +905,11 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
790
905
|
covariance_z_z += covariance_z_z_compensation;
|
|
791
906
|
|
|
792
907
|
// Compute centroids
|
|
793
|
-
nk_f64_t
|
|
794
|
-
nk_f64_t centroid_a_x = sum_a_x *
|
|
795
|
-
|
|
908
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
909
|
+
nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
|
|
910
|
+
centroid_a_z = sum_a_z * inv_points_count;
|
|
911
|
+
nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
|
|
912
|
+
centroid_b_z = sum_b_z * inv_points_count;
|
|
796
913
|
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
797
914
|
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
798
915
|
|
|
@@ -818,9 +935,7 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
818
935
|
|
|
819
936
|
// Handle reflection: if det(R) < 0, negate third column of V and recompute R
|
|
820
937
|
if (nk_det3x3_f64_(r) < 0) {
|
|
821
|
-
svd_v[2] = -svd_v[2];
|
|
822
|
-
svd_v[5] = -svd_v[5];
|
|
823
|
-
svd_v[8] = -svd_v[8];
|
|
938
|
+
svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
|
|
824
939
|
nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
|
|
825
940
|
}
|
|
826
941
|
|
|
@@ -833,7 +948,7 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
833
948
|
// Compute RMSD after optimal rotation
|
|
834
949
|
nk_f64_t sum_squared = nk_transformed_ssd_f64_v128relaxed_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y,
|
|
835
950
|
centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
|
|
836
|
-
*result = nk_f64_sqrt_v128relaxed(sum_squared *
|
|
951
|
+
*result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count);
|
|
837
952
|
}
|
|
838
953
|
|
|
839
954
|
NK_PUBLIC void nk_umeyama_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
|
|
@@ -866,9 +981,7 @@ NK_PUBLIC void nk_umeyama_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b,
|
|
|
866
981
|
|
|
867
982
|
nk_f64_t det = nk_det3x3_f64_(r);
|
|
868
983
|
if (det < 0) {
|
|
869
|
-
svd_v[2] = -svd_v[2];
|
|
870
|
-
svd_v[5] = -svd_v[5];
|
|
871
|
-
svd_v[8] = -svd_v[8];
|
|
984
|
+
svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
|
|
872
985
|
r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
|
|
873
986
|
r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
|
|
874
987
|
r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
|
|
@@ -988,15 +1101,17 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
988
1101
|
sum_sq_a += sum_sq_a_compensation;
|
|
989
1102
|
|
|
990
1103
|
// Compute centroids
|
|
991
|
-
nk_f64_t
|
|
992
|
-
nk_f64_t centroid_a_x = sum_a_x *
|
|
993
|
-
|
|
1104
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
1105
|
+
nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
|
|
1106
|
+
centroid_a_z = sum_a_z * inv_points_count;
|
|
1107
|
+
nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
|
|
1108
|
+
centroid_b_z = sum_b_z * inv_points_count;
|
|
994
1109
|
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
995
1110
|
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
996
1111
|
|
|
997
1112
|
// Compute variance of A (centered)
|
|
998
1113
|
nk_f64_t centroid_sq = centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y + centroid_a_z * centroid_a_z;
|
|
999
|
-
nk_f64_t var_a = sum_sq_a *
|
|
1114
|
+
nk_f64_t var_a = sum_sq_a * inv_points_count - centroid_sq;
|
|
1000
1115
|
|
|
1001
1116
|
// Apply centering correction: H_centered = H - n * centroid_a * centroid_bT
|
|
1002
1117
|
covariance_x_x -= n * centroid_a_x * centroid_b_x;
|
|
@@ -1024,9 +1139,7 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
1024
1139
|
nk_f64_t computed_scale = trace_d_s / (n * var_a);
|
|
1025
1140
|
|
|
1026
1141
|
if (det < 0) {
|
|
1027
|
-
svd_v[2] = -svd_v[2];
|
|
1028
|
-
svd_v[5] = -svd_v[5];
|
|
1029
|
-
svd_v[8] = -svd_v[8];
|
|
1142
|
+
svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
|
|
1030
1143
|
nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
|
|
1031
1144
|
}
|
|
1032
1145
|
|
|
@@ -1037,7 +1150,7 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
1037
1150
|
// Compute RMSD after transformation
|
|
1038
1151
|
nk_f64_t sum_squared = nk_transformed_ssd_f64_v128relaxed_(a, b, n, r, computed_scale, centroid_a_x, centroid_a_y,
|
|
1039
1152
|
centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
|
|
1040
|
-
*result = nk_f64_sqrt_v128relaxed(sum_squared *
|
|
1153
|
+
*result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count);
|
|
1041
1154
|
}
|
|
1042
1155
|
|
|
1043
1156
|
#if defined(__clang__)
|