numkong 7.0.0 → 7.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +239 -122
- package/binding.gyp +25 -491
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
*
|
|
16
16
|
* Fused helpers minimize data passes:
|
|
17
17
|
*
|
|
18
|
-
* -
|
|
18
|
+
* - RMSD: fully fused single-pass (centroids + squared diffs), no separate helper
|
|
19
19
|
* - `nk_centroid_and_cross_covariance_*_rvv_`: centroids + H in one pass (Kabsch)
|
|
20
20
|
* - `nk_centroid_and_cross_covariance_and_variance_*_rvv_`: + variance (Umeyama)
|
|
21
21
|
*
|
|
@@ -89,104 +89,6 @@ NK_INTERNAL void nk_accumulate_product_f64m1_rvv_(vfloat64m1_t *sum_f64m1, vfloa
|
|
|
89
89
|
vector_length);
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
-
/**
|
|
93
|
-
* @brief Compute centroids of two f32 point clouds in a single pass.
|
|
94
|
-
*
|
|
95
|
-
* Reads both clouds simultaneously, accumulating 6 sums (3 per cloud) in f64.
|
|
96
|
-
* Reduces RMSD from 3 passes to 2 (bicentroid + SSD).
|
|
97
|
-
* Uses per-lane `vfwadd_wv` accumulation with deferred `vfredusum` after the loop.
|
|
98
|
-
*/
|
|
99
|
-
NK_INTERNAL void nk_bicentroid_f32_rvv_( //
|
|
100
|
-
nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
|
|
101
|
-
nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
|
|
102
|
-
nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z) {
|
|
103
|
-
nk_size_t vlmax = __riscv_vsetvlmax_e64m2();
|
|
104
|
-
vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
|
|
105
|
-
vfloat64m2_t sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
|
|
106
|
-
vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
|
|
107
|
-
vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
|
|
108
|
-
vfloat64m2_t sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
|
|
109
|
-
vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, vlmax);
|
|
110
|
-
nk_f32_t const *a_ptr = a, *b_ptr = b;
|
|
111
|
-
nk_size_t remaining = n;
|
|
112
|
-
for (nk_size_t vector_length; remaining > 0;
|
|
113
|
-
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
114
|
-
vector_length = __riscv_vsetvl_e32m1(remaining);
|
|
115
|
-
vfloat32m1x3_t a_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(a_ptr, vector_length);
|
|
116
|
-
sum_a_x_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_x_f64m2, sum_a_x_f64m2,
|
|
117
|
-
__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0), vector_length);
|
|
118
|
-
sum_a_y_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_y_f64m2, sum_a_y_f64m2,
|
|
119
|
-
__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1), vector_length);
|
|
120
|
-
sum_a_z_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_z_f64m2, sum_a_z_f64m2,
|
|
121
|
-
__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2), vector_length);
|
|
122
|
-
vfloat32m1x3_t b_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(b_ptr, vector_length);
|
|
123
|
-
sum_b_x_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_x_f64m2, sum_b_x_f64m2,
|
|
124
|
-
__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0), vector_length);
|
|
125
|
-
sum_b_y_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_y_f64m2, sum_b_y_f64m2,
|
|
126
|
-
__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1), vector_length);
|
|
127
|
-
sum_b_z_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_z_f64m2, sum_b_z_f64m2,
|
|
128
|
-
__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2), vector_length);
|
|
129
|
-
}
|
|
130
|
-
vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
|
|
131
|
-
nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
|
|
132
|
-
*ca_x = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, vlmax)) * inv_n;
|
|
133
|
-
*ca_y = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, vlmax)) * inv_n;
|
|
134
|
-
*ca_z = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, vlmax)) * inv_n;
|
|
135
|
-
*cb_x = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, vlmax)) * inv_n;
|
|
136
|
-
*cb_y = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, vlmax)) * inv_n;
|
|
137
|
-
*cb_z = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, vlmax)) * inv_n;
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
/**
|
|
141
|
-
* @brief Compute centroids of two f64 point clouds in a single pass.
|
|
142
|
-
* Uses per-lane `vfadd_vv` accumulation with deferred `vfredusum` after the loop.
|
|
143
|
-
*/
|
|
144
|
-
NK_INTERNAL void nk_bicentroid_f64_rvv_( //
|
|
145
|
-
nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, //
|
|
146
|
-
nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
|
|
147
|
-
nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z) {
|
|
148
|
-
nk_size_t vlmax = __riscv_vsetvlmax_e64m1();
|
|
149
|
-
vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
150
|
-
vfloat64m1_t sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
151
|
-
vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
152
|
-
vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
153
|
-
vfloat64m1_t sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
154
|
-
vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
155
|
-
vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
156
|
-
vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
157
|
-
vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
158
|
-
vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
159
|
-
vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
160
|
-
vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
|
|
161
|
-
nk_f64_t const *a_ptr = a, *b_ptr = b;
|
|
162
|
-
nk_size_t remaining = n;
|
|
163
|
-
for (nk_size_t vector_length; remaining > 0;
|
|
164
|
-
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
165
|
-
vector_length = __riscv_vsetvl_e64m1(remaining);
|
|
166
|
-
vfloat64m1x3_t a_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(a_ptr, vector_length);
|
|
167
|
-
nk_accumulate_sum_f64m1_rvv_(&sum_a_x_f64m1, &compensation_a_x_f64m1,
|
|
168
|
-
__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0), vector_length);
|
|
169
|
-
nk_accumulate_sum_f64m1_rvv_(&sum_a_y_f64m1, &compensation_a_y_f64m1,
|
|
170
|
-
__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1), vector_length);
|
|
171
|
-
nk_accumulate_sum_f64m1_rvv_(&sum_a_z_f64m1, &compensation_a_z_f64m1,
|
|
172
|
-
__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2), vector_length);
|
|
173
|
-
vfloat64m1x3_t b_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(b_ptr, vector_length);
|
|
174
|
-
nk_accumulate_sum_f64m1_rvv_(&sum_b_x_f64m1, &compensation_b_x_f64m1,
|
|
175
|
-
__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0), vector_length);
|
|
176
|
-
nk_accumulate_sum_f64m1_rvv_(&sum_b_y_f64m1, &compensation_b_y_f64m1,
|
|
177
|
-
__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1), vector_length);
|
|
178
|
-
nk_accumulate_sum_f64m1_rvv_(&sum_b_z_f64m1, &compensation_b_z_f64m1,
|
|
179
|
-
__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2), vector_length);
|
|
180
|
-
}
|
|
181
|
-
nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
|
|
182
|
-
*ca_x = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_n;
|
|
183
|
-
*ca_y = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_n;
|
|
184
|
-
*ca_z = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_n;
|
|
185
|
-
*cb_x = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_n;
|
|
186
|
-
*cb_y = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_n;
|
|
187
|
-
*cb_z = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_n;
|
|
188
|
-
}
|
|
189
|
-
|
|
190
92
|
/**
|
|
191
93
|
* @brief Compute centroids and cross-covariance matrix in a single pass (f32).
|
|
192
94
|
*
|
|
@@ -198,27 +100,29 @@ NK_INTERNAL void nk_bicentroid_f64_rvv_( //
|
|
|
198
100
|
* Cross-products use per-lane `vfwmacc_vv` accumulation (vfloat64m2_t) with
|
|
199
101
|
* deferred `vfredusum` after the loop — eliminates 9 reductions per iteration.
|
|
200
102
|
*/
|
|
201
|
-
NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_(
|
|
202
|
-
nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
203
|
-
nk_f64_t *
|
|
204
|
-
nk_f64_t *
|
|
103
|
+
NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
|
|
104
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
|
|
105
|
+
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
106
|
+
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
205
107
|
nk_f64_t h[9]) {
|
|
206
|
-
nk_size_t
|
|
207
|
-
vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0,
|
|
208
|
-
|
|
209
|
-
vfloat64m2_t
|
|
210
|
-
vfloat64m2_t
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
vfloat64m2_t
|
|
214
|
-
|
|
215
|
-
vfloat64m2_t
|
|
216
|
-
|
|
217
|
-
vfloat64m2_t
|
|
218
|
-
|
|
219
|
-
vfloat64m2_t
|
|
108
|
+
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
|
|
109
|
+
vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
110
|
+
sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
111
|
+
vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
112
|
+
vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
113
|
+
sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
114
|
+
vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
115
|
+
vfloat64m2_t cross_00_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
116
|
+
cross_01_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
117
|
+
vfloat64m2_t cross_02_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
118
|
+
cross_10_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
119
|
+
vfloat64m2_t cross_11_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
120
|
+
cross_12_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
121
|
+
vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
122
|
+
cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
123
|
+
vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
220
124
|
nk_f32_t const *a_ptr = a, *b_ptr = b;
|
|
221
|
-
nk_size_t remaining =
|
|
125
|
+
nk_size_t remaining = points_count;
|
|
222
126
|
for (nk_size_t vector_length; remaining > 0;
|
|
223
127
|
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
224
128
|
vector_length = __riscv_vsetvl_e32m1(remaining);
|
|
@@ -248,45 +152,51 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
|
|
|
248
152
|
}
|
|
249
153
|
vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
|
|
250
154
|
// Compute centroids
|
|
251
|
-
nk_f64_t
|
|
252
|
-
nk_f64_t
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
nk_f64_t
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
h[
|
|
279
|
-
n_f64 *
|
|
280
|
-
h[
|
|
281
|
-
n_f64 *
|
|
282
|
-
h[
|
|
283
|
-
n_f64 *
|
|
284
|
-
h[
|
|
285
|
-
n_f64 *
|
|
286
|
-
h[
|
|
287
|
-
n_f64 *
|
|
288
|
-
h[
|
|
289
|
-
n_f64 *
|
|
155
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
156
|
+
nk_f64_t centroid_a_x_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
157
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, max_vector_length)) *
|
|
158
|
+
inv_points_count;
|
|
159
|
+
nk_f64_t centroid_a_y_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
160
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, max_vector_length)) *
|
|
161
|
+
inv_points_count;
|
|
162
|
+
nk_f64_t centroid_a_z_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
163
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, max_vector_length)) *
|
|
164
|
+
inv_points_count;
|
|
165
|
+
nk_f64_t centroid_b_x_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
166
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, max_vector_length)) *
|
|
167
|
+
inv_points_count;
|
|
168
|
+
nk_f64_t centroid_b_y_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
169
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, max_vector_length)) *
|
|
170
|
+
inv_points_count;
|
|
171
|
+
nk_f64_t centroid_b_z_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
172
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, max_vector_length)) *
|
|
173
|
+
inv_points_count;
|
|
174
|
+
*centroid_a_x = centroid_a_x_f64;
|
|
175
|
+
*centroid_a_y = centroid_a_y_f64;
|
|
176
|
+
*centroid_a_z = centroid_a_z_f64;
|
|
177
|
+
*centroid_b_x = centroid_b_x_f64;
|
|
178
|
+
*centroid_b_y = centroid_b_y_f64;
|
|
179
|
+
*centroid_b_z = centroid_b_z_f64;
|
|
180
|
+
// Fix up: H[i][j] = raw[i][j] - points_count * ca[i] * cb[j]
|
|
181
|
+
nk_f64_t n_f64 = (nk_f64_t)points_count;
|
|
182
|
+
h[0] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
|
|
183
|
+
n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
|
|
184
|
+
h[1] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
|
|
185
|
+
n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
|
|
186
|
+
h[2] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
|
|
187
|
+
n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
|
|
188
|
+
h[3] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
|
|
189
|
+
n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
|
|
190
|
+
h[4] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
|
|
191
|
+
n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
|
|
192
|
+
h[5] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
|
|
193
|
+
n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
|
|
194
|
+
h[6] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
|
|
195
|
+
n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
|
|
196
|
+
h[7] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
|
|
197
|
+
n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
|
|
198
|
+
h[8] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
|
|
199
|
+
n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
|
|
290
200
|
}
|
|
291
201
|
|
|
292
202
|
/**
|
|
@@ -295,42 +205,44 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
|
|
|
295
205
|
* Per-lane `vfadd_vv`/`vfmacc_vv` accumulation with deferred `vfredusum` after the loop
|
|
296
206
|
* — eliminates 15 horizontal reductions per iteration.
|
|
297
207
|
*/
|
|
298
|
-
NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_(
|
|
299
|
-
nk_f64_t const *a, nk_f64_t const *b, nk_size_t
|
|
300
|
-
nk_f64_t *
|
|
301
|
-
nk_f64_t *
|
|
208
|
+
NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
|
|
209
|
+
nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
|
|
210
|
+
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
211
|
+
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
302
212
|
nk_f64_t h[9]) {
|
|
303
|
-
nk_size_t
|
|
304
|
-
vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0,
|
|
305
|
-
|
|
306
|
-
vfloat64m1_t
|
|
307
|
-
vfloat64m1_t
|
|
308
|
-
|
|
309
|
-
vfloat64m1_t
|
|
310
|
-
vfloat64m1_t
|
|
311
|
-
vfloat64m1_t
|
|
312
|
-
vfloat64m1_t
|
|
313
|
-
vfloat64m1_t
|
|
314
|
-
vfloat64m1_t
|
|
315
|
-
|
|
316
|
-
vfloat64m1_t
|
|
317
|
-
|
|
318
|
-
vfloat64m1_t
|
|
319
|
-
|
|
320
|
-
vfloat64m1_t
|
|
321
|
-
|
|
322
|
-
vfloat64m1_t
|
|
323
|
-
|
|
324
|
-
vfloat64m1_t
|
|
325
|
-
vfloat64m1_t
|
|
326
|
-
vfloat64m1_t
|
|
327
|
-
vfloat64m1_t
|
|
328
|
-
vfloat64m1_t
|
|
329
|
-
vfloat64m1_t
|
|
330
|
-
vfloat64m1_t
|
|
331
|
-
vfloat64m1_t
|
|
213
|
+
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
|
|
214
|
+
vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
215
|
+
sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
216
|
+
vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
217
|
+
vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
218
|
+
sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
219
|
+
vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
220
|
+
vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
221
|
+
vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
222
|
+
vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
223
|
+
vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
224
|
+
vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
225
|
+
vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
226
|
+
vfloat64m1_t cross_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
227
|
+
cross_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
228
|
+
vfloat64m1_t cross_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
229
|
+
cross_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
230
|
+
vfloat64m1_t cross_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
231
|
+
cross_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
232
|
+
vfloat64m1_t cross_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
233
|
+
cross_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
234
|
+
vfloat64m1_t cross_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
235
|
+
vfloat64m1_t compensation_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
236
|
+
vfloat64m1_t compensation_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
237
|
+
vfloat64m1_t compensation_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
238
|
+
vfloat64m1_t compensation_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
239
|
+
vfloat64m1_t compensation_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
240
|
+
vfloat64m1_t compensation_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
241
|
+
vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
242
|
+
vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
243
|
+
vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
332
244
|
nk_f64_t const *a_ptr = a, *b_ptr = b;
|
|
333
|
-
nk_size_t remaining =
|
|
245
|
+
nk_size_t remaining = points_count;
|
|
334
246
|
for (nk_size_t vector_length; remaining > 0;
|
|
335
247
|
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
336
248
|
vector_length = __riscv_vsetvl_e64m1(remaining);
|
|
@@ -359,29 +271,38 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
|
|
|
359
271
|
nk_accumulate_product_f64m1_rvv_(&cross_22_f64m1, &compensation_22_f64m1, a_z_f64m1, b_z_f64m1, vector_length);
|
|
360
272
|
}
|
|
361
273
|
// Compute centroids.
|
|
362
|
-
nk_f64_t
|
|
363
|
-
nk_f64_t
|
|
364
|
-
nk_f64_t
|
|
365
|
-
nk_f64_t
|
|
366
|
-
nk_f64_t
|
|
367
|
-
nk_f64_t
|
|
368
|
-
nk_f64_t
|
|
369
|
-
*
|
|
370
|
-
*
|
|
371
|
-
*
|
|
372
|
-
*
|
|
373
|
-
*
|
|
374
|
-
*
|
|
375
|
-
nk_f64_t n_f64 = (nk_f64_t)
|
|
376
|
-
h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
|
|
377
|
-
|
|
378
|
-
h[
|
|
379
|
-
|
|
380
|
-
h[
|
|
381
|
-
|
|
382
|
-
h[
|
|
383
|
-
|
|
384
|
-
h[
|
|
274
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
275
|
+
nk_f64_t centroid_a_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_points_count;
|
|
276
|
+
nk_f64_t centroid_a_y_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_points_count;
|
|
277
|
+
nk_f64_t centroid_a_z_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_points_count;
|
|
278
|
+
nk_f64_t centroid_b_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_points_count;
|
|
279
|
+
nk_f64_t centroid_b_y_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_points_count;
|
|
280
|
+
nk_f64_t centroid_b_z_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_points_count;
|
|
281
|
+
*centroid_a_x = centroid_a_x_f64;
|
|
282
|
+
*centroid_a_y = centroid_a_y_f64;
|
|
283
|
+
*centroid_a_z = centroid_a_z_f64;
|
|
284
|
+
*centroid_b_x = centroid_b_x_f64;
|
|
285
|
+
*centroid_b_y = centroid_b_y_f64;
|
|
286
|
+
*centroid_b_z = centroid_b_z_f64;
|
|
287
|
+
nk_f64_t n_f64 = (nk_f64_t)points_count;
|
|
288
|
+
h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
|
|
289
|
+
n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
|
|
290
|
+
h[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
|
|
291
|
+
n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
|
|
292
|
+
h[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
|
|
293
|
+
n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
|
|
294
|
+
h[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
|
|
295
|
+
n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
|
|
296
|
+
h[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
|
|
297
|
+
n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
|
|
298
|
+
h[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
|
|
299
|
+
n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
|
|
300
|
+
h[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
|
|
301
|
+
n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
|
|
302
|
+
h[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
|
|
303
|
+
n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
|
|
304
|
+
h[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
|
|
305
|
+
n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
|
|
385
306
|
}
|
|
386
307
|
|
|
387
308
|
/**
|
|
@@ -394,28 +315,30 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
|
|
|
394
315
|
* Cross-products use per-lane `vfwmacc_vv` accumulation (vfloat64m2_t) with
|
|
395
316
|
* deferred `vfredusum` after the loop — eliminates 9 reductions per iteration.
|
|
396
317
|
*/
|
|
397
|
-
NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_(
|
|
398
|
-
nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
399
|
-
nk_f64_t *
|
|
400
|
-
nk_f64_t *
|
|
318
|
+
NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
|
|
319
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
|
|
320
|
+
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
321
|
+
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
401
322
|
nk_f64_t h[9], nk_f64_t *variance_a) {
|
|
402
|
-
nk_size_t
|
|
403
|
-
vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0,
|
|
404
|
-
|
|
405
|
-
vfloat64m2_t
|
|
406
|
-
vfloat64m2_t
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
vfloat64m2_t
|
|
410
|
-
|
|
411
|
-
vfloat64m2_t
|
|
412
|
-
|
|
413
|
-
vfloat64m2_t
|
|
414
|
-
|
|
415
|
-
vfloat64m2_t
|
|
416
|
-
|
|
323
|
+
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
|
|
324
|
+
vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
325
|
+
sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
326
|
+
vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
327
|
+
vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
328
|
+
sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
329
|
+
vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
330
|
+
vfloat64m2_t cross_00_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
331
|
+
cross_01_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
332
|
+
vfloat64m2_t cross_02_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
333
|
+
cross_10_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
334
|
+
vfloat64m2_t cross_11_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
335
|
+
cross_12_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
336
|
+
vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
337
|
+
cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
338
|
+
vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
339
|
+
vfloat64m2_t sum_norm_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
417
340
|
nk_f32_t const *a_ptr = a, *b_ptr = b;
|
|
418
|
-
nk_size_t remaining =
|
|
341
|
+
nk_size_t remaining = points_count;
|
|
419
342
|
for (nk_size_t vector_length; remaining > 0;
|
|
420
343
|
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
421
344
|
vector_length = __riscv_vsetvl_e32m1(remaining);
|
|
@@ -450,49 +373,56 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
|
|
|
450
373
|
norm_squared_f64m2, vector_length);
|
|
451
374
|
}
|
|
452
375
|
vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
|
|
453
|
-
nk_f64_t
|
|
454
|
-
nk_f64_t
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
nk_f64_t
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
h[
|
|
480
|
-
n_f64 *
|
|
481
|
-
h[
|
|
482
|
-
n_f64 *
|
|
483
|
-
h[
|
|
484
|
-
n_f64 *
|
|
485
|
-
h[
|
|
486
|
-
n_f64 *
|
|
487
|
-
h[
|
|
488
|
-
n_f64 *
|
|
489
|
-
h[
|
|
490
|
-
n_f64 *
|
|
491
|
-
|
|
376
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
377
|
+
nk_f64_t centroid_a_x_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
378
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, max_vector_length)) *
|
|
379
|
+
inv_points_count;
|
|
380
|
+
nk_f64_t centroid_a_y_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
381
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, max_vector_length)) *
|
|
382
|
+
inv_points_count;
|
|
383
|
+
nk_f64_t centroid_a_z_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
384
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, max_vector_length)) *
|
|
385
|
+
inv_points_count;
|
|
386
|
+
nk_f64_t centroid_b_x_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
387
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, max_vector_length)) *
|
|
388
|
+
inv_points_count;
|
|
389
|
+
nk_f64_t centroid_b_y_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
390
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, max_vector_length)) *
|
|
391
|
+
inv_points_count;
|
|
392
|
+
nk_f64_t centroid_b_z_f64 = __riscv_vfmv_f_s_f64m1_f64(
|
|
393
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, max_vector_length)) *
|
|
394
|
+
inv_points_count;
|
|
395
|
+
*centroid_a_x = centroid_a_x_f64;
|
|
396
|
+
*centroid_a_y = centroid_a_y_f64;
|
|
397
|
+
*centroid_a_z = centroid_a_z_f64;
|
|
398
|
+
*centroid_b_x = centroid_b_x_f64;
|
|
399
|
+
*centroid_b_y = centroid_b_y_f64;
|
|
400
|
+
*centroid_b_z = centroid_b_z_f64;
|
|
401
|
+
nk_f64_t n_f64 = (nk_f64_t)points_count;
|
|
402
|
+
h[0] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
|
|
403
|
+
n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
|
|
404
|
+
h[1] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
|
|
405
|
+
n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
|
|
406
|
+
h[2] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
|
|
407
|
+
n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
|
|
408
|
+
h[3] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
|
|
409
|
+
n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
|
|
410
|
+
h[4] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
|
|
411
|
+
n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
|
|
412
|
+
h[5] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
|
|
413
|
+
n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
|
|
414
|
+
h[6] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
|
|
415
|
+
n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
|
|
416
|
+
h[7] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
|
|
417
|
+
n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
|
|
418
|
+
h[8] = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
|
|
419
|
+
n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
|
|
420
|
+
// variance_a = (1/points_count) * (Σ ||a[i]||² - points_count * ||ca||²)
|
|
492
421
|
*variance_a = __riscv_vfmv_f_s_f64m1_f64(
|
|
493
|
-
__riscv_vfredusum_vs_f64m2_f64m1(sum_norm_squared_f64m2, zero_f64m1,
|
|
494
|
-
|
|
495
|
-
(
|
|
422
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_norm_squared_f64m2, zero_f64m1, max_vector_length)) *
|
|
423
|
+
inv_points_count -
|
|
424
|
+
(centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
|
|
425
|
+
centroid_a_z_f64 * centroid_a_z_f64);
|
|
496
426
|
}
|
|
497
427
|
|
|
498
428
|
/**
|
|
@@ -501,44 +431,46 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
|
|
|
501
431
|
* Per-lane `vfadd_vv`/`vfmacc_vv` accumulation with deferred `vfredusum` after the loop
|
|
502
432
|
* — eliminates 16 horizontal reductions per iteration.
|
|
503
433
|
*/
|
|
504
|
-
NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_(
|
|
505
|
-
nk_f64_t const *a, nk_f64_t const *b, nk_size_t
|
|
506
|
-
nk_f64_t *
|
|
507
|
-
nk_f64_t *
|
|
434
|
+
NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
|
|
435
|
+
nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
|
|
436
|
+
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
437
|
+
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
508
438
|
nk_f64_t h[9], nk_f64_t *variance_a) {
|
|
509
|
-
nk_size_t
|
|
510
|
-
vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0,
|
|
511
|
-
|
|
512
|
-
vfloat64m1_t
|
|
513
|
-
vfloat64m1_t
|
|
514
|
-
|
|
515
|
-
vfloat64m1_t
|
|
516
|
-
vfloat64m1_t
|
|
517
|
-
vfloat64m1_t
|
|
518
|
-
vfloat64m1_t
|
|
519
|
-
vfloat64m1_t
|
|
520
|
-
vfloat64m1_t
|
|
521
|
-
|
|
522
|
-
vfloat64m1_t
|
|
523
|
-
|
|
524
|
-
vfloat64m1_t
|
|
525
|
-
|
|
526
|
-
vfloat64m1_t
|
|
527
|
-
|
|
528
|
-
vfloat64m1_t
|
|
529
|
-
|
|
530
|
-
vfloat64m1_t
|
|
531
|
-
vfloat64m1_t
|
|
532
|
-
vfloat64m1_t
|
|
533
|
-
vfloat64m1_t
|
|
534
|
-
vfloat64m1_t
|
|
535
|
-
vfloat64m1_t
|
|
536
|
-
vfloat64m1_t
|
|
537
|
-
vfloat64m1_t
|
|
538
|
-
vfloat64m1_t
|
|
539
|
-
vfloat64m1_t
|
|
439
|
+
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
|
|
440
|
+
vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
441
|
+
sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
442
|
+
vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
443
|
+
vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
444
|
+
sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
445
|
+
vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
446
|
+
vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
447
|
+
vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
448
|
+
vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
449
|
+
vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
450
|
+
vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
451
|
+
vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
452
|
+
vfloat64m1_t cross_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
453
|
+
cross_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
454
|
+
vfloat64m1_t cross_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
455
|
+
cross_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
456
|
+
vfloat64m1_t cross_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
457
|
+
cross_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
458
|
+
vfloat64m1_t cross_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
459
|
+
cross_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
460
|
+
vfloat64m1_t cross_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
461
|
+
vfloat64m1_t compensation_00_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
462
|
+
vfloat64m1_t compensation_01_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
463
|
+
vfloat64m1_t compensation_02_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
464
|
+
vfloat64m1_t compensation_10_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
465
|
+
vfloat64m1_t compensation_11_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
466
|
+
vfloat64m1_t compensation_12_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
467
|
+
vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
468
|
+
vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
469
|
+
vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
470
|
+
vfloat64m1_t sum_norm_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
471
|
+
vfloat64m1_t compensation_norm_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
540
472
|
nk_f64_t const *a_ptr = a, *b_ptr = b;
|
|
541
|
-
nk_size_t remaining =
|
|
473
|
+
nk_size_t remaining = points_count;
|
|
542
474
|
for (nk_size_t vector_length; remaining > 0;
|
|
543
475
|
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
544
476
|
vector_length = __riscv_vsetvl_e64m1(remaining);
|
|
@@ -571,56 +503,70 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
|
|
|
571
503
|
nk_accumulate_sum_f64m1_rvv_(&sum_norm_squared_f64m1, &compensation_norm_squared_f64m1, norm_squared_f64m1,
|
|
572
504
|
vector_length);
|
|
573
505
|
}
|
|
574
|
-
nk_f64_t
|
|
575
|
-
nk_f64_t
|
|
576
|
-
nk_f64_t
|
|
577
|
-
nk_f64_t
|
|
578
|
-
nk_f64_t
|
|
579
|
-
nk_f64_t
|
|
580
|
-
nk_f64_t
|
|
581
|
-
*
|
|
582
|
-
*
|
|
583
|
-
*
|
|
584
|
-
*
|
|
585
|
-
*
|
|
586
|
-
*
|
|
587
|
-
nk_f64_t n_f64 = (nk_f64_t)
|
|
588
|
-
h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
|
|
589
|
-
|
|
590
|
-
h[
|
|
591
|
-
|
|
592
|
-
h[
|
|
593
|
-
|
|
594
|
-
h[
|
|
595
|
-
|
|
596
|
-
h[
|
|
597
|
-
|
|
598
|
-
|
|
506
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
507
|
+
nk_f64_t centroid_a_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_points_count;
|
|
508
|
+
nk_f64_t centroid_a_y_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_points_count;
|
|
509
|
+
nk_f64_t centroid_a_z_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_points_count;
|
|
510
|
+
nk_f64_t centroid_b_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_points_count;
|
|
511
|
+
nk_f64_t centroid_b_y_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_points_count;
|
|
512
|
+
nk_f64_t centroid_b_z_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_points_count;
|
|
513
|
+
*centroid_a_x = centroid_a_x_f64;
|
|
514
|
+
*centroid_a_y = centroid_a_y_f64;
|
|
515
|
+
*centroid_a_z = centroid_a_z_f64;
|
|
516
|
+
*centroid_b_x = centroid_b_x_f64;
|
|
517
|
+
*centroid_b_y = centroid_b_y_f64;
|
|
518
|
+
*centroid_b_z = centroid_b_z_f64;
|
|
519
|
+
nk_f64_t n_f64 = (nk_f64_t)points_count;
|
|
520
|
+
h[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
|
|
521
|
+
n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
|
|
522
|
+
h[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
|
|
523
|
+
n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
|
|
524
|
+
h[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
|
|
525
|
+
n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
|
|
526
|
+
h[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
|
|
527
|
+
n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
|
|
528
|
+
h[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
|
|
529
|
+
n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
|
|
530
|
+
h[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
|
|
531
|
+
n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
|
|
532
|
+
h[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
|
|
533
|
+
n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
|
|
534
|
+
h[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
|
|
535
|
+
n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
|
|
536
|
+
h[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
|
|
537
|
+
n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
|
|
538
|
+
*variance_a = nk_dot_stable_sum_f64m1_rvv_(sum_norm_squared_f64m1, compensation_norm_squared_f64m1) *
|
|
539
|
+
inv_points_count -
|
|
540
|
+
(centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
|
|
541
|
+
centroid_a_z_f64 * centroid_a_z_f64);
|
|
599
542
|
}
|
|
600
543
|
|
|
601
|
-
NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_(
|
|
602
|
-
nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
603
|
-
nk_f64_t const *r, nk_f64_t scale,
|
|
604
|
-
nk_f64_t
|
|
605
|
-
nk_f64_t
|
|
544
|
+
NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_( //
|
|
545
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
|
|
546
|
+
nk_f64_t const *r, nk_f64_t scale, //
|
|
547
|
+
nk_f64_t centroid_a_x, nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, //
|
|
548
|
+
nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
|
|
606
549
|
nk_f64_t scaled_rotation_x_x = scale * r[0], scaled_rotation_x_y = scale * r[1], scaled_rotation_x_z = scale * r[2];
|
|
607
550
|
nk_f64_t scaled_rotation_y_x = scale * r[3], scaled_rotation_y_y = scale * r[4], scaled_rotation_y_z = scale * r[5];
|
|
608
551
|
nk_f64_t scaled_rotation_z_x = scale * r[6], scaled_rotation_z_y = scale * r[7], scaled_rotation_z_z = scale * r[8];
|
|
609
|
-
nk_size_t
|
|
610
|
-
vfloat64m2_t sum_distance_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0,
|
|
552
|
+
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
|
|
553
|
+
vfloat64m2_t sum_distance_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
611
554
|
vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
|
|
612
555
|
nk_f32_t const *a_ptr = a, *b_ptr = b;
|
|
613
|
-
nk_size_t remaining =
|
|
556
|
+
nk_size_t remaining = points_count;
|
|
614
557
|
for (nk_size_t vector_length; remaining > 0;
|
|
615
558
|
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
616
559
|
vector_length = __riscv_vsetvl_e32m1(remaining);
|
|
617
560
|
vfloat32m1x3_t a_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(a_ptr, vector_length);
|
|
618
561
|
vfloat64m2_t centered_a_x_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
619
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0), vector_length),
|
|
562
|
+
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0), vector_length), centroid_a_x,
|
|
563
|
+
vector_length);
|
|
620
564
|
vfloat64m2_t centered_a_y_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
621
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1), vector_length),
|
|
565
|
+
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1), vector_length), centroid_a_y,
|
|
566
|
+
vector_length);
|
|
622
567
|
vfloat64m2_t centered_a_z_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
623
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2), vector_length),
|
|
568
|
+
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2), vector_length), centroid_a_z,
|
|
569
|
+
vector_length);
|
|
624
570
|
vfloat64m2_t rotated_a_x_f64m2 = __riscv_vfmul_vf_f64m2(centered_a_x_f64m2, scaled_rotation_x_x, vector_length);
|
|
625
571
|
rotated_a_x_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_x_f64m2, scaled_rotation_x_y, centered_a_y_f64m2,
|
|
626
572
|
vector_length);
|
|
@@ -638,11 +584,14 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_( //
|
|
|
638
584
|
vector_length);
|
|
639
585
|
vfloat32m1x3_t b_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(b_ptr, vector_length);
|
|
640
586
|
vfloat64m2_t centered_b_x_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
641
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0), vector_length),
|
|
587
|
+
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0), vector_length), centroid_b_x,
|
|
588
|
+
vector_length);
|
|
642
589
|
vfloat64m2_t centered_b_y_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
643
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1), vector_length),
|
|
590
|
+
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1), vector_length), centroid_b_y,
|
|
591
|
+
vector_length);
|
|
644
592
|
vfloat64m2_t centered_b_z_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
645
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2), vector_length),
|
|
593
|
+
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2), vector_length), centroid_b_z,
|
|
594
|
+
vector_length);
|
|
646
595
|
vfloat64m2_t delta_x_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_x_f64m2, centered_b_x_f64m2, vector_length);
|
|
647
596
|
vfloat64m2_t delta_y_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_y_f64m2, centered_b_y_f64m2, vector_length);
|
|
648
597
|
vfloat64m2_t delta_z_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_z_f64m2, centered_b_z_f64m2, vector_length);
|
|
@@ -653,32 +602,33 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_rvv_( //
|
|
|
653
602
|
sum_distance_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_distance_squared_f64m2, delta_z_f64m2,
|
|
654
603
|
delta_z_f64m2, vector_length);
|
|
655
604
|
}
|
|
656
|
-
return __riscv_vfmv_f_s_f64m1_f64(
|
|
605
|
+
return __riscv_vfmv_f_s_f64m1_f64(
|
|
606
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_distance_squared_f64m2, zero_f64m1, max_vector_length));
|
|
657
607
|
}
|
|
658
608
|
|
|
659
|
-
NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_rvv_(
|
|
660
|
-
nk_f64_t const *a, nk_f64_t const *b, nk_size_t
|
|
661
|
-
nk_f64_t const *r, nk_f64_t scale,
|
|
662
|
-
nk_f64_t
|
|
663
|
-
nk_f64_t
|
|
609
|
+
NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_rvv_( //
|
|
610
|
+
nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
|
|
611
|
+
nk_f64_t const *r, nk_f64_t scale, //
|
|
612
|
+
nk_f64_t centroid_a_x, nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, //
|
|
613
|
+
nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
|
|
664
614
|
nk_f64_t scaled_rotation_x_x = scale * r[0], scaled_rotation_x_y = scale * r[1], scaled_rotation_x_z = scale * r[2];
|
|
665
615
|
nk_f64_t scaled_rotation_y_x = scale * r[3], scaled_rotation_y_y = scale * r[4], scaled_rotation_y_z = scale * r[5];
|
|
666
616
|
nk_f64_t scaled_rotation_z_x = scale * r[6], scaled_rotation_z_y = scale * r[7], scaled_rotation_z_z = scale * r[8];
|
|
667
|
-
nk_size_t
|
|
668
|
-
vfloat64m1_t sum_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0,
|
|
669
|
-
vfloat64m1_t compensation_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0,
|
|
617
|
+
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
|
|
618
|
+
vfloat64m1_t sum_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
619
|
+
vfloat64m1_t compensation_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
670
620
|
nk_f64_t const *a_ptr = a, *b_ptr = b;
|
|
671
|
-
nk_size_t remaining =
|
|
621
|
+
nk_size_t remaining = points_count;
|
|
672
622
|
for (nk_size_t vector_length; remaining > 0;
|
|
673
623
|
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
674
624
|
vector_length = __riscv_vsetvl_e64m1(remaining);
|
|
675
625
|
vfloat64m1x3_t a_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(a_ptr, vector_length);
|
|
676
|
-
vfloat64m1_t centered_a_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0),
|
|
677
|
-
vector_length);
|
|
678
|
-
vfloat64m1_t centered_a_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1),
|
|
679
|
-
vector_length);
|
|
680
|
-
vfloat64m1_t centered_a_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2),
|
|
681
|
-
vector_length);
|
|
626
|
+
vfloat64m1_t centered_a_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0),
|
|
627
|
+
centroid_a_x, vector_length);
|
|
628
|
+
vfloat64m1_t centered_a_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1),
|
|
629
|
+
centroid_a_y, vector_length);
|
|
630
|
+
vfloat64m1_t centered_a_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2),
|
|
631
|
+
centroid_a_z, vector_length);
|
|
682
632
|
vfloat64m1_t rotated_a_x_f64m1 = __riscv_vfmul_vf_f64m1(centered_a_x_f64m1, scaled_rotation_x_x, vector_length);
|
|
683
633
|
rotated_a_x_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_x_f64m1, scaled_rotation_x_y, centered_a_y_f64m1,
|
|
684
634
|
vector_length);
|
|
@@ -695,12 +645,12 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_rvv_( //
|
|
|
695
645
|
rotated_a_z_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_z_f64m1, scaled_rotation_z_z, centered_a_z_f64m1,
|
|
696
646
|
vector_length);
|
|
697
647
|
vfloat64m1x3_t b_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(b_ptr, vector_length);
|
|
698
|
-
vfloat64m1_t centered_b_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0),
|
|
699
|
-
vector_length);
|
|
700
|
-
vfloat64m1_t centered_b_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1),
|
|
701
|
-
vector_length);
|
|
702
|
-
vfloat64m1_t centered_b_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2),
|
|
703
|
-
vector_length);
|
|
648
|
+
vfloat64m1_t centered_b_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0),
|
|
649
|
+
centroid_b_x, vector_length);
|
|
650
|
+
vfloat64m1_t centered_b_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1),
|
|
651
|
+
centroid_b_y, vector_length);
|
|
652
|
+
vfloat64m1_t centered_b_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2),
|
|
653
|
+
centroid_b_z, vector_length);
|
|
704
654
|
vfloat64m1_t delta_x_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_x_f64m1, centered_b_x_f64m1, vector_length);
|
|
705
655
|
vfloat64m1_t delta_y_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_y_f64m1, centered_b_y_f64m1, vector_length);
|
|
706
656
|
vfloat64m1_t delta_z_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_z_f64m1, centered_b_z_f64m1, vector_length);
|
|
@@ -745,42 +695,176 @@ NK_INTERNAL void nk_rotation_from_svd_f64_rvv_( //
|
|
|
745
695
|
nk_rotation_from_svd_f64_serial_(svd_u, svd_v, r);
|
|
746
696
|
}
|
|
747
697
|
|
|
748
|
-
NK_PUBLIC void nk_rmsd_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
698
|
+
NK_PUBLIC void nk_rmsd_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
749
699
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
|
|
750
|
-
nk_f64_t identity[9] = {1, 0, 0, 0, 1, 0, 0, 0, 1};
|
|
751
700
|
if (rotation)
|
|
752
|
-
|
|
701
|
+
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
|
|
702
|
+
rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
|
|
753
703
|
if (scale) *scale = 1.0f;
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
704
|
+
|
|
705
|
+
// Fused single-pass: accumulate centroids and squared differences simultaneously.
|
|
706
|
+
// RMSD = √(E[(a−b)²] − (ā − b̄)²)
|
|
707
|
+
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
|
|
708
|
+
vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
709
|
+
vfloat64m2_t sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
710
|
+
vfloat64m2_t sum_a_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
711
|
+
vfloat64m2_t sum_b_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
712
|
+
vfloat64m2_t sum_b_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
713
|
+
vfloat64m2_t sum_b_z_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
714
|
+
vfloat64m2_t sum_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
715
|
+
nk_f32_t const *a_ptr = a, *b_ptr = b;
|
|
716
|
+
nk_size_t remaining = points_count;
|
|
717
|
+
for (nk_size_t vector_length; remaining > 0;
|
|
718
|
+
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
719
|
+
vector_length = __riscv_vsetvl_e32m1(remaining);
|
|
720
|
+
vfloat32m1x3_t a_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(a_ptr, vector_length);
|
|
721
|
+
vfloat32m1_t a_x_f32m1 = __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0);
|
|
722
|
+
vfloat32m1_t a_y_f32m1 = __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1);
|
|
723
|
+
vfloat32m1_t a_z_f32m1 = __riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2);
|
|
724
|
+
vfloat32m1x3_t b_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(b_ptr, vector_length);
|
|
725
|
+
vfloat32m1_t b_x_f32m1 = __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0);
|
|
726
|
+
vfloat32m1_t b_y_f32m1 = __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1);
|
|
727
|
+
vfloat32m1_t b_z_f32m1 = __riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2);
|
|
728
|
+
// Accumulate centroids in f64.
|
|
729
|
+
sum_a_x_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_x_f64m2, sum_a_x_f64m2, a_x_f32m1, vector_length);
|
|
730
|
+
sum_a_y_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_y_f64m2, sum_a_y_f64m2, a_y_f32m1, vector_length);
|
|
731
|
+
sum_a_z_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_a_z_f64m2, sum_a_z_f64m2, a_z_f32m1, vector_length);
|
|
732
|
+
sum_b_x_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_x_f64m2, sum_b_x_f64m2, b_x_f32m1, vector_length);
|
|
733
|
+
sum_b_y_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_y_f64m2, sum_b_y_f64m2, b_y_f32m1, vector_length);
|
|
734
|
+
sum_b_z_f64m2 = __riscv_vfwadd_wv_f64m2_tu(sum_b_z_f64m2, sum_b_z_f64m2, b_z_f32m1, vector_length);
|
|
735
|
+
// Accumulate (a−b)² per component. Widen a,b to f64 before subtracting to avoid f32
|
|
736
|
+
// cancellation in the single-pass formula RMSD = √(E[(a−b)²] − (ā − b̄)²).
|
|
737
|
+
vfloat64m2_t a_x_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(a_x_f32m1, vector_length);
|
|
738
|
+
vfloat64m2_t b_x_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(b_x_f32m1, vector_length);
|
|
739
|
+
vfloat64m2_t a_y_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(a_y_f32m1, vector_length);
|
|
740
|
+
vfloat64m2_t b_y_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(b_y_f32m1, vector_length);
|
|
741
|
+
vfloat64m2_t a_z_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(a_z_f32m1, vector_length);
|
|
742
|
+
vfloat64m2_t b_z_f64m2 = __riscv_vfwcvt_f_f_v_f64m2(b_z_f32m1, vector_length);
|
|
743
|
+
vfloat64m2_t delta_x_f64m2 = __riscv_vfsub_vv_f64m2(a_x_f64m2, b_x_f64m2, vector_length);
|
|
744
|
+
vfloat64m2_t delta_y_f64m2 = __riscv_vfsub_vv_f64m2(a_y_f64m2, b_y_f64m2, vector_length);
|
|
745
|
+
vfloat64m2_t delta_z_f64m2 = __riscv_vfsub_vv_f64m2(a_z_f64m2, b_z_f64m2, vector_length);
|
|
746
|
+
sum_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_squared_f64m2, delta_x_f64m2, delta_x_f64m2, vector_length);
|
|
747
|
+
sum_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_squared_f64m2, delta_y_f64m2, delta_y_f64m2, vector_length);
|
|
748
|
+
sum_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_squared_f64m2, delta_z_f64m2, delta_z_f64m2, vector_length);
|
|
749
|
+
}
|
|
750
|
+
vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
|
|
751
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
752
|
+
nk_f64_t centroid_a_x = __riscv_vfmv_f_s_f64m1_f64(
|
|
753
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_x_f64m2, zero_f64m1, max_vector_length)) *
|
|
754
|
+
inv_points_count;
|
|
755
|
+
nk_f64_t centroid_a_y = __riscv_vfmv_f_s_f64m1_f64(
|
|
756
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_y_f64m2, zero_f64m1, max_vector_length)) *
|
|
757
|
+
inv_points_count;
|
|
758
|
+
nk_f64_t centroid_a_z = __riscv_vfmv_f_s_f64m1_f64(
|
|
759
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_a_z_f64m2, zero_f64m1, max_vector_length)) *
|
|
760
|
+
inv_points_count;
|
|
761
|
+
nk_f64_t centroid_b_x = __riscv_vfmv_f_s_f64m1_f64(
|
|
762
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_x_f64m2, zero_f64m1, max_vector_length)) *
|
|
763
|
+
inv_points_count;
|
|
764
|
+
nk_f64_t centroid_b_y = __riscv_vfmv_f_s_f64m1_f64(
|
|
765
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_y_f64m2, zero_f64m1, max_vector_length)) *
|
|
766
|
+
inv_points_count;
|
|
767
|
+
nk_f64_t centroid_b_z = __riscv_vfmv_f_s_f64m1_f64(
|
|
768
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_b_z_f64m2, zero_f64m1, max_vector_length)) *
|
|
769
|
+
inv_points_count;
|
|
770
|
+
if (a_centroid)
|
|
771
|
+
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
772
|
+
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
773
|
+
if (b_centroid)
|
|
774
|
+
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
775
|
+
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
776
|
+
|
|
777
|
+
nk_f64_t sum_squared = __riscv_vfmv_f_s_f64m1_f64(
|
|
778
|
+
__riscv_vfredusum_vs_f64m2_f64m1(sum_squared_f64m2, zero_f64m1, max_vector_length));
|
|
779
|
+
nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
|
|
780
|
+
mean_diff_z = centroid_a_z - centroid_b_z;
|
|
781
|
+
nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
|
|
782
|
+
*result = nk_f64_sqrt_rvv(sum_squared * inv_points_count - mean_diff_sq);
|
|
760
783
|
}
|
|
761
784
|
|
|
762
|
-
NK_PUBLIC void nk_rmsd_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t
|
|
785
|
+
NK_PUBLIC void nk_rmsd_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
|
|
763
786
|
nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
|
|
764
|
-
nk_f64_t identity[9] = {1, 0, 0, 0, 1, 0, 0, 0, 1};
|
|
765
787
|
if (rotation)
|
|
766
|
-
|
|
788
|
+
rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
|
|
789
|
+
rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
|
|
767
790
|
if (scale) *scale = 1.0;
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
791
|
+
|
|
792
|
+
// Fused single-pass: accumulate centroids and squared differences simultaneously.
|
|
793
|
+
// RMSD = √(E[(a−b)²] − (ā − b̄)²)
|
|
794
|
+
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
|
|
795
|
+
vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
796
|
+
vfloat64m1_t sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
797
|
+
vfloat64m1_t sum_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
798
|
+
vfloat64m1_t sum_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
799
|
+
vfloat64m1_t sum_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
800
|
+
vfloat64m1_t sum_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
801
|
+
vfloat64m1_t compensation_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
802
|
+
vfloat64m1_t compensation_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
803
|
+
vfloat64m1_t compensation_a_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
804
|
+
vfloat64m1_t compensation_b_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
805
|
+
vfloat64m1_t compensation_b_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
806
|
+
vfloat64m1_t compensation_b_z_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
807
|
+
vfloat64m1_t sum_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
808
|
+
vfloat64m1_t compensation_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
809
|
+
nk_f64_t const *a_ptr = a, *b_ptr = b;
|
|
810
|
+
nk_size_t remaining = points_count;
|
|
811
|
+
for (nk_size_t vector_length; remaining > 0;
|
|
812
|
+
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
813
|
+
vector_length = __riscv_vsetvl_e64m1(remaining);
|
|
814
|
+
vfloat64m1x3_t a_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(a_ptr, vector_length);
|
|
815
|
+
vfloat64m1_t a_x_f64m1 = __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0);
|
|
816
|
+
vfloat64m1_t a_y_f64m1 = __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1);
|
|
817
|
+
vfloat64m1_t a_z_f64m1 = __riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2);
|
|
818
|
+
vfloat64m1x3_t b_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(b_ptr, vector_length);
|
|
819
|
+
vfloat64m1_t b_x_f64m1 = __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0);
|
|
820
|
+
vfloat64m1_t b_y_f64m1 = __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1);
|
|
821
|
+
vfloat64m1_t b_z_f64m1 = __riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2);
|
|
822
|
+
// Accumulate centroids with Kahan compensation.
|
|
823
|
+
nk_accumulate_sum_f64m1_rvv_(&sum_a_x_f64m1, &compensation_a_x_f64m1, a_x_f64m1, vector_length);
|
|
824
|
+
nk_accumulate_sum_f64m1_rvv_(&sum_a_y_f64m1, &compensation_a_y_f64m1, a_y_f64m1, vector_length);
|
|
825
|
+
nk_accumulate_sum_f64m1_rvv_(&sum_a_z_f64m1, &compensation_a_z_f64m1, a_z_f64m1, vector_length);
|
|
826
|
+
nk_accumulate_sum_f64m1_rvv_(&sum_b_x_f64m1, &compensation_b_x_f64m1, b_x_f64m1, vector_length);
|
|
827
|
+
nk_accumulate_sum_f64m1_rvv_(&sum_b_y_f64m1, &compensation_b_y_f64m1, b_y_f64m1, vector_length);
|
|
828
|
+
nk_accumulate_sum_f64m1_rvv_(&sum_b_z_f64m1, &compensation_b_z_f64m1, b_z_f64m1, vector_length);
|
|
829
|
+
// Accumulate (a-b)^2 per component.
|
|
830
|
+
vfloat64m1_t delta_x_f64m1 = __riscv_vfsub_vv_f64m1(a_x_f64m1, b_x_f64m1, vector_length);
|
|
831
|
+
vfloat64m1_t delta_y_f64m1 = __riscv_vfsub_vv_f64m1(a_y_f64m1, b_y_f64m1, vector_length);
|
|
832
|
+
vfloat64m1_t delta_z_f64m1 = __riscv_vfsub_vv_f64m1(a_z_f64m1, b_z_f64m1, vector_length);
|
|
833
|
+
vfloat64m1_t dist_sq_f64m1 = __riscv_vfmul_vv_f64m1(delta_x_f64m1, delta_x_f64m1, vector_length);
|
|
834
|
+
dist_sq_f64m1 = __riscv_vfmacc_vv_f64m1(dist_sq_f64m1, delta_y_f64m1, delta_y_f64m1, vector_length);
|
|
835
|
+
dist_sq_f64m1 = __riscv_vfmacc_vv_f64m1(dist_sq_f64m1, delta_z_f64m1, delta_z_f64m1, vector_length);
|
|
836
|
+
nk_accumulate_sum_f64m1_rvv_(&sum_squared_f64m1, &compensation_squared_f64m1, dist_sq_f64m1, vector_length);
|
|
837
|
+
}
|
|
838
|
+
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
839
|
+
nk_f64_t centroid_a_x = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_points_count;
|
|
840
|
+
nk_f64_t centroid_a_y = nk_dot_stable_sum_f64m1_rvv_(sum_a_y_f64m1, compensation_a_y_f64m1) * inv_points_count;
|
|
841
|
+
nk_f64_t centroid_a_z = nk_dot_stable_sum_f64m1_rvv_(sum_a_z_f64m1, compensation_a_z_f64m1) * inv_points_count;
|
|
842
|
+
nk_f64_t centroid_b_x = nk_dot_stable_sum_f64m1_rvv_(sum_b_x_f64m1, compensation_b_x_f64m1) * inv_points_count;
|
|
843
|
+
nk_f64_t centroid_b_y = nk_dot_stable_sum_f64m1_rvv_(sum_b_y_f64m1, compensation_b_y_f64m1) * inv_points_count;
|
|
844
|
+
nk_f64_t centroid_b_z = nk_dot_stable_sum_f64m1_rvv_(sum_b_z_f64m1, compensation_b_z_f64m1) * inv_points_count;
|
|
845
|
+
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
846
|
+
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
847
|
+
|
|
848
|
+
nk_f64_t sum_squared = nk_dot_stable_sum_f64m1_rvv_(sum_squared_f64m1, compensation_squared_f64m1);
|
|
849
|
+
nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
|
|
850
|
+
mean_diff_z = centroid_a_z - centroid_b_z;
|
|
851
|
+
nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
|
|
852
|
+
*result = nk_f64_sqrt_rvv(sum_squared * inv_points_count - mean_diff_sq);
|
|
774
853
|
}
|
|
775
854
|
|
|
776
|
-
NK_PUBLIC void nk_kabsch_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
855
|
+
NK_PUBLIC void nk_kabsch_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
777
856
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
|
|
778
857
|
if (scale) *scale = 1.0f;
|
|
779
|
-
nk_f64_t
|
|
858
|
+
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
780
859
|
nk_f64_t h[9];
|
|
781
|
-
nk_centroid_and_cross_covariance_f32_rvv_(a, b,
|
|
782
|
-
|
|
783
|
-
if (
|
|
860
|
+
nk_centroid_and_cross_covariance_f32_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z,
|
|
861
|
+
&centroid_b_x, &centroid_b_y, &centroid_b_z, h);
|
|
862
|
+
if (a_centroid)
|
|
863
|
+
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
864
|
+
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
865
|
+
if (b_centroid)
|
|
866
|
+
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
867
|
+
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
784
868
|
nk_f64_t svd_u[9], svd_s[9], svd_v[9];
|
|
785
869
|
nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
|
|
786
870
|
nk_f64_t r[9];
|
|
@@ -791,18 +875,20 @@ NK_PUBLIC void nk_kabsch_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
|
791
875
|
}
|
|
792
876
|
if (rotation)
|
|
793
877
|
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
|
|
794
|
-
nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b,
|
|
795
|
-
|
|
878
|
+
nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, points_count, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
|
|
879
|
+
centroid_b_x, centroid_b_y, centroid_b_z);
|
|
880
|
+
*result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
|
|
796
881
|
}
|
|
797
882
|
|
|
798
|
-
NK_PUBLIC void nk_kabsch_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t
|
|
883
|
+
NK_PUBLIC void nk_kabsch_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
|
|
799
884
|
nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
|
|
800
885
|
if (scale) *scale = 1.0;
|
|
801
|
-
nk_f64_t
|
|
886
|
+
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
802
887
|
nk_f64_t h[9];
|
|
803
|
-
nk_centroid_and_cross_covariance_f64_rvv_(a, b,
|
|
804
|
-
|
|
805
|
-
if (
|
|
888
|
+
nk_centroid_and_cross_covariance_f64_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z,
|
|
889
|
+
&centroid_b_x, &centroid_b_y, &centroid_b_z, h);
|
|
890
|
+
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
891
|
+
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
806
892
|
nk_f64_t svd_u[9], svd_s[9], svd_v[9];
|
|
807
893
|
nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
|
|
808
894
|
nk_f64_t r[9];
|
|
@@ -813,18 +899,24 @@ NK_PUBLIC void nk_kabsch_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t
|
|
|
813
899
|
}
|
|
814
900
|
if (rotation)
|
|
815
901
|
for (int j = 0; j < 9; ++j) rotation[j] = r[j];
|
|
816
|
-
nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b,
|
|
817
|
-
|
|
902
|
+
nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, points_count, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
|
|
903
|
+
centroid_b_x, centroid_b_y, centroid_b_z);
|
|
904
|
+
*result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
|
|
818
905
|
}
|
|
819
906
|
|
|
820
|
-
NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
907
|
+
NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
821
908
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
|
|
822
|
-
nk_f64_t
|
|
909
|
+
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
823
910
|
nk_f64_t h[9], variance_a;
|
|
824
|
-
nk_centroid_and_cross_covariance_and_variance_f32_rvv_(a, b,
|
|
825
|
-
&
|
|
826
|
-
|
|
827
|
-
if (
|
|
911
|
+
nk_centroid_and_cross_covariance_and_variance_f32_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y,
|
|
912
|
+
&centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
|
|
913
|
+
h, &variance_a);
|
|
914
|
+
if (a_centroid)
|
|
915
|
+
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
916
|
+
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
917
|
+
if (b_centroid)
|
|
918
|
+
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
919
|
+
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
828
920
|
nk_f64_t svd_u[9], svd_s[9], svd_v[9];
|
|
829
921
|
nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
|
|
830
922
|
nk_f64_t r[9];
|
|
@@ -832,7 +924,7 @@ NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_
|
|
|
832
924
|
nk_f64_t det = nk_det3x3_f64_(r);
|
|
833
925
|
nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
|
|
834
926
|
nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_s[0], 1.0, svd_s[4], 1.0, svd_s[8], sign_det);
|
|
835
|
-
nk_f64_t scale_factor = trace_ds / ((nk_f64_t)
|
|
927
|
+
nk_f64_t scale_factor = trace_ds / ((nk_f64_t)points_count * variance_a);
|
|
836
928
|
if (scale) *scale = (nk_f32_t)scale_factor;
|
|
837
929
|
if (det < 0) {
|
|
838
930
|
svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
|
|
@@ -840,18 +932,20 @@ NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_
|
|
|
840
932
|
}
|
|
841
933
|
if (rotation)
|
|
842
934
|
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
|
|
843
|
-
nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b,
|
|
844
|
-
|
|
935
|
+
nk_f64_t ssd = nk_transformed_ssd_f32_rvv_(a, b, points_count, r, scale_factor, centroid_a_x, centroid_a_y,
|
|
936
|
+
centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
|
|
937
|
+
*result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
|
|
845
938
|
}
|
|
846
939
|
|
|
847
|
-
NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t
|
|
940
|
+
NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
|
|
848
941
|
nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
|
|
849
|
-
nk_f64_t
|
|
942
|
+
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
850
943
|
nk_f64_t h[9], variance_a;
|
|
851
|
-
nk_centroid_and_cross_covariance_and_variance_f64_rvv_(a, b,
|
|
852
|
-
&
|
|
853
|
-
|
|
854
|
-
if (
|
|
944
|
+
nk_centroid_and_cross_covariance_and_variance_f64_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y,
|
|
945
|
+
&centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
|
|
946
|
+
h, &variance_a);
|
|
947
|
+
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
948
|
+
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
855
949
|
nk_f64_t svd_u[9], svd_s[9], svd_v[9];
|
|
856
950
|
nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
|
|
857
951
|
nk_f64_t r[9];
|
|
@@ -859,7 +953,7 @@ NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_
|
|
|
859
953
|
nk_f64_t det = nk_det3x3_f64_(r);
|
|
860
954
|
nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
|
|
861
955
|
nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_s[0], 1.0, svd_s[4], 1.0, svd_s[8], sign_det);
|
|
862
|
-
nk_f64_t scale_factor = trace_ds / ((nk_f64_t)
|
|
956
|
+
nk_f64_t scale_factor = trace_ds / ((nk_f64_t)points_count * variance_a);
|
|
863
957
|
if (scale) *scale = scale_factor;
|
|
864
958
|
if (det < 0) {
|
|
865
959
|
svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
|
|
@@ -867,38 +961,39 @@ NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_
|
|
|
867
961
|
}
|
|
868
962
|
if (rotation)
|
|
869
963
|
for (int j = 0; j < 9; ++j) rotation[j] = r[j];
|
|
870
|
-
nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b,
|
|
871
|
-
|
|
964
|
+
nk_f64_t ssd = nk_transformed_ssd_f64_rvv_(a, b, points_count, r, scale_factor, centroid_a_x, centroid_a_y,
|
|
965
|
+
centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
|
|
966
|
+
*result = nk_f64_sqrt_rvv(ssd / (nk_f64_t)points_count);
|
|
872
967
|
}
|
|
873
968
|
|
|
874
|
-
NK_PUBLIC void nk_rmsd_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t
|
|
969
|
+
NK_PUBLIC void nk_rmsd_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
875
970
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
|
|
876
|
-
nk_rmsd_f16_serial(a, b,
|
|
971
|
+
nk_rmsd_f16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
|
|
877
972
|
}
|
|
878
973
|
|
|
879
|
-
NK_PUBLIC void nk_kabsch_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t
|
|
974
|
+
NK_PUBLIC void nk_kabsch_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
880
975
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
|
|
881
|
-
nk_kabsch_f16_serial(a, b,
|
|
976
|
+
nk_kabsch_f16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
|
|
882
977
|
}
|
|
883
978
|
|
|
884
|
-
NK_PUBLIC void nk_umeyama_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t
|
|
979
|
+
NK_PUBLIC void nk_umeyama_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
885
980
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
|
|
886
|
-
nk_umeyama_f16_serial(a, b,
|
|
981
|
+
nk_umeyama_f16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
|
|
887
982
|
}
|
|
888
983
|
|
|
889
|
-
NK_PUBLIC void nk_rmsd_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t
|
|
984
|
+
NK_PUBLIC void nk_rmsd_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
890
985
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
|
|
891
|
-
nk_rmsd_bf16_serial(a, b,
|
|
986
|
+
nk_rmsd_bf16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
|
|
892
987
|
}
|
|
893
988
|
|
|
894
|
-
NK_PUBLIC void nk_kabsch_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t
|
|
989
|
+
NK_PUBLIC void nk_kabsch_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
895
990
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
|
|
896
|
-
nk_kabsch_bf16_serial(a, b,
|
|
991
|
+
nk_kabsch_bf16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
|
|
897
992
|
}
|
|
898
993
|
|
|
899
|
-
NK_PUBLIC void nk_umeyama_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t
|
|
994
|
+
NK_PUBLIC void nk_umeyama_bf16_rvv(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
900
995
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
|
|
901
|
-
nk_umeyama_bf16_serial(a, b,
|
|
996
|
+
nk_umeyama_bf16_serial(a, b, points_count, a_centroid, b_centroid, rotation, scale, result);
|
|
902
997
|
}
|
|
903
998
|
|
|
904
999
|
#if defined(__cplusplus)
|