numkong 7.5.0 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +18 -0
- package/c/dispatch_e5m2.c +23 -3
- package/include/numkong/capabilities.h +1 -1
- package/include/numkong/cast/README.md +3 -0
- package/include/numkong/cast/haswell.h +28 -64
- package/include/numkong/cast/serial.h +17 -0
- package/include/numkong/cast/skylake.h +67 -52
- package/include/numkong/cast.h +1 -0
- package/include/numkong/dot/README.md +1 -0
- package/include/numkong/dot/haswell.h +92 -13
- package/include/numkong/dot/serial.h +15 -0
- package/include/numkong/dot/skylake.h +61 -14
- package/include/numkong/dots/README.md +2 -0
- package/include/numkong/dots/graniteamx.h +434 -0
- package/include/numkong/dots/haswell.h +28 -28
- package/include/numkong/dots/sapphireamx.h +1 -1
- package/include/numkong/dots/serial.h +23 -8
- package/include/numkong/dots/skylake.h +28 -23
- package/include/numkong/dots.h +12 -0
- package/include/numkong/each/serial.h +18 -1
- package/include/numkong/geospatial/serial.h +14 -3
- package/include/numkong/maxsim/serial.h +15 -0
- package/include/numkong/mesh/README.md +50 -44
- package/include/numkong/mesh/genoa.h +462 -0
- package/include/numkong/mesh/haswell.h +806 -933
- package/include/numkong/mesh/neon.h +871 -943
- package/include/numkong/mesh/neonbfdot.h +382 -522
- package/include/numkong/mesh/neonfhm.h +676 -0
- package/include/numkong/mesh/rvv.h +404 -319
- package/include/numkong/mesh/serial.h +204 -162
- package/include/numkong/mesh/skylake.h +1029 -1585
- package/include/numkong/mesh/v128relaxed.h +403 -377
- package/include/numkong/mesh.h +38 -0
- package/include/numkong/reduce/serial.h +15 -1
- package/include/numkong/sparse/serial.h +17 -2
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +98 -56
- package/include/numkong/spatial/serial.h +15 -0
- package/include/numkong/spatial/skylake.h +114 -54
- package/include/numkong/spatial.h +0 -12
- package/include/numkong/spatials/graniteamx.h +128 -0
- package/include/numkong/spatials/serial.h +18 -1
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials.h +17 -0
- package/include/numkong/tensor.hpp +107 -23
- package/javascript/numkong.c +3 -2
- package/package.json +7 -7
- package/wasm/numkong.wasm +0 -0
|
@@ -16,8 +16,8 @@
|
|
|
16
16
|
* Fused helpers minimize data passes:
|
|
17
17
|
*
|
|
18
18
|
* - RMSD: fully fused single-pass (centroids + squared diffs), no separate helper
|
|
19
|
-
* - `nk_centroid_and_cross_covariance_*_rvv_`: centroids + H
|
|
20
|
-
* - `nk_centroid_and_cross_covariance_and_variance_*_rvv_`:
|
|
19
|
+
* - `nk_centroid_and_cross_covariance_*_rvv_`: centroids + H + centered ‖·‖² of a and b (Kabsch)
|
|
20
|
+
* - `nk_centroid_and_cross_covariance_and_variance_*_rvv_`: same outputs, used by Umeyama
|
|
21
21
|
*
|
|
22
22
|
* Math for fused centroid+covariance:
|
|
23
23
|
* H[i][j] = Σ (a[i] - ca[i]) * (b[j] - cb[j])
|
|
@@ -104,7 +104,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
|
|
|
104
104
|
nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
|
|
105
105
|
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
106
106
|
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
107
|
-
nk_f64_t
|
|
107
|
+
nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
|
|
108
108
|
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
|
|
109
109
|
vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
110
110
|
sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
@@ -121,6 +121,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
|
|
|
121
121
|
vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
122
122
|
cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
123
123
|
vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
124
|
+
vfloat64m2_t norm_squared_a_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
125
|
+
vfloat64m2_t norm_squared_b_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
124
126
|
nk_f32_t const *a_ptr = a, *b_ptr = b;
|
|
125
127
|
nk_size_t remaining = points_count;
|
|
126
128
|
for (nk_size_t vector_length; remaining > 0;
|
|
@@ -149,6 +151,13 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
|
|
|
149
151
|
cross_20_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_20_f64m2, a_z_f32m1, b_x_f32m1, vector_length);
|
|
150
152
|
cross_21_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_21_f64m2, a_z_f32m1, b_y_f32m1, vector_length);
|
|
151
153
|
cross_22_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_22_f64m2, a_z_f32m1, b_z_f32m1, vector_length);
|
|
154
|
+
// Accumulate norm-squared for a and b (uncentered; centering fixup applied after reduction).
|
|
155
|
+
norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_x_f32m1, a_x_f32m1, vector_length);
|
|
156
|
+
norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_y_f32m1, a_y_f32m1, vector_length);
|
|
157
|
+
norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_z_f32m1, a_z_f32m1, vector_length);
|
|
158
|
+
norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_x_f32m1, b_x_f32m1, vector_length);
|
|
159
|
+
norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_y_f32m1, b_y_f32m1, vector_length);
|
|
160
|
+
norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_z_f32m1, b_z_f32m1, vector_length);
|
|
152
161
|
}
|
|
153
162
|
vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
|
|
154
163
|
// Compute centroids
|
|
@@ -179,24 +188,46 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_rvv_( //
|
|
|
179
188
|
*centroid_b_z = centroid_b_z_f64;
|
|
180
189
|
// Fix up: H[i][j] = raw[i][j] - points_count * ca[i] * cb[j]
|
|
181
190
|
nk_f64_t n_f64 = (nk_f64_t)points_count;
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
191
|
+
cross_covariance[0] = __riscv_vfmv_f_s_f64m1_f64(
|
|
192
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
|
|
193
|
+
n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
|
|
194
|
+
cross_covariance[1] = __riscv_vfmv_f_s_f64m1_f64(
|
|
195
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
|
|
196
|
+
n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
|
|
197
|
+
cross_covariance[2] = __riscv_vfmv_f_s_f64m1_f64(
|
|
198
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
|
|
199
|
+
n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
|
|
200
|
+
cross_covariance[3] = __riscv_vfmv_f_s_f64m1_f64(
|
|
201
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
|
|
202
|
+
n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
|
|
203
|
+
cross_covariance[4] = __riscv_vfmv_f_s_f64m1_f64(
|
|
204
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
|
|
205
|
+
n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
|
|
206
|
+
cross_covariance[5] = __riscv_vfmv_f_s_f64m1_f64(
|
|
207
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
|
|
208
|
+
n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
|
|
209
|
+
cross_covariance[6] = __riscv_vfmv_f_s_f64m1_f64(
|
|
210
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
|
|
211
|
+
n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
|
|
212
|
+
cross_covariance[7] = __riscv_vfmv_f_s_f64m1_f64(
|
|
213
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
|
|
214
|
+
n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
|
|
215
|
+
cross_covariance[8] = __riscv_vfmv_f_s_f64m1_f64(
|
|
216
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
|
|
217
|
+
n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
|
|
218
|
+
// Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
|
|
219
|
+
nk_f64_t norm_squared_a_sum = __riscv_vfmv_f_s_f64m1_f64(
|
|
220
|
+
__riscv_vfredusum_vs_f64m2_f64m1(norm_squared_a_f64m2, zero_f64m1, max_vector_length));
|
|
221
|
+
nk_f64_t norm_squared_b_sum = __riscv_vfmv_f_s_f64m1_f64(
|
|
222
|
+
__riscv_vfredusum_vs_f64m2_f64m1(norm_squared_b_f64m2, zero_f64m1, max_vector_length));
|
|
223
|
+
*centered_norm_squared_a = norm_squared_a_sum -
|
|
224
|
+
n_f64 * (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
|
|
225
|
+
centroid_a_z_f64 * centroid_a_z_f64);
|
|
226
|
+
*centered_norm_squared_b = norm_squared_b_sum -
|
|
227
|
+
n_f64 * (centroid_b_x_f64 * centroid_b_x_f64 + centroid_b_y_f64 * centroid_b_y_f64 +
|
|
228
|
+
centroid_b_z_f64 * centroid_b_z_f64);
|
|
229
|
+
if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
|
|
230
|
+
if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
|
|
200
231
|
}
|
|
201
232
|
|
|
202
233
|
/**
|
|
@@ -209,7 +240,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
|
|
|
209
240
|
nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
|
|
210
241
|
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
211
242
|
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
212
|
-
nk_f64_t
|
|
243
|
+
nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
|
|
213
244
|
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
|
|
214
245
|
vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
215
246
|
sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
@@ -241,6 +272,10 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
|
|
|
241
272
|
vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
242
273
|
vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
243
274
|
vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
275
|
+
vfloat64m1_t norm_squared_a_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
276
|
+
vfloat64m1_t norm_squared_b_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
277
|
+
vfloat64m1_t compensation_norm_squared_a_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
278
|
+
vfloat64m1_t compensation_norm_squared_b_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
244
279
|
nk_f64_t const *a_ptr = a, *b_ptr = b;
|
|
245
280
|
nk_size_t remaining = points_count;
|
|
246
281
|
for (nk_size_t vector_length; remaining > 0;
|
|
@@ -269,6 +304,19 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
|
|
|
269
304
|
nk_accumulate_product_f64m1_rvv_(&cross_20_f64m1, &compensation_20_f64m1, a_z_f64m1, b_x_f64m1, vector_length);
|
|
270
305
|
nk_accumulate_product_f64m1_rvv_(&cross_21_f64m1, &compensation_21_f64m1, a_z_f64m1, b_y_f64m1, vector_length);
|
|
271
306
|
nk_accumulate_product_f64m1_rvv_(&cross_22_f64m1, &compensation_22_f64m1, a_z_f64m1, b_z_f64m1, vector_length);
|
|
307
|
+
// Accumulate norm-squared for a and b via Kahan-compensated products (self*self).
|
|
308
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_x_f64m1,
|
|
309
|
+
a_x_f64m1, vector_length);
|
|
310
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_y_f64m1,
|
|
311
|
+
a_y_f64m1, vector_length);
|
|
312
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_z_f64m1,
|
|
313
|
+
a_z_f64m1, vector_length);
|
|
314
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_x_f64m1,
|
|
315
|
+
b_x_f64m1, vector_length);
|
|
316
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_y_f64m1,
|
|
317
|
+
b_y_f64m1, vector_length);
|
|
318
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_z_f64m1,
|
|
319
|
+
b_z_f64m1, vector_length);
|
|
272
320
|
}
|
|
273
321
|
// Compute centroids.
|
|
274
322
|
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
@@ -285,32 +333,45 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f64_rvv_( //
|
|
|
285
333
|
*centroid_b_y = centroid_b_y_f64;
|
|
286
334
|
*centroid_b_z = centroid_b_z_f64;
|
|
287
335
|
nk_f64_t n_f64 = (nk_f64_t)points_count;
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
336
|
+
cross_covariance[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
|
|
337
|
+
n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
|
|
338
|
+
cross_covariance[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
|
|
339
|
+
n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
|
|
340
|
+
cross_covariance[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
|
|
341
|
+
n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
|
|
342
|
+
cross_covariance[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
|
|
343
|
+
n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
|
|
344
|
+
cross_covariance[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
|
|
345
|
+
n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
|
|
346
|
+
cross_covariance[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
|
|
347
|
+
n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
|
|
348
|
+
cross_covariance[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
|
|
349
|
+
n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
|
|
350
|
+
cross_covariance[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
|
|
351
|
+
n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
|
|
352
|
+
cross_covariance[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
|
|
353
|
+
n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
|
|
354
|
+
// Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
|
|
355
|
+
nk_f64_t norm_squared_a_sum = nk_dot_stable_sum_f64m1_rvv_(norm_squared_a_f64m1, compensation_norm_squared_a_f64m1);
|
|
356
|
+
nk_f64_t norm_squared_b_sum = nk_dot_stable_sum_f64m1_rvv_(norm_squared_b_f64m1, compensation_norm_squared_b_f64m1);
|
|
357
|
+
*centered_norm_squared_a = norm_squared_a_sum -
|
|
358
|
+
n_f64 * (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
|
|
359
|
+
centroid_a_z_f64 * centroid_a_z_f64);
|
|
360
|
+
*centered_norm_squared_b = norm_squared_b_sum -
|
|
361
|
+
n_f64 * (centroid_b_x_f64 * centroid_b_x_f64 + centroid_b_y_f64 * centroid_b_y_f64 +
|
|
362
|
+
centroid_b_z_f64 * centroid_b_z_f64);
|
|
363
|
+
if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
|
|
364
|
+
if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
|
|
306
365
|
}
|
|
307
366
|
|
|
308
367
|
/**
|
|
309
|
-
* @brief Compute centroids, cross-covariance, and
|
|
368
|
+
* @brief Compute centroids, cross-covariance, and centered norm-squared of both point sets (f32).
|
|
310
369
|
*
|
|
311
|
-
* Same as centroid_and_cross_covariance but also
|
|
312
|
-
*
|
|
313
|
-
*
|
|
370
|
+
* Same as centroid_and_cross_covariance but also outputs:
|
|
371
|
+
* centered_norm_squared_a = Σ ||a[i] - ca||² = Σ ||a[i]||² - n * ||ca||²
|
|
372
|
+
* centered_norm_squared_b = Σ ||b[i] - cb||² = Σ ||b[i]||² - n * ||cb||²
|
|
373
|
+
*
|
|
374
|
+
* These enable the trace-identity SSD fold in Kabsch/Umeyama callers.
|
|
314
375
|
*
|
|
315
376
|
* Cross-products use per-lane `vfwmacc_vv` accumulation (vfloat64m2_t) with
|
|
316
377
|
* deferred `vfredusum` after the loop — eliminates 9 reductions per iteration.
|
|
@@ -319,7 +380,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
|
|
|
319
380
|
nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, //
|
|
320
381
|
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
321
382
|
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
322
|
-
nk_f64_t
|
|
383
|
+
nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
|
|
323
384
|
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
|
|
324
385
|
vfloat64m2_t sum_a_x_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
325
386
|
sum_a_y_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
@@ -336,7 +397,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
|
|
|
336
397
|
vfloat64m2_t cross_20_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length),
|
|
337
398
|
cross_21_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
338
399
|
vfloat64m2_t cross_22_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
339
|
-
vfloat64m2_t
|
|
400
|
+
vfloat64m2_t norm_squared_a_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
401
|
+
vfloat64m2_t norm_squared_b_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
340
402
|
nk_f32_t const *a_ptr = a, *b_ptr = b;
|
|
341
403
|
nk_size_t remaining = points_count;
|
|
342
404
|
for (nk_size_t vector_length; remaining > 0;
|
|
@@ -365,12 +427,13 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
|
|
|
365
427
|
cross_20_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_20_f64m2, a_z_f32m1, b_x_f32m1, vector_length);
|
|
366
428
|
cross_21_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_21_f64m2, a_z_f32m1, b_y_f32m1, vector_length);
|
|
367
429
|
cross_22_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(cross_22_f64m2, a_z_f32m1, b_z_f32m1, vector_length);
|
|
368
|
-
//
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
430
|
+
// Accumulate norm-squared for a and b (uncentered; centering fixup applied after reduction).
|
|
431
|
+
norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_x_f32m1, a_x_f32m1, vector_length);
|
|
432
|
+
norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_y_f32m1, a_y_f32m1, vector_length);
|
|
433
|
+
norm_squared_a_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_a_f64m2, a_z_f32m1, a_z_f32m1, vector_length);
|
|
434
|
+
norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_x_f32m1, b_x_f32m1, vector_length);
|
|
435
|
+
norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_y_f32m1, b_y_f32m1, vector_length);
|
|
436
|
+
norm_squared_b_f64m2 = __riscv_vfwmacc_vv_f64m2_tu(norm_squared_b_f64m2, b_z_f32m1, b_z_f32m1, vector_length);
|
|
374
437
|
}
|
|
375
438
|
vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
|
|
376
439
|
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
@@ -399,35 +462,52 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_rvv_( //
|
|
|
399
462
|
*centroid_b_y = centroid_b_y_f64;
|
|
400
463
|
*centroid_b_z = centroid_b_z_f64;
|
|
401
464
|
nk_f64_t n_f64 = (nk_f64_t)points_count;
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
465
|
+
cross_covariance[0] = __riscv_vfmv_f_s_f64m1_f64(
|
|
466
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_00_f64m2, zero_f64m1, max_vector_length)) -
|
|
467
|
+
n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
|
|
468
|
+
cross_covariance[1] = __riscv_vfmv_f_s_f64m1_f64(
|
|
469
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_01_f64m2, zero_f64m1, max_vector_length)) -
|
|
470
|
+
n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
|
|
471
|
+
cross_covariance[2] = __riscv_vfmv_f_s_f64m1_f64(
|
|
472
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_02_f64m2, zero_f64m1, max_vector_length)) -
|
|
473
|
+
n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
|
|
474
|
+
cross_covariance[3] = __riscv_vfmv_f_s_f64m1_f64(
|
|
475
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_10_f64m2, zero_f64m1, max_vector_length)) -
|
|
476
|
+
n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
|
|
477
|
+
cross_covariance[4] = __riscv_vfmv_f_s_f64m1_f64(
|
|
478
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_11_f64m2, zero_f64m1, max_vector_length)) -
|
|
479
|
+
n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
|
|
480
|
+
cross_covariance[5] = __riscv_vfmv_f_s_f64m1_f64(
|
|
481
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_12_f64m2, zero_f64m1, max_vector_length)) -
|
|
482
|
+
n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
|
|
483
|
+
cross_covariance[6] = __riscv_vfmv_f_s_f64m1_f64(
|
|
484
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_20_f64m2, zero_f64m1, max_vector_length)) -
|
|
485
|
+
n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
|
|
486
|
+
cross_covariance[7] = __riscv_vfmv_f_s_f64m1_f64(
|
|
487
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_21_f64m2, zero_f64m1, max_vector_length)) -
|
|
488
|
+
n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
|
|
489
|
+
cross_covariance[8] = __riscv_vfmv_f_s_f64m1_f64(
|
|
490
|
+
__riscv_vfredusum_vs_f64m2_f64m1(cross_22_f64m2, zero_f64m1, max_vector_length)) -
|
|
491
|
+
n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
|
|
492
|
+
// Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
|
|
493
|
+
nk_f64_t norm_squared_a_sum = __riscv_vfmv_f_s_f64m1_f64(
|
|
494
|
+
__riscv_vfredusum_vs_f64m2_f64m1(norm_squared_a_f64m2, zero_f64m1, max_vector_length));
|
|
495
|
+
nk_f64_t norm_squared_b_sum = __riscv_vfmv_f_s_f64m1_f64(
|
|
496
|
+
__riscv_vfredusum_vs_f64m2_f64m1(norm_squared_b_f64m2, zero_f64m1, max_vector_length));
|
|
497
|
+
*centered_norm_squared_a = norm_squared_a_sum -
|
|
498
|
+
n_f64 * (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
|
|
499
|
+
centroid_a_z_f64 * centroid_a_z_f64);
|
|
500
|
+
*centered_norm_squared_b = norm_squared_b_sum -
|
|
501
|
+
n_f64 * (centroid_b_x_f64 * centroid_b_x_f64 + centroid_b_y_f64 * centroid_b_y_f64 +
|
|
502
|
+
centroid_b_z_f64 * centroid_b_z_f64);
|
|
503
|
+
if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
|
|
504
|
+
if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
|
|
426
505
|
}
|
|
427
506
|
|
|
428
507
|
/**
|
|
429
|
-
* @brief Compute centroids, cross-covariance, and
|
|
508
|
+
* @brief Compute centroids, cross-covariance, and centered norm-squared of both point sets (f64).
|
|
430
509
|
*
|
|
510
|
+
* Same outputs as the f32 variant; used by the Umeyama caller for the trace-identity SSD fold.
|
|
431
511
|
* Per-lane `vfadd_vv`/`vfmacc_vv` accumulation with deferred `vfredusum` after the loop
|
|
432
512
|
* — eliminates 16 horizontal reductions per iteration.
|
|
433
513
|
*/
|
|
@@ -435,7 +515,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
|
|
|
435
515
|
nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
|
|
436
516
|
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
437
517
|
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
438
|
-
nk_f64_t
|
|
518
|
+
nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
|
|
439
519
|
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
|
|
440
520
|
vfloat64m1_t sum_a_x_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length),
|
|
441
521
|
sum_a_y_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
@@ -467,8 +547,10 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
|
|
|
467
547
|
vfloat64m1_t compensation_20_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
468
548
|
vfloat64m1_t compensation_21_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
469
549
|
vfloat64m1_t compensation_22_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
470
|
-
vfloat64m1_t
|
|
471
|
-
vfloat64m1_t
|
|
550
|
+
vfloat64m1_t norm_squared_a_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
551
|
+
vfloat64m1_t norm_squared_b_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
552
|
+
vfloat64m1_t compensation_norm_squared_a_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
553
|
+
vfloat64m1_t compensation_norm_squared_b_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
472
554
|
nk_f64_t const *a_ptr = a, *b_ptr = b;
|
|
473
555
|
nk_size_t remaining = points_count;
|
|
474
556
|
for (nk_size_t vector_length; remaining > 0;
|
|
@@ -497,11 +579,19 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
|
|
|
497
579
|
nk_accumulate_product_f64m1_rvv_(&cross_20_f64m1, &compensation_20_f64m1, a_z_f64m1, b_x_f64m1, vector_length);
|
|
498
580
|
nk_accumulate_product_f64m1_rvv_(&cross_21_f64m1, &compensation_21_f64m1, a_z_f64m1, b_y_f64m1, vector_length);
|
|
499
581
|
nk_accumulate_product_f64m1_rvv_(&cross_22_f64m1, &compensation_22_f64m1, a_z_f64m1, b_z_f64m1, vector_length);
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
582
|
+
// Accumulate norm-squared for a and b via Kahan-compensated products (self*self).
|
|
583
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_x_f64m1,
|
|
584
|
+
a_x_f64m1, vector_length);
|
|
585
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_y_f64m1,
|
|
586
|
+
a_y_f64m1, vector_length);
|
|
587
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_a_f64m1, &compensation_norm_squared_a_f64m1, a_z_f64m1,
|
|
588
|
+
a_z_f64m1, vector_length);
|
|
589
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_x_f64m1,
|
|
590
|
+
b_x_f64m1, vector_length);
|
|
591
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_y_f64m1,
|
|
592
|
+
b_y_f64m1, vector_length);
|
|
593
|
+
nk_accumulate_product_f64m1_rvv_(&norm_squared_b_f64m1, &compensation_norm_squared_b_f64m1, b_z_f64m1,
|
|
594
|
+
b_z_f64m1, vector_length);
|
|
505
595
|
}
|
|
506
596
|
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)points_count;
|
|
507
597
|
nk_f64_t centroid_a_x_f64 = nk_dot_stable_sum_f64m1_rvv_(sum_a_x_f64m1, compensation_a_x_f64m1) * inv_points_count;
|
|
@@ -517,182 +607,59 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f64_rvv_( //
|
|
|
517
607
|
*centroid_b_y = centroid_b_y_f64;
|
|
518
608
|
*centroid_b_z = centroid_b_z_f64;
|
|
519
609
|
nk_f64_t n_f64 = (nk_f64_t)points_count;
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
nk_f64_t scaled_rotation_x_x = scale * r[0], scaled_rotation_x_y = scale * r[1], scaled_rotation_x_z = scale * r[2];
|
|
550
|
-
nk_f64_t scaled_rotation_y_x = scale * r[3], scaled_rotation_y_y = scale * r[4], scaled_rotation_y_z = scale * r[5];
|
|
551
|
-
nk_f64_t scaled_rotation_z_x = scale * r[6], scaled_rotation_z_y = scale * r[7], scaled_rotation_z_z = scale * r[8];
|
|
552
|
-
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m2();
|
|
553
|
-
vfloat64m2_t sum_distance_squared_f64m2 = __riscv_vfmv_v_f_f64m2(0.0, max_vector_length);
|
|
554
|
-
vfloat64m1_t zero_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, 1);
|
|
555
|
-
nk_f32_t const *a_ptr = a, *b_ptr = b;
|
|
556
|
-
nk_size_t remaining = points_count;
|
|
557
|
-
for (nk_size_t vector_length; remaining > 0;
|
|
558
|
-
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
559
|
-
vector_length = __riscv_vsetvl_e32m1(remaining);
|
|
560
|
-
vfloat32m1x3_t a_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(a_ptr, vector_length);
|
|
561
|
-
vfloat64m2_t centered_a_x_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
562
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 0), vector_length), centroid_a_x,
|
|
563
|
-
vector_length);
|
|
564
|
-
vfloat64m2_t centered_a_y_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
565
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 1), vector_length), centroid_a_y,
|
|
566
|
-
vector_length);
|
|
567
|
-
vfloat64m2_t centered_a_z_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
568
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(a_f32m1x3, 2), vector_length), centroid_a_z,
|
|
569
|
-
vector_length);
|
|
570
|
-
vfloat64m2_t rotated_a_x_f64m2 = __riscv_vfmul_vf_f64m2(centered_a_x_f64m2, scaled_rotation_x_x, vector_length);
|
|
571
|
-
rotated_a_x_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_x_f64m2, scaled_rotation_x_y, centered_a_y_f64m2,
|
|
572
|
-
vector_length);
|
|
573
|
-
rotated_a_x_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_x_f64m2, scaled_rotation_x_z, centered_a_z_f64m2,
|
|
574
|
-
vector_length);
|
|
575
|
-
vfloat64m2_t rotated_a_y_f64m2 = __riscv_vfmul_vf_f64m2(centered_a_x_f64m2, scaled_rotation_y_x, vector_length);
|
|
576
|
-
rotated_a_y_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_y_f64m2, scaled_rotation_y_y, centered_a_y_f64m2,
|
|
577
|
-
vector_length);
|
|
578
|
-
rotated_a_y_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_y_f64m2, scaled_rotation_y_z, centered_a_z_f64m2,
|
|
579
|
-
vector_length);
|
|
580
|
-
vfloat64m2_t rotated_a_z_f64m2 = __riscv_vfmul_vf_f64m2(centered_a_x_f64m2, scaled_rotation_z_x, vector_length);
|
|
581
|
-
rotated_a_z_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_z_f64m2, scaled_rotation_z_y, centered_a_y_f64m2,
|
|
582
|
-
vector_length);
|
|
583
|
-
rotated_a_z_f64m2 = __riscv_vfmacc_vf_f64m2(rotated_a_z_f64m2, scaled_rotation_z_z, centered_a_z_f64m2,
|
|
584
|
-
vector_length);
|
|
585
|
-
vfloat32m1x3_t b_f32m1x3 = __riscv_vlseg3e32_v_f32m1x3(b_ptr, vector_length);
|
|
586
|
-
vfloat64m2_t centered_b_x_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
587
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 0), vector_length), centroid_b_x,
|
|
588
|
-
vector_length);
|
|
589
|
-
vfloat64m2_t centered_b_y_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
590
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 1), vector_length), centroid_b_y,
|
|
591
|
-
vector_length);
|
|
592
|
-
vfloat64m2_t centered_b_z_f64m2 = __riscv_vfsub_vf_f64m2(
|
|
593
|
-
__riscv_vfwcvt_f_f_v_f64m2(__riscv_vget_v_f32m1x3_f32m1(b_f32m1x3, 2), vector_length), centroid_b_z,
|
|
594
|
-
vector_length);
|
|
595
|
-
vfloat64m2_t delta_x_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_x_f64m2, centered_b_x_f64m2, vector_length);
|
|
596
|
-
vfloat64m2_t delta_y_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_y_f64m2, centered_b_y_f64m2, vector_length);
|
|
597
|
-
vfloat64m2_t delta_z_f64m2 = __riscv_vfsub_vv_f64m2(rotated_a_z_f64m2, centered_b_z_f64m2, vector_length);
|
|
598
|
-
sum_distance_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_distance_squared_f64m2, delta_x_f64m2,
|
|
599
|
-
delta_x_f64m2, vector_length);
|
|
600
|
-
sum_distance_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_distance_squared_f64m2, delta_y_f64m2,
|
|
601
|
-
delta_y_f64m2, vector_length);
|
|
602
|
-
sum_distance_squared_f64m2 = __riscv_vfmacc_vv_f64m2_tu(sum_distance_squared_f64m2, delta_z_f64m2,
|
|
603
|
-
delta_z_f64m2, vector_length);
|
|
604
|
-
}
|
|
605
|
-
return __riscv_vfmv_f_s_f64m1_f64(
|
|
606
|
-
__riscv_vfredusum_vs_f64m2_f64m1(sum_distance_squared_f64m2, zero_f64m1, max_vector_length));
|
|
607
|
-
}
|
|
608
|
-
|
|
609
|
-
NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_rvv_( //
|
|
610
|
-
nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, //
|
|
611
|
-
nk_f64_t const *r, nk_f64_t scale, //
|
|
612
|
-
nk_f64_t centroid_a_x, nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, //
|
|
613
|
-
nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
|
|
614
|
-
nk_f64_t scaled_rotation_x_x = scale * r[0], scaled_rotation_x_y = scale * r[1], scaled_rotation_x_z = scale * r[2];
|
|
615
|
-
nk_f64_t scaled_rotation_y_x = scale * r[3], scaled_rotation_y_y = scale * r[4], scaled_rotation_y_z = scale * r[5];
|
|
616
|
-
nk_f64_t scaled_rotation_z_x = scale * r[6], scaled_rotation_z_y = scale * r[7], scaled_rotation_z_z = scale * r[8];
|
|
617
|
-
nk_size_t max_vector_length = __riscv_vsetvlmax_e64m1();
|
|
618
|
-
vfloat64m1_t sum_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
619
|
-
vfloat64m1_t compensation_distance_squared_f64m1 = __riscv_vfmv_v_f_f64m1(0.0, max_vector_length);
|
|
620
|
-
nk_f64_t const *a_ptr = a, *b_ptr = b;
|
|
621
|
-
nk_size_t remaining = points_count;
|
|
622
|
-
for (nk_size_t vector_length; remaining > 0;
|
|
623
|
-
remaining -= vector_length, a_ptr += vector_length * 3, b_ptr += vector_length * 3) {
|
|
624
|
-
vector_length = __riscv_vsetvl_e64m1(remaining);
|
|
625
|
-
vfloat64m1x3_t a_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(a_ptr, vector_length);
|
|
626
|
-
vfloat64m1_t centered_a_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 0),
|
|
627
|
-
centroid_a_x, vector_length);
|
|
628
|
-
vfloat64m1_t centered_a_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 1),
|
|
629
|
-
centroid_a_y, vector_length);
|
|
630
|
-
vfloat64m1_t centered_a_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(a_f64m1x3, 2),
|
|
631
|
-
centroid_a_z, vector_length);
|
|
632
|
-
vfloat64m1_t rotated_a_x_f64m1 = __riscv_vfmul_vf_f64m1(centered_a_x_f64m1, scaled_rotation_x_x, vector_length);
|
|
633
|
-
rotated_a_x_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_x_f64m1, scaled_rotation_x_y, centered_a_y_f64m1,
|
|
634
|
-
vector_length);
|
|
635
|
-
rotated_a_x_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_x_f64m1, scaled_rotation_x_z, centered_a_z_f64m1,
|
|
636
|
-
vector_length);
|
|
637
|
-
vfloat64m1_t rotated_a_y_f64m1 = __riscv_vfmul_vf_f64m1(centered_a_x_f64m1, scaled_rotation_y_x, vector_length);
|
|
638
|
-
rotated_a_y_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_y_f64m1, scaled_rotation_y_y, centered_a_y_f64m1,
|
|
639
|
-
vector_length);
|
|
640
|
-
rotated_a_y_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_y_f64m1, scaled_rotation_y_z, centered_a_z_f64m1,
|
|
641
|
-
vector_length);
|
|
642
|
-
vfloat64m1_t rotated_a_z_f64m1 = __riscv_vfmul_vf_f64m1(centered_a_x_f64m1, scaled_rotation_z_x, vector_length);
|
|
643
|
-
rotated_a_z_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_z_f64m1, scaled_rotation_z_y, centered_a_y_f64m1,
|
|
644
|
-
vector_length);
|
|
645
|
-
rotated_a_z_f64m1 = __riscv_vfmacc_vf_f64m1(rotated_a_z_f64m1, scaled_rotation_z_z, centered_a_z_f64m1,
|
|
646
|
-
vector_length);
|
|
647
|
-
vfloat64m1x3_t b_f64m1x3 = __riscv_vlseg3e64_v_f64m1x3(b_ptr, vector_length);
|
|
648
|
-
vfloat64m1_t centered_b_x_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 0),
|
|
649
|
-
centroid_b_x, vector_length);
|
|
650
|
-
vfloat64m1_t centered_b_y_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 1),
|
|
651
|
-
centroid_b_y, vector_length);
|
|
652
|
-
vfloat64m1_t centered_b_z_f64m1 = __riscv_vfsub_vf_f64m1(__riscv_vget_v_f64m1x3_f64m1(b_f64m1x3, 2),
|
|
653
|
-
centroid_b_z, vector_length);
|
|
654
|
-
vfloat64m1_t delta_x_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_x_f64m1, centered_b_x_f64m1, vector_length);
|
|
655
|
-
vfloat64m1_t delta_y_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_y_f64m1, centered_b_y_f64m1, vector_length);
|
|
656
|
-
vfloat64m1_t delta_z_f64m1 = __riscv_vfsub_vv_f64m1(rotated_a_z_f64m1, centered_b_z_f64m1, vector_length);
|
|
657
|
-
vfloat64m1_t distance_squared_f64m1 = __riscv_vfmul_vv_f64m1(delta_x_f64m1, delta_x_f64m1, vector_length);
|
|
658
|
-
distance_squared_f64m1 = __riscv_vfmacc_vv_f64m1(distance_squared_f64m1, delta_y_f64m1, delta_y_f64m1,
|
|
659
|
-
vector_length);
|
|
660
|
-
distance_squared_f64m1 = __riscv_vfmacc_vv_f64m1(distance_squared_f64m1, delta_z_f64m1, delta_z_f64m1,
|
|
661
|
-
vector_length);
|
|
662
|
-
nk_accumulate_sum_f64m1_rvv_(&sum_distance_squared_f64m1, &compensation_distance_squared_f64m1,
|
|
663
|
-
distance_squared_f64m1, vector_length);
|
|
664
|
-
}
|
|
665
|
-
return nk_dot_stable_sum_f64m1_rvv_(sum_distance_squared_f64m1, compensation_distance_squared_f64m1);
|
|
610
|
+
cross_covariance[0] = nk_dot_stable_sum_f64m1_rvv_(cross_00_f64m1, compensation_00_f64m1) -
|
|
611
|
+
n_f64 * centroid_a_x_f64 * centroid_b_x_f64;
|
|
612
|
+
cross_covariance[1] = nk_dot_stable_sum_f64m1_rvv_(cross_01_f64m1, compensation_01_f64m1) -
|
|
613
|
+
n_f64 * centroid_a_x_f64 * centroid_b_y_f64;
|
|
614
|
+
cross_covariance[2] = nk_dot_stable_sum_f64m1_rvv_(cross_02_f64m1, compensation_02_f64m1) -
|
|
615
|
+
n_f64 * centroid_a_x_f64 * centroid_b_z_f64;
|
|
616
|
+
cross_covariance[3] = nk_dot_stable_sum_f64m1_rvv_(cross_10_f64m1, compensation_10_f64m1) -
|
|
617
|
+
n_f64 * centroid_a_y_f64 * centroid_b_x_f64;
|
|
618
|
+
cross_covariance[4] = nk_dot_stable_sum_f64m1_rvv_(cross_11_f64m1, compensation_11_f64m1) -
|
|
619
|
+
n_f64 * centroid_a_y_f64 * centroid_b_y_f64;
|
|
620
|
+
cross_covariance[5] = nk_dot_stable_sum_f64m1_rvv_(cross_12_f64m1, compensation_12_f64m1) -
|
|
621
|
+
n_f64 * centroid_a_y_f64 * centroid_b_z_f64;
|
|
622
|
+
cross_covariance[6] = nk_dot_stable_sum_f64m1_rvv_(cross_20_f64m1, compensation_20_f64m1) -
|
|
623
|
+
n_f64 * centroid_a_z_f64 * centroid_b_x_f64;
|
|
624
|
+
cross_covariance[7] = nk_dot_stable_sum_f64m1_rvv_(cross_21_f64m1, compensation_21_f64m1) -
|
|
625
|
+
n_f64 * centroid_a_z_f64 * centroid_b_y_f64;
|
|
626
|
+
cross_covariance[8] = nk_dot_stable_sum_f64m1_rvv_(cross_22_f64m1, compensation_22_f64m1) -
|
|
627
|
+
n_f64 * centroid_a_z_f64 * centroid_b_z_f64;
|
|
628
|
+
// Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
|
|
629
|
+
nk_f64_t norm_squared_a_sum = nk_dot_stable_sum_f64m1_rvv_(norm_squared_a_f64m1, compensation_norm_squared_a_f64m1);
|
|
630
|
+
nk_f64_t norm_squared_b_sum = nk_dot_stable_sum_f64m1_rvv_(norm_squared_b_f64m1, compensation_norm_squared_b_f64m1);
|
|
631
|
+
*centered_norm_squared_a = norm_squared_a_sum -
|
|
632
|
+
n_f64 * (centroid_a_x_f64 * centroid_a_x_f64 + centroid_a_y_f64 * centroid_a_y_f64 +
|
|
633
|
+
centroid_a_z_f64 * centroid_a_z_f64);
|
|
634
|
+
*centered_norm_squared_b = norm_squared_b_sum -
|
|
635
|
+
n_f64 * (centroid_b_x_f64 * centroid_b_x_f64 + centroid_b_y_f64 * centroid_b_y_f64 +
|
|
636
|
+
centroid_b_z_f64 * centroid_b_z_f64);
|
|
637
|
+
if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
|
|
638
|
+
if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
|
|
666
639
|
}
|
|
667
640
|
|
|
668
641
|
/** @brief Compute R = V * Uᵀ from SVD factors (f32), vectorized with `vfmul_vf`/`vfmacc_vf`. */
|
|
669
642
|
NK_INTERNAL void nk_rotation_from_svd_f32_rvv_( //
|
|
670
|
-
nk_f32_t *
|
|
643
|
+
nk_f32_t *svd_left, nk_f32_t *svd_right, nk_f32_t optimal_rotation[9]) {
|
|
671
644
|
nk_size_t vl3 = __riscv_vsetvl_e32m1(3);
|
|
672
|
-
vfloat32m1_t u_row0_f32m1 = __riscv_vle32_v_f32m1(
|
|
673
|
-
vfloat32m1_t u_row1_f32m1 = __riscv_vle32_v_f32m1(
|
|
674
|
-
vfloat32m1_t u_row2_f32m1 = __riscv_vle32_v_f32m1(
|
|
645
|
+
vfloat32m1_t u_row0_f32m1 = __riscv_vle32_v_f32m1(svd_left + 0, vl3);
|
|
646
|
+
vfloat32m1_t u_row1_f32m1 = __riscv_vle32_v_f32m1(svd_left + 3, vl3);
|
|
647
|
+
vfloat32m1_t u_row2_f32m1 = __riscv_vle32_v_f32m1(svd_left + 6, vl3);
|
|
675
648
|
// Row 0: R[0..2] = V[0]*U_row0 + V[1]*U_row1 + V[2]*U_row2
|
|
676
|
-
vfloat32m1_t rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1,
|
|
677
|
-
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1,
|
|
678
|
-
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1,
|
|
679
|
-
__riscv_vse32_v_f32m1(
|
|
649
|
+
vfloat32m1_t rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_right[0], vl3);
|
|
650
|
+
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[1], u_row1_f32m1, vl3);
|
|
651
|
+
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[2], u_row2_f32m1, vl3);
|
|
652
|
+
__riscv_vse32_v_f32m1(optimal_rotation + 0, rotation_row_f32m1, vl3);
|
|
680
653
|
// Row 1: R[3..5]
|
|
681
|
-
rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1,
|
|
682
|
-
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1,
|
|
683
|
-
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1,
|
|
684
|
-
__riscv_vse32_v_f32m1(
|
|
654
|
+
rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_right[3], vl3);
|
|
655
|
+
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[4], u_row1_f32m1, vl3);
|
|
656
|
+
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[5], u_row2_f32m1, vl3);
|
|
657
|
+
__riscv_vse32_v_f32m1(optimal_rotation + 3, rotation_row_f32m1, vl3);
|
|
685
658
|
// Row 2: R[6..8]
|
|
686
|
-
rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1,
|
|
687
|
-
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1,
|
|
688
|
-
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1,
|
|
689
|
-
__riscv_vse32_v_f32m1(
|
|
690
|
-
}
|
|
691
|
-
|
|
692
|
-
/** @brief Compute R = V * Uᵀ from SVD factors (f64), vectorized with `vfmul_vf`/`vfmacc_vf`. */
|
|
693
|
-
NK_INTERNAL void nk_rotation_from_svd_f64_rvv_( //
|
|
694
|
-
nk_f64_t *svd_u, nk_f64_t *svd_v, nk_f64_t r[9]) {
|
|
695
|
-
nk_rotation_from_svd_f64_serial_(svd_u, svd_v, r);
|
|
659
|
+
rotation_row_f32m1 = __riscv_vfmul_vf_f32m1(u_row0_f32m1, svd_right[6], vl3);
|
|
660
|
+
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[7], u_row1_f32m1, vl3);
|
|
661
|
+
rotation_row_f32m1 = __riscv_vfmacc_vf_f32m1(rotation_row_f32m1, svd_right[8], u_row2_f32m1, vl3);
|
|
662
|
+
__riscv_vse32_v_f32m1(optimal_rotation + 6, rotation_row_f32m1, vl3);
|
|
696
663
|
}
|
|
697
664
|
|
|
698
665
|
NK_PUBLIC void nk_rmsd_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
@@ -781,114 +748,232 @@ NK_PUBLIC void nk_kabsch_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
|
|
|
781
748
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
|
|
782
749
|
if (scale) *scale = 1.0f;
|
|
783
750
|
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
784
|
-
nk_f64_t
|
|
751
|
+
nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
|
|
752
|
+
nk_f64_t cross_covariance[9];
|
|
785
753
|
nk_centroid_and_cross_covariance_f32_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z,
|
|
786
|
-
&centroid_b_x, &centroid_b_y, &centroid_b_z,
|
|
754
|
+
&centroid_b_x, &centroid_b_y, &centroid_b_z, cross_covariance,
|
|
755
|
+
&centered_norm_squared_a, &centered_norm_squared_b);
|
|
787
756
|
if (a_centroid)
|
|
788
757
|
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
789
758
|
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
790
759
|
if (b_centroid)
|
|
791
760
|
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
792
761
|
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
nk_f64_t
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
762
|
+
|
|
763
|
+
// Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
|
|
764
|
+
nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
|
|
765
|
+
cross_covariance[4] * cross_covariance[4] +
|
|
766
|
+
cross_covariance[8] * cross_covariance[8];
|
|
767
|
+
nk_f64_t covariance_offdiagonal_norm_squared =
|
|
768
|
+
cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
|
|
769
|
+
cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
|
|
770
|
+
cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
|
|
771
|
+
nk_f64_t optimal_rotation[9];
|
|
772
|
+
nk_f64_t trace_rotation_covariance;
|
|
773
|
+
if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
|
|
774
|
+
cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
|
|
775
|
+
optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
|
|
776
|
+
optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
|
|
777
|
+
optimal_rotation[8] = 1;
|
|
778
|
+
trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
|
|
779
|
+
}
|
|
780
|
+
else {
|
|
781
|
+
nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
|
|
782
|
+
nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
|
|
783
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
784
|
+
if (nk_det3x3_f64_(optimal_rotation) < 0) {
|
|
785
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
|
|
786
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
787
|
+
}
|
|
788
|
+
trace_rotation_covariance =
|
|
789
|
+
optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
|
|
790
|
+
optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
|
|
791
|
+
optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
|
|
792
|
+
optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
|
|
793
|
+
optimal_rotation[8] * cross_covariance[8];
|
|
800
794
|
}
|
|
801
795
|
if (rotation)
|
|
802
|
-
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
796
|
+
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)optimal_rotation[j];
|
|
797
|
+
// Folded SSD via trace identity: SSD = ‖a-ā‖² + ‖b-b̄‖² − 2·trace(R · H_centered).
|
|
798
|
+
nk_f64_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0 * trace_rotation_covariance;
|
|
799
|
+
if (sum_squared < 0.0) sum_squared = 0.0;
|
|
800
|
+
*result = nk_f64_sqrt_rvv(sum_squared / (nk_f64_t)points_count);
|
|
806
801
|
}
|
|
807
802
|
|
|
808
803
|
NK_PUBLIC void nk_kabsch_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
|
|
809
804
|
nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
|
|
810
805
|
if (scale) *scale = 1.0;
|
|
811
806
|
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
812
|
-
nk_f64_t
|
|
807
|
+
nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
|
|
808
|
+
nk_f64_t cross_covariance[9];
|
|
813
809
|
nk_centroid_and_cross_covariance_f64_rvv_(a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z,
|
|
814
|
-
&centroid_b_x, &centroid_b_y, &centroid_b_z,
|
|
810
|
+
&centroid_b_x, &centroid_b_y, &centroid_b_z, cross_covariance,
|
|
811
|
+
&centered_norm_squared_a, &centered_norm_squared_b);
|
|
815
812
|
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
816
813
|
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
nk_f64_t
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
814
|
+
|
|
815
|
+
// Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
|
|
816
|
+
nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
|
|
817
|
+
cross_covariance[4] * cross_covariance[4] +
|
|
818
|
+
cross_covariance[8] * cross_covariance[8];
|
|
819
|
+
nk_f64_t covariance_offdiagonal_norm_squared =
|
|
820
|
+
cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
|
|
821
|
+
cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
|
|
822
|
+
cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
|
|
823
|
+
nk_f64_t optimal_rotation[9];
|
|
824
|
+
nk_f64_t trace_rotation_covariance;
|
|
825
|
+
if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
|
|
826
|
+
cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
|
|
827
|
+
optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
|
|
828
|
+
optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
|
|
829
|
+
optimal_rotation[8] = 1;
|
|
830
|
+
trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
|
|
831
|
+
}
|
|
832
|
+
else {
|
|
833
|
+
nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
|
|
834
|
+
nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
|
|
835
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
836
|
+
if (nk_det3x3_f64_(optimal_rotation) < 0) {
|
|
837
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
|
|
838
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
839
|
+
}
|
|
840
|
+
trace_rotation_covariance =
|
|
841
|
+
optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
|
|
842
|
+
optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
|
|
843
|
+
optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
|
|
844
|
+
optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
|
|
845
|
+
optimal_rotation[8] * cross_covariance[8];
|
|
824
846
|
}
|
|
825
847
|
if (rotation)
|
|
826
|
-
for (int j = 0; j < 9; ++j) rotation[j] =
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
848
|
+
for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
|
|
849
|
+
// Folded SSD via trace identity: SSD = ‖a-ā‖² + ‖b-b̄‖² − 2·trace(R · H_centered).
|
|
850
|
+
nk_f64_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0 * trace_rotation_covariance;
|
|
851
|
+
if (sum_squared < 0.0) sum_squared = 0.0;
|
|
852
|
+
*result = nk_f64_sqrt_rvv(sum_squared / (nk_f64_t)points_count);
|
|
830
853
|
}
|
|
831
854
|
|
|
832
855
|
NK_PUBLIC void nk_umeyama_f32_rvv(nk_f32_t const *a, nk_f32_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|
|
833
856
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
|
|
834
857
|
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
835
|
-
nk_f64_t
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
858
|
+
nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
|
|
859
|
+
nk_f64_t cross_covariance[9];
|
|
860
|
+
nk_centroid_and_cross_covariance_and_variance_f32_rvv_(
|
|
861
|
+
a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
|
|
862
|
+
cross_covariance, &centered_norm_squared_a, &centered_norm_squared_b);
|
|
839
863
|
if (a_centroid)
|
|
840
864
|
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
841
865
|
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
842
866
|
if (b_centroid)
|
|
843
867
|
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
844
868
|
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
nk_f64_t
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
nk_f64_t
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
869
|
+
|
|
870
|
+
// Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
|
|
871
|
+
nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
|
|
872
|
+
cross_covariance[4] * cross_covariance[4] +
|
|
873
|
+
cross_covariance[8] * cross_covariance[8];
|
|
874
|
+
nk_f64_t covariance_offdiagonal_norm_squared =
|
|
875
|
+
cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
|
|
876
|
+
cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
|
|
877
|
+
cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
|
|
878
|
+
nk_f64_t optimal_rotation[9];
|
|
879
|
+
nk_f64_t trace_rotation_covariance;
|
|
880
|
+
nk_f64_t scale_factor;
|
|
881
|
+
if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
|
|
882
|
+
cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
|
|
883
|
+
optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
|
|
884
|
+
optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
|
|
885
|
+
optimal_rotation[8] = 1;
|
|
886
|
+
trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
|
|
887
|
+
scale_factor = centered_norm_squared_a > 0.0 ? trace_rotation_covariance / centered_norm_squared_a : 0.0;
|
|
888
|
+
}
|
|
889
|
+
else {
|
|
890
|
+
nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
|
|
891
|
+
nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
|
|
892
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
893
|
+
nk_f64_t det = nk_det3x3_f64_(optimal_rotation);
|
|
894
|
+
nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
|
|
895
|
+
nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_diagonal[0], 1.0, svd_diagonal[4], 1.0, svd_diagonal[8],
|
|
896
|
+
sign_det);
|
|
897
|
+
scale_factor = centered_norm_squared_a > 0.0 ? trace_ds / centered_norm_squared_a : 0.0;
|
|
898
|
+
if (det < 0) {
|
|
899
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
|
|
900
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
901
|
+
}
|
|
902
|
+
trace_rotation_covariance =
|
|
903
|
+
optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
|
|
904
|
+
optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
|
|
905
|
+
optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
|
|
906
|
+
optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
|
|
907
|
+
optimal_rotation[8] * cross_covariance[8];
|
|
857
908
|
}
|
|
909
|
+
if (scale) *scale = (nk_f32_t)scale_factor;
|
|
858
910
|
if (rotation)
|
|
859
|
-
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
911
|
+
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)optimal_rotation[j];
|
|
912
|
+
// Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² − 2c·trace(R · H_centered).
|
|
913
|
+
nk_f64_t sum_squared = scale_factor * scale_factor * centered_norm_squared_a + centered_norm_squared_b -
|
|
914
|
+
2.0 * scale_factor * trace_rotation_covariance;
|
|
915
|
+
if (sum_squared < 0.0) sum_squared = 0.0;
|
|
916
|
+
*result = nk_f64_sqrt_rvv(sum_squared / (nk_f64_t)points_count);
|
|
863
917
|
}
|
|
864
918
|
|
|
865
919
|
NK_PUBLIC void nk_umeyama_f64_rvv(nk_f64_t const *a, nk_f64_t const *b, nk_size_t points_count, nk_f64_t *a_centroid,
|
|
866
920
|
nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
|
|
867
921
|
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
868
|
-
nk_f64_t
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
922
|
+
nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
|
|
923
|
+
nk_f64_t cross_covariance[9];
|
|
924
|
+
nk_centroid_and_cross_covariance_and_variance_f64_rvv_(
|
|
925
|
+
a, b, points_count, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y, &centroid_b_z,
|
|
926
|
+
cross_covariance, &centered_norm_squared_a, &centered_norm_squared_b);
|
|
872
927
|
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
873
928
|
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
nk_f64_t
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
nk_f64_t
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
929
|
+
|
|
930
|
+
// Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
|
|
931
|
+
nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
|
|
932
|
+
cross_covariance[4] * cross_covariance[4] +
|
|
933
|
+
cross_covariance[8] * cross_covariance[8];
|
|
934
|
+
nk_f64_t covariance_offdiagonal_norm_squared =
|
|
935
|
+
cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
|
|
936
|
+
cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
|
|
937
|
+
cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
|
|
938
|
+
nk_f64_t optimal_rotation[9];
|
|
939
|
+
nk_f64_t trace_rotation_covariance;
|
|
940
|
+
nk_f64_t scale_factor;
|
|
941
|
+
if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
|
|
942
|
+
cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
|
|
943
|
+
optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
|
|
944
|
+
optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
|
|
945
|
+
optimal_rotation[8] = 1;
|
|
946
|
+
trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
|
|
947
|
+
scale_factor = centered_norm_squared_a > 0.0 ? trace_rotation_covariance / centered_norm_squared_a : 0.0;
|
|
886
948
|
}
|
|
949
|
+
else {
|
|
950
|
+
nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
|
|
951
|
+
nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
|
|
952
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
953
|
+
nk_f64_t det = nk_det3x3_f64_(optimal_rotation);
|
|
954
|
+
nk_f64_t sign_det = det < 0 ? -1.0 : 1.0;
|
|
955
|
+
nk_f64_t trace_ds = nk_sum_three_products_f64_(svd_diagonal[0], 1.0, svd_diagonal[4], 1.0, svd_diagonal[8],
|
|
956
|
+
sign_det);
|
|
957
|
+
scale_factor = centered_norm_squared_a > 0.0 ? trace_ds / centered_norm_squared_a : 0.0;
|
|
958
|
+
if (det < 0) {
|
|
959
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
|
|
960
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
961
|
+
}
|
|
962
|
+
trace_rotation_covariance =
|
|
963
|
+
optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
|
|
964
|
+
optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
|
|
965
|
+
optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
|
|
966
|
+
optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
|
|
967
|
+
optimal_rotation[8] * cross_covariance[8];
|
|
968
|
+
}
|
|
969
|
+
if (scale) *scale = scale_factor;
|
|
887
970
|
if (rotation)
|
|
888
|
-
for (int j = 0; j < 9; ++j) rotation[j] =
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
971
|
+
for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
|
|
972
|
+
// Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² − 2c·trace(R · H_centered).
|
|
973
|
+
nk_f64_t sum_squared = scale_factor * scale_factor * centered_norm_squared_a + centered_norm_squared_b -
|
|
974
|
+
2.0 * scale_factor * trace_rotation_covariance;
|
|
975
|
+
if (sum_squared < 0.0) sum_squared = 0.0;
|
|
976
|
+
*result = nk_f64_sqrt_rvv(sum_squared / (nk_f64_t)points_count);
|
|
892
977
|
}
|
|
893
978
|
|
|
894
979
|
NK_PUBLIC void nk_rmsd_f16_rvv(nk_f16_t const *a, nk_f16_t const *b, nk_size_t points_count, nk_f32_t *a_centroid,
|