numkong 7.5.0 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +18 -0
- package/c/dispatch_e5m2.c +23 -3
- package/include/numkong/capabilities.h +1 -1
- package/include/numkong/cast/README.md +3 -0
- package/include/numkong/cast/haswell.h +28 -64
- package/include/numkong/cast/serial.h +17 -0
- package/include/numkong/cast/skylake.h +67 -52
- package/include/numkong/cast.h +1 -0
- package/include/numkong/dot/README.md +1 -0
- package/include/numkong/dot/haswell.h +92 -13
- package/include/numkong/dot/serial.h +15 -0
- package/include/numkong/dot/skylake.h +61 -14
- package/include/numkong/dots/README.md +2 -0
- package/include/numkong/dots/graniteamx.h +434 -0
- package/include/numkong/dots/haswell.h +28 -28
- package/include/numkong/dots/sapphireamx.h +1 -1
- package/include/numkong/dots/serial.h +23 -8
- package/include/numkong/dots/skylake.h +28 -23
- package/include/numkong/dots.h +12 -0
- package/include/numkong/each/serial.h +18 -1
- package/include/numkong/geospatial/serial.h +14 -3
- package/include/numkong/maxsim/serial.h +15 -0
- package/include/numkong/mesh/README.md +50 -44
- package/include/numkong/mesh/genoa.h +462 -0
- package/include/numkong/mesh/haswell.h +806 -933
- package/include/numkong/mesh/neon.h +871 -943
- package/include/numkong/mesh/neonbfdot.h +382 -522
- package/include/numkong/mesh/neonfhm.h +676 -0
- package/include/numkong/mesh/rvv.h +404 -319
- package/include/numkong/mesh/serial.h +204 -162
- package/include/numkong/mesh/skylake.h +1029 -1585
- package/include/numkong/mesh/v128relaxed.h +403 -377
- package/include/numkong/mesh.h +38 -0
- package/include/numkong/reduce/serial.h +15 -1
- package/include/numkong/sparse/serial.h +17 -2
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +98 -56
- package/include/numkong/spatial/serial.h +15 -0
- package/include/numkong/spatial/skylake.h +114 -54
- package/include/numkong/spatial.h +0 -12
- package/include/numkong/spatials/graniteamx.h +128 -0
- package/include/numkong/spatials/serial.h +18 -1
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials.h +17 -0
- package/include/numkong/tensor.hpp +107 -23
- package/javascript/numkong.c +3 -2
- package/package.json +7 -7
- package/wasm/numkong.wasm +0 -0
|
@@ -101,11 +101,6 @@ NK_INTERNAL nk_f64_t nk_reduce_stable_f64x2_v128relaxed_(v128_t values_f64x2) {
|
|
|
101
101
|
return sum + compensation;
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
-
NK_INTERNAL void nk_rotation_from_svd_f64_v128relaxed_(nk_f64_t const *svd_u, nk_f64_t const *svd_v,
|
|
105
|
-
nk_f64_t *rotation) {
|
|
106
|
-
nk_rotation_from_svd_f64_serial_(svd_u, svd_v, rotation);
|
|
107
|
-
}
|
|
108
|
-
|
|
109
104
|
NK_INTERNAL void nk_accumulate_square_f64x2_v128relaxed_(v128_t *sum_f64x2, v128_t *compensation_f64x2,
|
|
110
105
|
v128_t values_f64x2) {
|
|
111
106
|
v128_t product_f64x2 = wasm_f64x2_mul(values_f64x2, values_f64x2);
|
|
@@ -124,7 +119,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
|
124
119
|
nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
|
|
125
120
|
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
126
121
|
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
127
|
-
nk_f64_t
|
|
122
|
+
nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
|
|
128
123
|
v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
|
|
129
124
|
v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
|
|
130
125
|
v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
|
|
@@ -141,6 +136,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
|
141
136
|
v128_t cross_20_low_f64x2 = zero_f64x2, cross_20_high_f64x2 = zero_f64x2;
|
|
142
137
|
v128_t cross_21_low_f64x2 = zero_f64x2, cross_21_high_f64x2 = zero_f64x2;
|
|
143
138
|
v128_t cross_22_low_f64x2 = zero_f64x2, cross_22_high_f64x2 = zero_f64x2;
|
|
139
|
+
v128_t norm_squared_a_low_f64x2 = zero_f64x2, norm_squared_a_high_f64x2 = zero_f64x2;
|
|
140
|
+
v128_t norm_squared_b_low_f64x2 = zero_f64x2, norm_squared_b_high_f64x2 = zero_f64x2;
|
|
144
141
|
nk_size_t index = 0;
|
|
145
142
|
|
|
146
143
|
for (; index + 4 <= n; index += 4) {
|
|
@@ -192,6 +189,19 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
|
192
189
|
cross_21_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_y_high_f64x2, cross_21_high_f64x2);
|
|
193
190
|
cross_22_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_z_low_f64x2, cross_22_low_f64x2),
|
|
194
191
|
cross_22_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_z_high_f64x2, cross_22_high_f64x2);
|
|
192
|
+
|
|
193
|
+
norm_squared_a_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, a_x_low_f64x2, norm_squared_a_low_f64x2);
|
|
194
|
+
norm_squared_a_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, a_x_high_f64x2, norm_squared_a_high_f64x2);
|
|
195
|
+
norm_squared_a_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, a_y_low_f64x2, norm_squared_a_low_f64x2);
|
|
196
|
+
norm_squared_a_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, a_y_high_f64x2, norm_squared_a_high_f64x2);
|
|
197
|
+
norm_squared_a_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, a_z_low_f64x2, norm_squared_a_low_f64x2);
|
|
198
|
+
norm_squared_a_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, a_z_high_f64x2, norm_squared_a_high_f64x2);
|
|
199
|
+
norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_x_low_f64x2, b_x_low_f64x2, norm_squared_b_low_f64x2);
|
|
200
|
+
norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_x_high_f64x2, b_x_high_f64x2, norm_squared_b_high_f64x2);
|
|
201
|
+
norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_y_low_f64x2, b_y_low_f64x2, norm_squared_b_low_f64x2);
|
|
202
|
+
norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_y_high_f64x2, b_y_high_f64x2, norm_squared_b_high_f64x2);
|
|
203
|
+
norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_z_low_f64x2, b_z_low_f64x2, norm_squared_b_low_f64x2);
|
|
204
|
+
norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_z_high_f64x2, b_z_high_f64x2, norm_squared_b_high_f64x2);
|
|
195
205
|
}
|
|
196
206
|
|
|
197
207
|
nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
|
|
@@ -209,6 +219,10 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
|
209
219
|
nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_low_f64x2, cross_20_high_f64x2));
|
|
210
220
|
nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_low_f64x2, cross_21_high_f64x2));
|
|
211
221
|
nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_low_f64x2, cross_22_high_f64x2));
|
|
222
|
+
nk_f64_t norm_squared_a_sum = nk_hsum_f64x2_v128relaxed_(
|
|
223
|
+
wasm_f64x2_add(norm_squared_a_low_f64x2, norm_squared_a_high_f64x2));
|
|
224
|
+
nk_f64_t norm_squared_b_sum = nk_hsum_f64x2_v128relaxed_(
|
|
225
|
+
wasm_f64x2_add(norm_squared_b_low_f64x2, norm_squared_b_high_f64x2));
|
|
212
226
|
|
|
213
227
|
for (; index < n; ++index) {
|
|
214
228
|
nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
|
|
@@ -218,6 +232,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
|
218
232
|
cross_00 += a_x * b_x, cross_01 += a_x * b_y, cross_02 += a_x * b_z;
|
|
219
233
|
cross_10 += a_y * b_x, cross_11 += a_y * b_y, cross_12 += a_y * b_z;
|
|
220
234
|
cross_20 += a_z * b_x, cross_21 += a_z * b_y, cross_22 += a_z * b_z;
|
|
235
|
+
norm_squared_a_sum += a_x * a_x + a_y * a_y + a_z * a_z;
|
|
236
|
+
norm_squared_b_sum += b_x * b_x + b_y * b_y + b_z * b_z;
|
|
221
237
|
}
|
|
222
238
|
|
|
223
239
|
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
@@ -227,22 +243,31 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
|
|
|
227
243
|
*centroid_b_z = sum_b_z * inv_points_count;
|
|
228
244
|
|
|
229
245
|
nk_f64_t n_f64 = (nk_f64_t)n;
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
246
|
+
cross_covariance[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
|
|
247
|
+
cross_covariance[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
|
|
248
|
+
cross_covariance[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
|
|
249
|
+
cross_covariance[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
|
|
250
|
+
cross_covariance[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
|
|
251
|
+
cross_covariance[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
|
|
252
|
+
cross_covariance[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
|
|
253
|
+
cross_covariance[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
|
|
254
|
+
cross_covariance[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
|
|
255
|
+
|
|
256
|
+
*centered_norm_squared_a = norm_squared_a_sum -
|
|
257
|
+
n_f64 * ((*centroid_a_x) * (*centroid_a_x) + (*centroid_a_y) * (*centroid_a_y) +
|
|
258
|
+
(*centroid_a_z) * (*centroid_a_z));
|
|
259
|
+
*centered_norm_squared_b = norm_squared_b_sum -
|
|
260
|
+
n_f64 * ((*centroid_b_x) * (*centroid_b_x) + (*centroid_b_y) * (*centroid_b_y) +
|
|
261
|
+
(*centroid_b_z) * (*centroid_b_z));
|
|
262
|
+
if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
|
|
263
|
+
if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
|
|
239
264
|
}
|
|
240
265
|
|
|
241
266
|
NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_( //
|
|
242
267
|
nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
|
|
243
268
|
nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
|
|
244
269
|
nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
|
|
245
|
-
nk_f64_t
|
|
270
|
+
nk_f64_t cross_covariance[9], nk_f64_t *centered_norm_squared_a, nk_f64_t *centered_norm_squared_b) {
|
|
246
271
|
v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
|
|
247
272
|
v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
|
|
248
273
|
v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
|
|
@@ -250,6 +275,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
|
|
|
250
275
|
v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
|
|
251
276
|
v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
|
|
252
277
|
v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
|
|
278
|
+
v128_t norm_squared_b_low_f64x2 = zero_f64x2, norm_squared_b_high_f64x2 = zero_f64x2;
|
|
253
279
|
v128_t cross_00_low_f64x2 = zero_f64x2, cross_00_high_f64x2 = zero_f64x2;
|
|
254
280
|
v128_t cross_01_low_f64x2 = zero_f64x2, cross_01_high_f64x2 = zero_f64x2;
|
|
255
281
|
v128_t cross_02_low_f64x2 = zero_f64x2, cross_02_high_f64x2 = zero_f64x2;
|
|
@@ -312,7 +338,7 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
|
|
|
312
338
|
cross_22_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_z_low_f64x2, cross_22_low_f64x2),
|
|
313
339
|
cross_22_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_z_high_f64x2, cross_22_high_f64x2);
|
|
314
340
|
|
|
315
|
-
//
|
|
341
|
+
// Norm-squared accumulators for both point sets (used for folded SSD).
|
|
316
342
|
v128_t norm_squared_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, a_y_low_f64x2,
|
|
317
343
|
wasm_f64x2_mul(a_x_low_f64x2, a_x_low_f64x2));
|
|
318
344
|
v128_t norm_squared_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, a_y_high_f64x2,
|
|
@@ -321,6 +347,13 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
|
|
|
321
347
|
norm_squared_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, a_z_high_f64x2, norm_squared_high_f64x2);
|
|
322
348
|
sum_norm_squared_low_f64x2 = wasm_f64x2_add(sum_norm_squared_low_f64x2, norm_squared_low_f64x2);
|
|
323
349
|
sum_norm_squared_high_f64x2 = wasm_f64x2_add(sum_norm_squared_high_f64x2, norm_squared_high_f64x2);
|
|
350
|
+
|
|
351
|
+
norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_x_low_f64x2, b_x_low_f64x2, norm_squared_b_low_f64x2);
|
|
352
|
+
norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_x_high_f64x2, b_x_high_f64x2, norm_squared_b_high_f64x2);
|
|
353
|
+
norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_y_low_f64x2, b_y_low_f64x2, norm_squared_b_low_f64x2);
|
|
354
|
+
norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_y_high_f64x2, b_y_high_f64x2, norm_squared_b_high_f64x2);
|
|
355
|
+
norm_squared_b_low_f64x2 = wasm_f64x2_relaxed_madd(b_z_low_f64x2, b_z_low_f64x2, norm_squared_b_low_f64x2);
|
|
356
|
+
norm_squared_b_high_f64x2 = wasm_f64x2_relaxed_madd(b_z_high_f64x2, b_z_high_f64x2, norm_squared_b_high_f64x2);
|
|
324
357
|
}
|
|
325
358
|
|
|
326
359
|
nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
|
|
@@ -338,8 +371,10 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
|
|
|
338
371
|
nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_low_f64x2, cross_20_high_f64x2));
|
|
339
372
|
nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_low_f64x2, cross_21_high_f64x2));
|
|
340
373
|
nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_low_f64x2, cross_22_high_f64x2));
|
|
341
|
-
nk_f64_t
|
|
374
|
+
nk_f64_t norm_squared_a_sum = nk_hsum_f64x2_v128relaxed_(
|
|
342
375
|
wasm_f64x2_add(sum_norm_squared_low_f64x2, sum_norm_squared_high_f64x2));
|
|
376
|
+
nk_f64_t norm_squared_b_sum = nk_hsum_f64x2_v128relaxed_(
|
|
377
|
+
wasm_f64x2_add(norm_squared_b_low_f64x2, norm_squared_b_high_f64x2));
|
|
343
378
|
|
|
344
379
|
for (; index < n; ++index) {
|
|
345
380
|
nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
|
|
@@ -349,7 +384,8 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
|
|
|
349
384
|
cross_00 += a_x * b_x, cross_01 += a_x * b_y, cross_02 += a_x * b_z;
|
|
350
385
|
cross_10 += a_y * b_x, cross_11 += a_y * b_y, cross_12 += a_y * b_z;
|
|
351
386
|
cross_20 += a_z * b_x, cross_21 += a_z * b_y, cross_22 += a_z * b_z;
|
|
352
|
-
|
|
387
|
+
norm_squared_a_sum += a_x * a_x + a_y * a_y + a_z * a_z;
|
|
388
|
+
norm_squared_b_sum += b_x * b_x + b_y * b_y + b_z * b_z;
|
|
353
389
|
}
|
|
354
390
|
|
|
355
391
|
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
@@ -359,209 +395,24 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_(
|
|
|
359
395
|
*centroid_b_z = sum_b_z * inv_points_count;
|
|
360
396
|
|
|
361
397
|
nk_f64_t n_f64 = (nk_f64_t)n;
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
v128_t scaled_rotation_x_y_f64x2 = wasm_f64x2_splat(scale * r[1]);
|
|
381
|
-
v128_t scaled_rotation_x_z_f64x2 = wasm_f64x2_splat(scale * r[2]);
|
|
382
|
-
v128_t scaled_rotation_y_x_f64x2 = wasm_f64x2_splat(scale * r[3]);
|
|
383
|
-
v128_t scaled_rotation_y_y_f64x2 = wasm_f64x2_splat(scale * r[4]);
|
|
384
|
-
v128_t scaled_rotation_y_z_f64x2 = wasm_f64x2_splat(scale * r[5]);
|
|
385
|
-
v128_t scaled_rotation_z_x_f64x2 = wasm_f64x2_splat(scale * r[6]);
|
|
386
|
-
v128_t scaled_rotation_z_y_f64x2 = wasm_f64x2_splat(scale * r[7]);
|
|
387
|
-
v128_t scaled_rotation_z_z_f64x2 = wasm_f64x2_splat(scale * r[8]);
|
|
388
|
-
v128_t centroid_a_x_f64x2 = wasm_f64x2_splat(centroid_a_x), centroid_a_y_f64x2 = wasm_f64x2_splat(centroid_a_y);
|
|
389
|
-
v128_t centroid_a_z_f64x2 = wasm_f64x2_splat(centroid_a_z), centroid_b_x_f64x2 = wasm_f64x2_splat(centroid_b_x);
|
|
390
|
-
v128_t centroid_b_y_f64x2 = wasm_f64x2_splat(centroid_b_y), centroid_b_z_f64x2 = wasm_f64x2_splat(centroid_b_z);
|
|
391
|
-
v128_t sum_squared_low_f64x2 = wasm_f64x2_splat(0.0), sum_squared_high_f64x2 = wasm_f64x2_splat(0.0);
|
|
392
|
-
nk_size_t index = 0;
|
|
393
|
-
|
|
394
|
-
for (; index + 4 <= n; index += 4) {
|
|
395
|
-
v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
|
|
396
|
-
nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
|
|
397
|
-
nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
|
|
398
|
-
|
|
399
|
-
v128_t centered_a_x_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_x_f32x4), centroid_a_x_f64x2);
|
|
400
|
-
v128_t centered_a_x_high_f64x2 = wasm_f64x2_sub(
|
|
401
|
-
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1)), centroid_a_x_f64x2);
|
|
402
|
-
v128_t centered_a_y_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_y_f32x4), centroid_a_y_f64x2);
|
|
403
|
-
v128_t centered_a_y_high_f64x2 = wasm_f64x2_sub(
|
|
404
|
-
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1)), centroid_a_y_f64x2);
|
|
405
|
-
v128_t centered_a_z_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_z_f32x4), centroid_a_z_f64x2);
|
|
406
|
-
v128_t centered_a_z_high_f64x2 = wasm_f64x2_sub(
|
|
407
|
-
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1)), centroid_a_z_f64x2);
|
|
408
|
-
v128_t centered_b_x_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_x_f32x4), centroid_b_x_f64x2);
|
|
409
|
-
v128_t centered_b_x_high_f64x2 = wasm_f64x2_sub(
|
|
410
|
-
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1)), centroid_b_x_f64x2);
|
|
411
|
-
v128_t centered_b_y_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_y_f32x4), centroid_b_y_f64x2);
|
|
412
|
-
v128_t centered_b_y_high_f64x2 = wasm_f64x2_sub(
|
|
413
|
-
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1)), centroid_b_y_f64x2);
|
|
414
|
-
v128_t centered_b_z_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_z_f32x4), centroid_b_z_f64x2);
|
|
415
|
-
v128_t centered_b_z_high_f64x2 = wasm_f64x2_sub(
|
|
416
|
-
wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1)), centroid_b_z_f64x2);
|
|
417
|
-
|
|
418
|
-
v128_t rotated_a_x_low_f64x2 = wasm_f64x2_relaxed_madd(
|
|
419
|
-
scaled_rotation_x_z_f64x2, centered_a_z_low_f64x2,
|
|
420
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_low_f64x2,
|
|
421
|
-
wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_low_f64x2)));
|
|
422
|
-
v128_t rotated_a_x_high_f64x2 = wasm_f64x2_relaxed_madd(
|
|
423
|
-
scaled_rotation_x_z_f64x2, centered_a_z_high_f64x2,
|
|
424
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_high_f64x2,
|
|
425
|
-
wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_high_f64x2)));
|
|
426
|
-
v128_t rotated_a_y_low_f64x2 = wasm_f64x2_relaxed_madd(
|
|
427
|
-
scaled_rotation_y_z_f64x2, centered_a_z_low_f64x2,
|
|
428
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_low_f64x2,
|
|
429
|
-
wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_low_f64x2)));
|
|
430
|
-
v128_t rotated_a_y_high_f64x2 = wasm_f64x2_relaxed_madd(
|
|
431
|
-
scaled_rotation_y_z_f64x2, centered_a_z_high_f64x2,
|
|
432
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_high_f64x2,
|
|
433
|
-
wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_high_f64x2)));
|
|
434
|
-
v128_t rotated_a_z_low_f64x2 = wasm_f64x2_relaxed_madd(
|
|
435
|
-
scaled_rotation_z_z_f64x2, centered_a_z_low_f64x2,
|
|
436
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_low_f64x2,
|
|
437
|
-
wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_low_f64x2)));
|
|
438
|
-
v128_t rotated_a_z_high_f64x2 = wasm_f64x2_relaxed_madd(
|
|
439
|
-
scaled_rotation_z_z_f64x2, centered_a_z_high_f64x2,
|
|
440
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_high_f64x2,
|
|
441
|
-
wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_high_f64x2)));
|
|
442
|
-
|
|
443
|
-
v128_t delta_x_low_f64x2 = wasm_f64x2_sub(rotated_a_x_low_f64x2, centered_b_x_low_f64x2);
|
|
444
|
-
v128_t delta_x_high_f64x2 = wasm_f64x2_sub(rotated_a_x_high_f64x2, centered_b_x_high_f64x2);
|
|
445
|
-
v128_t delta_y_low_f64x2 = wasm_f64x2_sub(rotated_a_y_low_f64x2, centered_b_y_low_f64x2);
|
|
446
|
-
v128_t delta_y_high_f64x2 = wasm_f64x2_sub(rotated_a_y_high_f64x2, centered_b_y_high_f64x2);
|
|
447
|
-
v128_t delta_z_low_f64x2 = wasm_f64x2_sub(rotated_a_z_low_f64x2, centered_b_z_low_f64x2);
|
|
448
|
-
v128_t delta_z_high_f64x2 = wasm_f64x2_sub(rotated_a_z_high_f64x2, centered_b_z_high_f64x2);
|
|
449
|
-
|
|
450
|
-
sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_x_low_f64x2, delta_x_low_f64x2, sum_squared_low_f64x2);
|
|
451
|
-
sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_x_high_f64x2, delta_x_high_f64x2,
|
|
452
|
-
sum_squared_high_f64x2);
|
|
453
|
-
sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_y_low_f64x2, delta_y_low_f64x2, sum_squared_low_f64x2);
|
|
454
|
-
sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_y_high_f64x2, delta_y_high_f64x2,
|
|
455
|
-
sum_squared_high_f64x2);
|
|
456
|
-
sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_z_low_f64x2, delta_z_low_f64x2, sum_squared_low_f64x2);
|
|
457
|
-
sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_z_high_f64x2, delta_z_high_f64x2,
|
|
458
|
-
sum_squared_high_f64x2);
|
|
459
|
-
}
|
|
460
|
-
|
|
461
|
-
nk_f64_t sum_squared = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_squared_low_f64x2, sum_squared_high_f64x2));
|
|
462
|
-
for (; index < n; ++index) {
|
|
463
|
-
nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x,
|
|
464
|
-
centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y,
|
|
465
|
-
centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
|
|
466
|
-
nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x,
|
|
467
|
-
centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y,
|
|
468
|
-
centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
|
|
469
|
-
nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z),
|
|
470
|
-
rotated_a_y = scale * (r[3] * centered_a_x + r[4] * centered_a_y + r[5] * centered_a_z),
|
|
471
|
-
rotated_a_z = scale * (r[6] * centered_a_x + r[7] * centered_a_y + r[8] * centered_a_z);
|
|
472
|
-
nk_f64_t delta_x = rotated_a_x - centered_b_x, delta_y = rotated_a_y - centered_b_y,
|
|
473
|
-
delta_z = rotated_a_z - centered_b_z;
|
|
474
|
-
sum_squared += delta_x * delta_x + delta_y * delta_y + delta_z * delta_z;
|
|
475
|
-
}
|
|
476
|
-
|
|
477
|
-
return sum_squared;
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
/* Compute sum of squared distances for f64 after applying rotation (and optional scale). */
|
|
481
|
-
NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_v128relaxed_(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
|
|
482
|
-
nk_f64_t const *r, nk_f64_t scale, nk_f64_t centroid_a_x,
|
|
483
|
-
nk_f64_t centroid_a_y, nk_f64_t centroid_a_z,
|
|
484
|
-
nk_f64_t centroid_b_x, nk_f64_t centroid_b_y,
|
|
485
|
-
nk_f64_t centroid_b_z) {
|
|
486
|
-
// Broadcast scaled rotation matrix elements
|
|
487
|
-
v128_t scaled_rotation_x_x_f64x2 = wasm_f64x2_splat(scale * r[0]);
|
|
488
|
-
v128_t scaled_rotation_x_y_f64x2 = wasm_f64x2_splat(scale * r[1]);
|
|
489
|
-
v128_t scaled_rotation_x_z_f64x2 = wasm_f64x2_splat(scale * r[2]);
|
|
490
|
-
v128_t scaled_rotation_y_x_f64x2 = wasm_f64x2_splat(scale * r[3]);
|
|
491
|
-
v128_t scaled_rotation_y_y_f64x2 = wasm_f64x2_splat(scale * r[4]);
|
|
492
|
-
v128_t scaled_rotation_y_z_f64x2 = wasm_f64x2_splat(scale * r[5]);
|
|
493
|
-
v128_t scaled_rotation_z_x_f64x2 = wasm_f64x2_splat(scale * r[6]);
|
|
494
|
-
v128_t scaled_rotation_z_y_f64x2 = wasm_f64x2_splat(scale * r[7]);
|
|
495
|
-
v128_t scaled_rotation_z_z_f64x2 = wasm_f64x2_splat(scale * r[8]);
|
|
496
|
-
|
|
497
|
-
// Broadcast centroids
|
|
498
|
-
v128_t centroid_a_x_f64x2 = wasm_f64x2_splat(centroid_a_x);
|
|
499
|
-
v128_t centroid_a_y_f64x2 = wasm_f64x2_splat(centroid_a_y);
|
|
500
|
-
v128_t centroid_a_z_f64x2 = wasm_f64x2_splat(centroid_a_z);
|
|
501
|
-
v128_t centroid_b_x_f64x2 = wasm_f64x2_splat(centroid_b_x);
|
|
502
|
-
v128_t centroid_b_y_f64x2 = wasm_f64x2_splat(centroid_b_y);
|
|
503
|
-
v128_t centroid_b_z_f64x2 = wasm_f64x2_splat(centroid_b_z);
|
|
504
|
-
|
|
505
|
-
v128_t sum_squared_f64x2 = wasm_f64x2_splat(0), sum_squared_compensation_f64x2 = wasm_f64x2_splat(0);
|
|
506
|
-
nk_size_t j = 0;
|
|
507
|
-
|
|
508
|
-
// Main loop: process 2 points per iteration
|
|
509
|
-
for (; j + 2 <= n; j += 2) {
|
|
510
|
-
v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
|
|
511
|
-
nk_deinterleave_f64x2_v128relaxed_(a + j * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
|
|
512
|
-
nk_deinterleave_f64x2_v128relaxed_(b + j * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
|
|
513
|
-
|
|
514
|
-
v128_t centered_a_x_f64x2 = wasm_f64x2_sub(a_x_f64x2, centroid_a_x_f64x2);
|
|
515
|
-
v128_t centered_a_y_f64x2 = wasm_f64x2_sub(a_y_f64x2, centroid_a_y_f64x2);
|
|
516
|
-
v128_t centered_a_z_f64x2 = wasm_f64x2_sub(a_z_f64x2, centroid_a_z_f64x2);
|
|
517
|
-
v128_t centered_b_x_f64x2 = wasm_f64x2_sub(b_x_f64x2, centroid_b_x_f64x2);
|
|
518
|
-
v128_t centered_b_y_f64x2 = wasm_f64x2_sub(b_y_f64x2, centroid_b_y_f64x2);
|
|
519
|
-
v128_t centered_b_z_f64x2 = wasm_f64x2_sub(b_z_f64x2, centroid_b_z_f64x2);
|
|
520
|
-
|
|
521
|
-
// Rotate and scale: ra = scale * R * pa
|
|
522
|
-
v128_t rotated_a_x_f64x2 = wasm_f64x2_relaxed_madd(
|
|
523
|
-
scaled_rotation_x_z_f64x2, centered_a_z_f64x2,
|
|
524
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_f64x2,
|
|
525
|
-
wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_f64x2)));
|
|
526
|
-
v128_t rotated_a_y_f64x2 = wasm_f64x2_relaxed_madd(
|
|
527
|
-
scaled_rotation_y_z_f64x2, centered_a_z_f64x2,
|
|
528
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_f64x2,
|
|
529
|
-
wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_f64x2)));
|
|
530
|
-
v128_t rotated_a_z_f64x2 = wasm_f64x2_relaxed_madd(
|
|
531
|
-
scaled_rotation_z_z_f64x2, centered_a_z_f64x2,
|
|
532
|
-
wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_f64x2,
|
|
533
|
-
wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_f64x2)));
|
|
534
|
-
|
|
535
|
-
v128_t delta_x_f64x2 = wasm_f64x2_sub(rotated_a_x_f64x2, centered_b_x_f64x2);
|
|
536
|
-
v128_t delta_y_f64x2 = wasm_f64x2_sub(rotated_a_y_f64x2, centered_b_y_f64x2);
|
|
537
|
-
v128_t delta_z_f64x2 = wasm_f64x2_sub(rotated_a_z_f64x2, centered_b_z_f64x2);
|
|
538
|
-
|
|
539
|
-
nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_x_f64x2);
|
|
540
|
-
nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_y_f64x2);
|
|
541
|
-
nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_z_f64x2);
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
nk_f64_t sum_squared = nk_dot_stable_sum_f64x2_v128relaxed_(sum_squared_f64x2, sum_squared_compensation_f64x2);
|
|
545
|
-
nk_f64_t sum_squared_compensation = 0.0;
|
|
546
|
-
|
|
547
|
-
// Scalar tail
|
|
548
|
-
for (; j < n; ++j) {
|
|
549
|
-
nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x, pa_y = a[j * 3 + 1] - centroid_a_y,
|
|
550
|
-
pa_z = a[j * 3 + 2] - centroid_a_z;
|
|
551
|
-
nk_f64_t pb_x = b[j * 3 + 0] - centroid_b_x, pb_y = b[j * 3 + 1] - centroid_b_y,
|
|
552
|
-
pb_z = b[j * 3 + 2] - centroid_b_z;
|
|
553
|
-
|
|
554
|
-
nk_f64_t ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z),
|
|
555
|
-
ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z),
|
|
556
|
-
ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);
|
|
557
|
-
|
|
558
|
-
nk_f64_t delta_x = ra_x - pb_x, delta_y = ra_y - pb_y, delta_z = ra_z - pb_z;
|
|
559
|
-
nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_x);
|
|
560
|
-
nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_y);
|
|
561
|
-
nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_z);
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
return sum_squared + sum_squared_compensation;
|
|
398
|
+
cross_covariance[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
|
|
399
|
+
cross_covariance[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
|
|
400
|
+
cross_covariance[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
|
|
401
|
+
cross_covariance[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
|
|
402
|
+
cross_covariance[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
|
|
403
|
+
cross_covariance[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
|
|
404
|
+
cross_covariance[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
|
|
405
|
+
cross_covariance[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
|
|
406
|
+
cross_covariance[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
|
|
407
|
+
|
|
408
|
+
*centered_norm_squared_a = norm_squared_a_sum -
|
|
409
|
+
n_f64 * ((*centroid_a_x) * (*centroid_a_x) + (*centroid_a_y) * (*centroid_a_y) +
|
|
410
|
+
(*centroid_a_z) * (*centroid_a_z));
|
|
411
|
+
*centered_norm_squared_b = norm_squared_b_sum -
|
|
412
|
+
n_f64 * ((*centroid_b_x) * (*centroid_b_x) + (*centroid_b_y) * (*centroid_b_y) +
|
|
413
|
+
(*centroid_b_z) * (*centroid_b_z));
|
|
414
|
+
if (*centered_norm_squared_a < 0.0) *centered_norm_squared_a = 0.0;
|
|
415
|
+
if (*centered_norm_squared_b < 0.0) *centered_norm_squared_b = 0.0;
|
|
565
416
|
}
|
|
566
417
|
|
|
567
418
|
NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
|
|
@@ -687,51 +538,79 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
|
|
|
687
538
|
NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
|
|
688
539
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
|
|
689
540
|
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
690
|
-
nk_f64_t
|
|
541
|
+
nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
|
|
542
|
+
nk_f64_t cross_covariance[9];
|
|
691
543
|
nk_centroid_and_cross_covariance_f32_v128relaxed_(a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z,
|
|
692
|
-
&centroid_b_x, &centroid_b_y, &centroid_b_z,
|
|
544
|
+
&centroid_b_x, &centroid_b_y, &centroid_b_z, cross_covariance,
|
|
545
|
+
&centered_norm_squared_a, &centered_norm_squared_b);
|
|
693
546
|
if (a_centroid)
|
|
694
547
|
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
695
548
|
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
696
549
|
if (b_centroid)
|
|
697
550
|
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
698
551
|
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
552
|
+
|
|
553
|
+
// Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
|
|
554
|
+
nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
|
|
555
|
+
cross_covariance[4] * cross_covariance[4] +
|
|
556
|
+
cross_covariance[8] * cross_covariance[8];
|
|
557
|
+
nk_f64_t covariance_offdiagonal_norm_squared =
|
|
558
|
+
cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
|
|
559
|
+
cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
|
|
560
|
+
cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
|
|
561
|
+
nk_f64_t optimal_rotation[9];
|
|
562
|
+
nk_f64_t trace_rotation_covariance;
|
|
563
|
+
if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
|
|
564
|
+
cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
|
|
565
|
+
optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
|
|
566
|
+
optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
|
|
567
|
+
optimal_rotation[8] = 1;
|
|
568
|
+
trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
|
|
569
|
+
}
|
|
570
|
+
else {
|
|
571
|
+
nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
|
|
572
|
+
nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
|
|
573
|
+
|
|
574
|
+
optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
|
|
575
|
+
optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
|
|
576
|
+
optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
|
|
577
|
+
optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
|
|
578
|
+
optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
|
|
579
|
+
optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
|
|
580
|
+
optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
|
|
581
|
+
optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
|
|
582
|
+
optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
|
|
583
|
+
|
|
584
|
+
// Handle reflection: if det(R) < 0, negate third column of V and recompute R.
|
|
585
|
+
if (nk_det3x3_f64_(optimal_rotation) < 0) {
|
|
586
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
|
|
587
|
+
optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
|
|
588
|
+
optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
|
|
589
|
+
optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
|
|
590
|
+
optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
|
|
591
|
+
optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
|
|
592
|
+
optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
|
|
593
|
+
optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
|
|
594
|
+
optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
|
|
595
|
+
optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
trace_rotation_covariance =
|
|
599
|
+
optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
|
|
600
|
+
optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
|
|
601
|
+
optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
|
|
602
|
+
optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
|
|
603
|
+
optimal_rotation[8] * cross_covariance[8];
|
|
725
604
|
}
|
|
726
605
|
|
|
727
606
|
if (rotation)
|
|
728
|
-
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)
|
|
607
|
+
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)optimal_rotation[j];
|
|
729
608
|
if (scale) *scale = 1.0f;
|
|
730
609
|
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
610
|
+
// Folded SSD via trace identity: SSD = ‖a-ā‖² + ‖b-b̄‖² − 2·trace(R · H_centered).
|
|
611
|
+
nk_f64_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0 * trace_rotation_covariance;
|
|
612
|
+
if (sum_squared < 0.0) sum_squared = 0.0;
|
|
613
|
+
*result = nk_f64_sqrt_v128relaxed(sum_squared / (nk_f64_t)n);
|
|
735
614
|
}
|
|
736
615
|
|
|
737
616
|
NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
|
|
@@ -742,9 +621,10 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
742
621
|
v128_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
|
|
743
622
|
v128_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
|
|
744
623
|
|
|
745
|
-
v128_t
|
|
746
|
-
v128_t
|
|
747
|
-
v128_t
|
|
624
|
+
v128_t covariance_xx_f64x2 = zeros_f64x2, covariance_xy_f64x2 = zeros_f64x2, covariance_xz_f64x2 = zeros_f64x2;
|
|
625
|
+
v128_t covariance_yx_f64x2 = zeros_f64x2, covariance_yy_f64x2 = zeros_f64x2, covariance_yz_f64x2 = zeros_f64x2;
|
|
626
|
+
v128_t covariance_zx_f64x2 = zeros_f64x2, covariance_zy_f64x2 = zeros_f64x2, covariance_zz_f64x2 = zeros_f64x2;
|
|
627
|
+
v128_t norm_squared_a_f64x2 = zeros_f64x2, norm_squared_b_f64x2 = zeros_f64x2;
|
|
748
628
|
|
|
749
629
|
nk_size_t i = 0;
|
|
750
630
|
v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
|
|
@@ -761,15 +641,21 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
761
641
|
sum_b_y_f64x2 = wasm_f64x2_add(sum_b_y_f64x2, b_y_f64x2);
|
|
762
642
|
sum_b_z_f64x2 = wasm_f64x2_add(sum_b_z_f64x2, b_z_f64x2);
|
|
763
643
|
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
644
|
+
covariance_xx_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_x_f64x2, covariance_xx_f64x2);
|
|
645
|
+
covariance_xy_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_y_f64x2, covariance_xy_f64x2);
|
|
646
|
+
covariance_xz_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_z_f64x2, covariance_xz_f64x2);
|
|
647
|
+
covariance_yx_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_x_f64x2, covariance_yx_f64x2);
|
|
648
|
+
covariance_yy_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_y_f64x2, covariance_yy_f64x2);
|
|
649
|
+
covariance_yz_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_z_f64x2, covariance_yz_f64x2);
|
|
650
|
+
covariance_zx_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_x_f64x2, covariance_zx_f64x2);
|
|
651
|
+
covariance_zy_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_y_f64x2, covariance_zy_f64x2);
|
|
652
|
+
covariance_zz_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_z_f64x2, covariance_zz_f64x2);
|
|
653
|
+
norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, a_x_f64x2, norm_squared_a_f64x2);
|
|
654
|
+
norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, a_y_f64x2, norm_squared_a_f64x2);
|
|
655
|
+
norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, a_z_f64x2, norm_squared_a_f64x2);
|
|
656
|
+
norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_x_f64x2, b_x_f64x2, norm_squared_b_f64x2);
|
|
657
|
+
norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_y_f64x2, b_y_f64x2, norm_squared_b_f64x2);
|
|
658
|
+
norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_z_f64x2, b_z_f64x2, norm_squared_b_f64x2);
|
|
773
659
|
}
|
|
774
660
|
|
|
775
661
|
// Reduce vector accumulators.
|
|
@@ -780,15 +666,28 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
780
666
|
nk_f64_t sum_b_y = nk_reduce_stable_f64x2_v128relaxed_(sum_b_y_f64x2), sum_b_y_compensation = 0.0;
|
|
781
667
|
nk_f64_t sum_b_z = nk_reduce_stable_f64x2_v128relaxed_(sum_b_z_f64x2), sum_b_z_compensation = 0.0;
|
|
782
668
|
|
|
783
|
-
nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_v128relaxed_(
|
|
784
|
-
|
|
785
|
-
nk_f64_t
|
|
786
|
-
|
|
787
|
-
nk_f64_t
|
|
788
|
-
|
|
789
|
-
nk_f64_t
|
|
790
|
-
|
|
791
|
-
nk_f64_t
|
|
669
|
+
nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_xx_f64x2),
|
|
670
|
+
covariance_x_x_compensation = 0.0;
|
|
671
|
+
nk_f64_t covariance_x_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_xy_f64x2),
|
|
672
|
+
covariance_x_y_compensation = 0.0;
|
|
673
|
+
nk_f64_t covariance_x_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_xz_f64x2),
|
|
674
|
+
covariance_x_z_compensation = 0.0;
|
|
675
|
+
nk_f64_t covariance_y_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_yx_f64x2),
|
|
676
|
+
covariance_y_x_compensation = 0.0;
|
|
677
|
+
nk_f64_t covariance_y_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_yy_f64x2),
|
|
678
|
+
covariance_y_y_compensation = 0.0;
|
|
679
|
+
nk_f64_t covariance_y_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_yz_f64x2),
|
|
680
|
+
covariance_y_z_compensation = 0.0;
|
|
681
|
+
nk_f64_t covariance_z_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_zx_f64x2),
|
|
682
|
+
covariance_z_x_compensation = 0.0;
|
|
683
|
+
nk_f64_t covariance_z_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_zy_f64x2),
|
|
684
|
+
covariance_z_y_compensation = 0.0;
|
|
685
|
+
nk_f64_t covariance_z_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_zz_f64x2),
|
|
686
|
+
covariance_z_z_compensation = 0.0;
|
|
687
|
+
nk_f64_t norm_squared_a_sum = nk_reduce_stable_f64x2_v128relaxed_(norm_squared_a_f64x2),
|
|
688
|
+
norm_squared_a_compensation = 0.0;
|
|
689
|
+
nk_f64_t norm_squared_b_sum = nk_reduce_stable_f64x2_v128relaxed_(norm_squared_b_f64x2),
|
|
690
|
+
norm_squared_b_compensation = 0.0;
|
|
792
691
|
|
|
793
692
|
// Scalar tail
|
|
794
693
|
for (; i < n; ++i) {
|
|
@@ -809,6 +708,12 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
809
708
|
nk_accumulate_product_f64_(&covariance_z_x, &covariance_z_x_compensation, az, bx),
|
|
810
709
|
nk_accumulate_product_f64_(&covariance_z_y, &covariance_z_y_compensation, az, by),
|
|
811
710
|
nk_accumulate_product_f64_(&covariance_z_z, &covariance_z_z_compensation, az, bz);
|
|
711
|
+
nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, ax),
|
|
712
|
+
nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, ay),
|
|
713
|
+
nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, az);
|
|
714
|
+
nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, bx),
|
|
715
|
+
nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, by),
|
|
716
|
+
nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, bz);
|
|
812
717
|
}
|
|
813
718
|
|
|
814
719
|
sum_a_x += sum_a_x_compensation, sum_a_y += sum_a_y_compensation, sum_a_z += sum_a_z_compensation;
|
|
@@ -819,6 +724,8 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
819
724
|
covariance_y_z += covariance_y_z_compensation;
|
|
820
725
|
covariance_z_x += covariance_z_x_compensation, covariance_z_y += covariance_z_y_compensation,
|
|
821
726
|
covariance_z_z += covariance_z_z_compensation;
|
|
727
|
+
norm_squared_a_sum += norm_squared_a_compensation;
|
|
728
|
+
norm_squared_b_sum += norm_squared_b_compensation;
|
|
822
729
|
|
|
823
730
|
// Compute centroids
|
|
824
731
|
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
@@ -829,6 +736,16 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
829
736
|
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
830
737
|
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
831
738
|
|
|
739
|
+
// Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
|
|
740
|
+
nk_f64_t centered_norm_squared_a = norm_squared_a_sum -
|
|
741
|
+
(nk_f64_t)n * (centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y +
|
|
742
|
+
centroid_a_z * centroid_a_z);
|
|
743
|
+
nk_f64_t centered_norm_squared_b = norm_squared_b_sum -
|
|
744
|
+
(nk_f64_t)n * (centroid_b_x * centroid_b_x + centroid_b_y * centroid_b_y +
|
|
745
|
+
centroid_b_z * centroid_b_z);
|
|
746
|
+
if (centered_norm_squared_a < 0.0) centered_norm_squared_a = 0.0;
|
|
747
|
+
if (centered_norm_squared_b < 0.0) centered_norm_squared_b = 0.0;
|
|
748
|
+
|
|
832
749
|
// Apply centering correction: H_centered = H - n * centroid_a * centroid_bT
|
|
833
750
|
covariance_x_x -= n * centroid_a_x * centroid_b_x;
|
|
834
751
|
covariance_x_y -= n * centroid_a_x * centroid_b_y;
|
|
@@ -843,37 +760,64 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
843
760
|
// Compute SVD and optimal rotation
|
|
844
761
|
nk_f64_t cross_covariance[9] = {covariance_x_x, covariance_x_y, covariance_x_z, covariance_y_x, covariance_y_y,
|
|
845
762
|
covariance_y_z, covariance_z_x, covariance_z_y, covariance_z_z};
|
|
846
|
-
nk_f64_t svd_u[9], svd_s[9], svd_v[9];
|
|
847
|
-
nk_svd3x3_f64_(cross_covariance, svd_u, svd_s, svd_v);
|
|
848
|
-
|
|
849
|
-
nk_f64_t r[9];
|
|
850
|
-
nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
|
|
851
763
|
|
|
852
|
-
//
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
764
|
+
// Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
|
|
765
|
+
nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
|
|
766
|
+
cross_covariance[4] * cross_covariance[4] +
|
|
767
|
+
cross_covariance[8] * cross_covariance[8];
|
|
768
|
+
nk_f64_t covariance_offdiagonal_norm_squared =
|
|
769
|
+
cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
|
|
770
|
+
cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
|
|
771
|
+
cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
|
|
772
|
+
nk_f64_t optimal_rotation[9];
|
|
773
|
+
nk_f64_t trace_rotation_covariance;
|
|
774
|
+
if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
|
|
775
|
+
cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
|
|
776
|
+
optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
|
|
777
|
+
optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
|
|
778
|
+
optimal_rotation[8] = 1;
|
|
779
|
+
trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
|
|
780
|
+
}
|
|
781
|
+
else {
|
|
782
|
+
nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
|
|
783
|
+
nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
|
|
784
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
785
|
+
|
|
786
|
+
// Handle reflection: if det(R) < 0, negate third column of V and recompute R
|
|
787
|
+
if (nk_det3x3_f64_(optimal_rotation) < 0) {
|
|
788
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
|
|
789
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
trace_rotation_covariance =
|
|
793
|
+
optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
|
|
794
|
+
optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
|
|
795
|
+
optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
|
|
796
|
+
optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
|
|
797
|
+
optimal_rotation[8] * cross_covariance[8];
|
|
856
798
|
}
|
|
857
799
|
|
|
858
800
|
// Output rotation matrix and scale=1.0
|
|
859
801
|
if (rotation)
|
|
860
|
-
for (int j = 0; j < 9; ++j) rotation[j] =
|
|
802
|
+
for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
|
|
861
803
|
|
|
862
804
|
if (scale) *scale = 1.0;
|
|
863
805
|
|
|
864
|
-
//
|
|
865
|
-
nk_f64_t sum_squared =
|
|
866
|
-
|
|
806
|
+
// Folded SSD via trace identity: SSD = ‖a-ā‖² + ‖b-b̄‖² − 2·trace(R · H_centered).
|
|
807
|
+
nk_f64_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0 * trace_rotation_covariance;
|
|
808
|
+
if (sum_squared < 0.0) sum_squared = 0.0;
|
|
867
809
|
*result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count);
|
|
868
810
|
}
|
|
869
811
|
|
|
870
812
|
NK_PUBLIC void nk_umeyama_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
|
|
871
813
|
nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
|
|
872
|
-
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z
|
|
873
|
-
nk_f64_t
|
|
814
|
+
nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
|
|
815
|
+
nk_f64_t centered_norm_squared_a, centered_norm_squared_b;
|
|
816
|
+
nk_f64_t cross_covariance[9];
|
|
874
817
|
nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_( //
|
|
875
818
|
a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z, //
|
|
876
|
-
&centroid_b_x, &centroid_b_y, &centroid_b_z,
|
|
819
|
+
&centroid_b_x, &centroid_b_y, &centroid_b_z, cross_covariance, &centered_norm_squared_a,
|
|
820
|
+
&centered_norm_squared_b);
|
|
877
821
|
if (a_centroid)
|
|
878
822
|
a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
|
|
879
823
|
a_centroid[2] = (nk_f32_t)centroid_a_z;
|
|
@@ -881,44 +825,73 @@ NK_PUBLIC void nk_umeyama_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b,
|
|
|
881
825
|
b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
|
|
882
826
|
b_centroid[2] = (nk_f32_t)centroid_b_z;
|
|
883
827
|
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
828
|
+
// Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
|
|
829
|
+
nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
|
|
830
|
+
cross_covariance[4] * cross_covariance[4] +
|
|
831
|
+
cross_covariance[8] * cross_covariance[8];
|
|
832
|
+
nk_f64_t covariance_offdiagonal_norm_squared =
|
|
833
|
+
cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
|
|
834
|
+
cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
|
|
835
|
+
cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
|
|
836
|
+
nk_f64_t optimal_rotation[9];
|
|
837
|
+
nk_f64_t trace_rotation_covariance;
|
|
838
|
+
nk_f64_t computed_scale;
|
|
839
|
+
if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
|
|
840
|
+
cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
|
|
841
|
+
optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
|
|
842
|
+
optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
|
|
843
|
+
optimal_rotation[8] = 1;
|
|
844
|
+
trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
|
|
845
|
+
computed_scale = centered_norm_squared_a > 0.0 ? trace_rotation_covariance / centered_norm_squared_a : 0.0;
|
|
846
|
+
}
|
|
847
|
+
else {
|
|
848
|
+
nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
|
|
849
|
+
nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
|
|
850
|
+
|
|
851
|
+
optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
|
|
852
|
+
optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
|
|
853
|
+
optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
|
|
854
|
+
optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
|
|
855
|
+
optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
|
|
856
|
+
optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
|
|
857
|
+
optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
|
|
858
|
+
optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
|
|
859
|
+
optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
|
|
860
|
+
|
|
861
|
+
nk_f64_t det = nk_det3x3_f64_(optimal_rotation);
|
|
862
|
+
if (det < 0) {
|
|
863
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
|
|
864
|
+
optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
|
|
865
|
+
optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
|
|
866
|
+
optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
|
|
867
|
+
optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
|
|
868
|
+
optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
|
|
869
|
+
optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
|
|
870
|
+
optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
|
|
871
|
+
optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
|
|
872
|
+
optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
nk_f64_t trace_signed_singular_values = svd_diagonal[0] + svd_diagonal[4] +
|
|
876
|
+
(det < 0 ? -svd_diagonal[8] : svd_diagonal[8]);
|
|
877
|
+
computed_scale = centered_norm_squared_a > 0.0 ? trace_signed_singular_values / centered_norm_squared_a : 0.0;
|
|
878
|
+
|
|
879
|
+
trace_rotation_covariance =
|
|
880
|
+
optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
|
|
881
|
+
optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
|
|
882
|
+
optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
|
|
883
|
+
optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
|
|
884
|
+
optimal_rotation[8] * cross_covariance[8];
|
|
910
885
|
}
|
|
911
|
-
|
|
912
|
-
nk_f64_t trace_signed_singular_values = svd_s[0] + svd_s[4] + (det < 0 ? -svd_s[8] : svd_s[8]);
|
|
913
|
-
nk_f64_t computed_scale = trace_signed_singular_values / ((nk_f64_t)n * variance_a);
|
|
914
886
|
if (rotation)
|
|
915
|
-
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)
|
|
887
|
+
for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)optimal_rotation[j];
|
|
916
888
|
if (scale) *scale = (nk_f32_t)computed_scale;
|
|
917
889
|
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
890
|
+
// Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² − 2c·trace(R · H_centered).
|
|
891
|
+
nk_f64_t sum_squared = computed_scale * computed_scale * centered_norm_squared_a + centered_norm_squared_b -
|
|
892
|
+
2.0 * computed_scale * trace_rotation_covariance;
|
|
893
|
+
if (sum_squared < 0.0) sum_squared = 0.0;
|
|
894
|
+
*result = nk_f64_sqrt_v128relaxed(sum_squared / (nk_f64_t)n);
|
|
922
895
|
}
|
|
923
896
|
|
|
924
897
|
NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
|
|
@@ -929,10 +902,10 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
929
902
|
v128_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
|
|
930
903
|
v128_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
|
|
931
904
|
|
|
932
|
-
v128_t
|
|
933
|
-
v128_t
|
|
934
|
-
v128_t
|
|
935
|
-
v128_t
|
|
905
|
+
v128_t covariance_xx_f64x2 = zeros_f64x2, covariance_xy_f64x2 = zeros_f64x2, covariance_xz_f64x2 = zeros_f64x2;
|
|
906
|
+
v128_t covariance_yx_f64x2 = zeros_f64x2, covariance_yy_f64x2 = zeros_f64x2, covariance_yz_f64x2 = zeros_f64x2;
|
|
907
|
+
v128_t covariance_zx_f64x2 = zeros_f64x2, covariance_zy_f64x2 = zeros_f64x2, covariance_zz_f64x2 = zeros_f64x2;
|
|
908
|
+
v128_t norm_squared_a_f64x2 = zeros_f64x2, norm_squared_b_f64x2 = zeros_f64x2;
|
|
936
909
|
|
|
937
910
|
nk_size_t i = 0;
|
|
938
911
|
v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
|
|
@@ -949,19 +922,22 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
949
922
|
sum_b_y_f64x2 = wasm_f64x2_add(sum_b_y_f64x2, b_y_f64x2);
|
|
950
923
|
sum_b_z_f64x2 = wasm_f64x2_add(sum_b_z_f64x2, b_z_f64x2);
|
|
951
924
|
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
925
|
+
covariance_xx_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_x_f64x2, covariance_xx_f64x2);
|
|
926
|
+
covariance_xy_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_y_f64x2, covariance_xy_f64x2);
|
|
927
|
+
covariance_xz_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, b_z_f64x2, covariance_xz_f64x2);
|
|
928
|
+
covariance_yx_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_x_f64x2, covariance_yx_f64x2);
|
|
929
|
+
covariance_yy_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_y_f64x2, covariance_yy_f64x2);
|
|
930
|
+
covariance_yz_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, b_z_f64x2, covariance_yz_f64x2);
|
|
931
|
+
covariance_zx_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_x_f64x2, covariance_zx_f64x2);
|
|
932
|
+
covariance_zy_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_y_f64x2, covariance_zy_f64x2);
|
|
933
|
+
covariance_zz_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, b_z_f64x2, covariance_zz_f64x2);
|
|
934
|
+
|
|
935
|
+
norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_x_f64x2, a_x_f64x2, norm_squared_a_f64x2);
|
|
936
|
+
norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_y_f64x2, a_y_f64x2, norm_squared_a_f64x2);
|
|
937
|
+
norm_squared_a_f64x2 = wasm_f64x2_relaxed_madd(a_z_f64x2, a_z_f64x2, norm_squared_a_f64x2);
|
|
938
|
+
norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_x_f64x2, b_x_f64x2, norm_squared_b_f64x2);
|
|
939
|
+
norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_y_f64x2, b_y_f64x2, norm_squared_b_f64x2);
|
|
940
|
+
norm_squared_b_f64x2 = wasm_f64x2_relaxed_madd(b_z_f64x2, b_z_f64x2, norm_squared_b_f64x2);
|
|
965
941
|
}
|
|
966
942
|
|
|
967
943
|
// Reduce vector accumulators.
|
|
@@ -971,16 +947,28 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
971
947
|
nk_f64_t sum_b_x = nk_reduce_stable_f64x2_v128relaxed_(sum_b_x_f64x2), sum_b_x_compensation = 0.0;
|
|
972
948
|
nk_f64_t sum_b_y = nk_reduce_stable_f64x2_v128relaxed_(sum_b_y_f64x2), sum_b_y_compensation = 0.0;
|
|
973
949
|
nk_f64_t sum_b_z = nk_reduce_stable_f64x2_v128relaxed_(sum_b_z_f64x2), sum_b_z_compensation = 0.0;
|
|
974
|
-
nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_v128relaxed_(
|
|
975
|
-
|
|
976
|
-
nk_f64_t
|
|
977
|
-
|
|
978
|
-
nk_f64_t
|
|
979
|
-
|
|
980
|
-
nk_f64_t
|
|
981
|
-
|
|
982
|
-
nk_f64_t
|
|
983
|
-
|
|
950
|
+
nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_xx_f64x2),
|
|
951
|
+
covariance_x_x_compensation = 0.0;
|
|
952
|
+
nk_f64_t covariance_x_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_xy_f64x2),
|
|
953
|
+
covariance_x_y_compensation = 0.0;
|
|
954
|
+
nk_f64_t covariance_x_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_xz_f64x2),
|
|
955
|
+
covariance_x_z_compensation = 0.0;
|
|
956
|
+
nk_f64_t covariance_y_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_yx_f64x2),
|
|
957
|
+
covariance_y_x_compensation = 0.0;
|
|
958
|
+
nk_f64_t covariance_y_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_yy_f64x2),
|
|
959
|
+
covariance_y_y_compensation = 0.0;
|
|
960
|
+
nk_f64_t covariance_y_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_yz_f64x2),
|
|
961
|
+
covariance_y_z_compensation = 0.0;
|
|
962
|
+
nk_f64_t covariance_z_x = nk_reduce_stable_f64x2_v128relaxed_(covariance_zx_f64x2),
|
|
963
|
+
covariance_z_x_compensation = 0.0;
|
|
964
|
+
nk_f64_t covariance_z_y = nk_reduce_stable_f64x2_v128relaxed_(covariance_zy_f64x2),
|
|
965
|
+
covariance_z_y_compensation = 0.0;
|
|
966
|
+
nk_f64_t covariance_z_z = nk_reduce_stable_f64x2_v128relaxed_(covariance_zz_f64x2),
|
|
967
|
+
covariance_z_z_compensation = 0.0;
|
|
968
|
+
nk_f64_t norm_squared_a_sum = nk_reduce_stable_f64x2_v128relaxed_(norm_squared_a_f64x2),
|
|
969
|
+
norm_squared_a_compensation = 0.0;
|
|
970
|
+
nk_f64_t norm_squared_b_sum = nk_reduce_stable_f64x2_v128relaxed_(norm_squared_b_f64x2),
|
|
971
|
+
norm_squared_b_compensation = 0.0;
|
|
984
972
|
|
|
985
973
|
// Scalar tail
|
|
986
974
|
for (; i < n; ++i) {
|
|
@@ -1001,9 +989,12 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
1001
989
|
nk_accumulate_product_f64_(&covariance_z_x, &covariance_z_x_compensation, az, bx),
|
|
1002
990
|
nk_accumulate_product_f64_(&covariance_z_y, &covariance_z_y_compensation, az, by),
|
|
1003
991
|
nk_accumulate_product_f64_(&covariance_z_z, &covariance_z_z_compensation, az, bz);
|
|
1004
|
-
nk_accumulate_square_f64_(&
|
|
1005
|
-
nk_accumulate_square_f64_(&
|
|
1006
|
-
nk_accumulate_square_f64_(&
|
|
992
|
+
nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, ax),
|
|
993
|
+
nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, ay),
|
|
994
|
+
nk_accumulate_square_f64_(&norm_squared_a_sum, &norm_squared_a_compensation, az);
|
|
995
|
+
nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, bx),
|
|
996
|
+
nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, by),
|
|
997
|
+
nk_accumulate_square_f64_(&norm_squared_b_sum, &norm_squared_b_compensation, bz);
|
|
1007
998
|
}
|
|
1008
999
|
|
|
1009
1000
|
sum_a_x += sum_a_x_compensation, sum_a_y += sum_a_y_compensation, sum_a_z += sum_a_z_compensation;
|
|
@@ -1014,7 +1005,8 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
1014
1005
|
covariance_y_z += covariance_y_z_compensation;
|
|
1015
1006
|
covariance_z_x += covariance_z_x_compensation, covariance_z_y += covariance_z_y_compensation,
|
|
1016
1007
|
covariance_z_z += covariance_z_z_compensation;
|
|
1017
|
-
|
|
1008
|
+
norm_squared_a_sum += norm_squared_a_compensation;
|
|
1009
|
+
norm_squared_b_sum += norm_squared_b_compensation;
|
|
1018
1010
|
|
|
1019
1011
|
// Compute centroids
|
|
1020
1012
|
nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
|
|
@@ -1025,9 +1017,15 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
1025
1017
|
if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
|
|
1026
1018
|
if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
|
|
1027
1019
|
|
|
1028
|
-
//
|
|
1029
|
-
nk_f64_t
|
|
1030
|
-
|
|
1020
|
+
// Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
|
|
1021
|
+
nk_f64_t centered_norm_squared_a = norm_squared_a_sum -
|
|
1022
|
+
(nk_f64_t)n * (centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y +
|
|
1023
|
+
centroid_a_z * centroid_a_z);
|
|
1024
|
+
nk_f64_t centered_norm_squared_b = norm_squared_b_sum -
|
|
1025
|
+
(nk_f64_t)n * (centroid_b_x * centroid_b_x + centroid_b_y * centroid_b_y +
|
|
1026
|
+
centroid_b_z * centroid_b_z);
|
|
1027
|
+
if (centered_norm_squared_a < 0.0) centered_norm_squared_a = 0.0;
|
|
1028
|
+
if (centered_norm_squared_b < 0.0) centered_norm_squared_b = 0.0;
|
|
1031
1029
|
|
|
1032
1030
|
// Apply centering correction: H_centered = H - n * centroid_a * centroid_bT
|
|
1033
1031
|
covariance_x_x -= n * centroid_a_x * centroid_b_x;
|
|
@@ -1043,29 +1041,57 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
|
|
|
1043
1041
|
// Compute SVD
|
|
1044
1042
|
nk_f64_t cross_covariance[9] = {covariance_x_x, covariance_x_y, covariance_x_z, covariance_y_x, covariance_y_y,
|
|
1045
1043
|
covariance_y_z, covariance_z_x, covariance_z_y, covariance_z_z};
|
|
1046
|
-
nk_f64_t svd_u[9], svd_s[9], svd_v[9];
|
|
1047
|
-
nk_svd3x3_f64_(cross_covariance, svd_u, svd_s, svd_v);
|
|
1048
|
-
|
|
1049
|
-
nk_f64_t r[9];
|
|
1050
|
-
nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
|
|
1051
1044
|
|
|
1052
|
-
//
|
|
1053
|
-
nk_f64_t
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1045
|
+
// Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
|
|
1046
|
+
nk_f64_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
|
|
1047
|
+
cross_covariance[4] * cross_covariance[4] +
|
|
1048
|
+
cross_covariance[8] * cross_covariance[8];
|
|
1049
|
+
nk_f64_t covariance_offdiagonal_norm_squared =
|
|
1050
|
+
cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
|
|
1051
|
+
cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
|
|
1052
|
+
cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
|
|
1053
|
+
nk_f64_t optimal_rotation[9];
|
|
1054
|
+
nk_f64_t trace_rotation_covariance;
|
|
1055
|
+
nk_f64_t computed_scale;
|
|
1056
|
+
if (covariance_offdiagonal_norm_squared < 1e-20 * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0 &&
|
|
1057
|
+
cross_covariance[4] > 0.0 && cross_covariance[8] > 0.0) {
|
|
1058
|
+
optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
|
|
1059
|
+
optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
|
|
1060
|
+
optimal_rotation[8] = 1;
|
|
1061
|
+
trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
|
|
1062
|
+
computed_scale = centered_norm_squared_a > 0.0 ? trace_rotation_covariance / centered_norm_squared_a : 0.0;
|
|
1063
|
+
}
|
|
1064
|
+
else {
|
|
1065
|
+
nk_f64_t svd_left[9], svd_diagonal[9], svd_right[9];
|
|
1066
|
+
nk_svd3x3_f64_(cross_covariance, svd_left, svd_diagonal, svd_right);
|
|
1067
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
1068
|
+
|
|
1069
|
+
// Handle reflection and compute scale
|
|
1070
|
+
nk_f64_t det = nk_det3x3_f64_(optimal_rotation);
|
|
1071
|
+
nk_f64_t trace_d_s = svd_diagonal[0] + svd_diagonal[4] + (det < 0 ? -svd_diagonal[8] : svd_diagonal[8]);
|
|
1072
|
+
computed_scale = centered_norm_squared_a > 0.0 ? trace_d_s / centered_norm_squared_a : 0.0;
|
|
1073
|
+
|
|
1074
|
+
if (det < 0) {
|
|
1075
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
|
|
1076
|
+
nk_rotation_from_svd_f64_serial_(svd_left, svd_right, optimal_rotation);
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
trace_rotation_covariance =
|
|
1080
|
+
optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
|
|
1081
|
+
optimal_rotation[2] * cross_covariance[6] + optimal_rotation[3] * cross_covariance[1] +
|
|
1082
|
+
optimal_rotation[4] * cross_covariance[4] + optimal_rotation[5] * cross_covariance[7] +
|
|
1083
|
+
optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
|
|
1084
|
+
optimal_rotation[8] * cross_covariance[8];
|
|
1060
1085
|
}
|
|
1061
1086
|
|
|
1062
1087
|
if (rotation)
|
|
1063
|
-
for (int j = 0; j < 9; ++j) rotation[j] =
|
|
1088
|
+
for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
|
|
1064
1089
|
if (scale) *scale = computed_scale;
|
|
1065
1090
|
|
|
1066
|
-
//
|
|
1067
|
-
nk_f64_t sum_squared =
|
|
1068
|
-
|
|
1091
|
+
// Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² − 2c·trace(R · H_centered).
|
|
1092
|
+
nk_f64_t sum_squared = computed_scale * computed_scale * centered_norm_squared_a + centered_norm_squared_b -
|
|
1093
|
+
2.0 * computed_scale * trace_rotation_covariance;
|
|
1094
|
+
if (sum_squared < 0.0) sum_squared = 0.0;
|
|
1069
1095
|
*result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count);
|
|
1070
1096
|
}
|
|
1071
1097
|
|