numkong 7.4.5 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/binding.gyp +99 -5
- package/c/dispatch_e5m2.c +23 -3
- package/c/dispatch_f16.c +23 -0
- package/c/numkong.c +0 -13
- package/include/numkong/attention/sme.h +34 -31
- package/include/numkong/capabilities.h +2 -15
- package/include/numkong/cast/README.md +3 -0
- package/include/numkong/cast/haswell.h +28 -64
- package/include/numkong/cast/neon.h +15 -0
- package/include/numkong/cast/serial.h +17 -0
- package/include/numkong/cast/skylake.h +67 -52
- package/include/numkong/cast.h +1 -0
- package/include/numkong/curved/smef64.h +82 -62
- package/include/numkong/dot/README.md +1 -0
- package/include/numkong/dot/haswell.h +92 -13
- package/include/numkong/dot/rvvbf16.h +1 -1
- package/include/numkong/dot/rvvhalf.h +1 -1
- package/include/numkong/dot/serial.h +15 -0
- package/include/numkong/dot/skylake.h +61 -14
- package/include/numkong/dot/sve.h +6 -5
- package/include/numkong/dot/svebfdot.h +2 -1
- package/include/numkong/dot/svehalf.h +6 -5
- package/include/numkong/dot/svesdot.h +3 -2
- package/include/numkong/dots/README.md +2 -0
- package/include/numkong/dots/graniteamx.h +1167 -0
- package/include/numkong/dots/haswell.h +28 -28
- package/include/numkong/dots/sapphireamx.h +1 -1
- package/include/numkong/dots/serial.h +33 -11
- package/include/numkong/dots/skylake.h +28 -23
- package/include/numkong/dots/sme.h +172 -140
- package/include/numkong/dots/smebi32.h +14 -11
- package/include/numkong/dots/smef64.h +31 -26
- package/include/numkong/dots.h +41 -3
- package/include/numkong/each/serial.h +39 -0
- package/include/numkong/geospatial/haswell.h +1 -1
- package/include/numkong/geospatial/neon.h +1 -1
- package/include/numkong/geospatial/serial.h +15 -4
- package/include/numkong/geospatial/skylake.h +1 -1
- package/include/numkong/maxsim/serial.h +15 -0
- package/include/numkong/maxsim/sme.h +34 -33
- package/include/numkong/mesh/README.md +50 -44
- package/include/numkong/mesh/genoa.h +462 -0
- package/include/numkong/mesh/haswell.h +806 -933
- package/include/numkong/mesh/neon.h +871 -943
- package/include/numkong/mesh/neonbfdot.h +382 -522
- package/include/numkong/mesh/neonfhm.h +676 -0
- package/include/numkong/mesh/rvv.h +404 -319
- package/include/numkong/mesh/serial.h +225 -161
- package/include/numkong/mesh/skylake.h +1029 -1585
- package/include/numkong/mesh/v128relaxed.h +403 -377
- package/include/numkong/mesh.h +38 -0
- package/include/numkong/reduce/neon.h +29 -0
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +4 -4
- package/include/numkong/reduce/serial.h +15 -1
- package/include/numkong/reduce/sve.h +52 -0
- package/include/numkong/reduce.h +4 -0
- package/include/numkong/set/sve.h +6 -5
- package/include/numkong/sets/smebi32.h +35 -30
- package/include/numkong/sparse/serial.h +17 -2
- package/include/numkong/sparse/sve2.h +3 -2
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +98 -56
- package/include/numkong/spatial/serial.h +15 -0
- package/include/numkong/spatial/skylake.h +114 -54
- package/include/numkong/spatial/sve.h +7 -6
- package/include/numkong/spatial/svebfdot.h +7 -4
- package/include/numkong/spatial/svehalf.h +5 -4
- package/include/numkong/spatial/svesdot.h +9 -8
- package/include/numkong/spatial.h +0 -12
- package/include/numkong/spatials/graniteamx.h +301 -0
- package/include/numkong/spatials/serial.h +39 -0
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +391 -350
- package/include/numkong/spatials/smef64.h +79 -70
- package/include/numkong/spatials.h +54 -4
- package/include/numkong/tensor.hpp +107 -23
- package/include/numkong/types.h +59 -0
- package/javascript/dist/cjs/numkong.js +13 -0
- package/javascript/dist/esm/numkong.js +13 -0
- package/javascript/numkong.c +59 -14
- package/javascript/numkong.ts +13 -0
- package/package.json +7 -7
- package/probes/probe.js +2 -2
- package/wasm/numkong.wasm +0 -0
|
@@ -146,28 +146,28 @@ extern "C" {
|
|
|
146
146
|
|
|
147
147
|
#define nk_define_sort_singular_values_(type) \
|
|
148
148
|
NK_INTERNAL void nk_sort_singular_values_##type##_(nk_##type##_t *b, nk_##type##_t *v) { \
|
|
149
|
-
nk_##type##_t
|
|
150
|
-
nk_##type##_t
|
|
151
|
-
nk_##type##_t
|
|
149
|
+
nk_##type##_t column_norm_squared_0 = b[0] * b[0] + b[3] * b[3] + b[6] * b[6]; \
|
|
150
|
+
nk_##type##_t column_norm_squared_1 = b[1] * b[1] + b[4] * b[4] + b[7] * b[7]; \
|
|
151
|
+
nk_##type##_t column_norm_squared_2 = b[2] * b[2] + b[5] * b[5] + b[8] * b[8]; \
|
|
152
152
|
int should_swap; \
|
|
153
153
|
/* Sort columns by descending singular value magnitude */ \
|
|
154
|
-
should_swap =
|
|
154
|
+
should_swap = column_norm_squared_0 < column_norm_squared_1; \
|
|
155
155
|
nk_conditional_negating_swap_##type##_(should_swap, &b[0], &b[1]); \
|
|
156
156
|
nk_conditional_negating_swap_##type##_(should_swap, &v[0], &v[1]); \
|
|
157
157
|
nk_conditional_negating_swap_##type##_(should_swap, &b[3], &b[4]); \
|
|
158
158
|
nk_conditional_negating_swap_##type##_(should_swap, &v[3], &v[4]); \
|
|
159
159
|
nk_conditional_negating_swap_##type##_(should_swap, &b[6], &b[7]); \
|
|
160
160
|
nk_conditional_negating_swap_##type##_(should_swap, &v[6], &v[7]); \
|
|
161
|
-
nk_cond_swap_##type##_(should_swap, &
|
|
162
|
-
should_swap =
|
|
161
|
+
nk_cond_swap_##type##_(should_swap, &column_norm_squared_0, &column_norm_squared_1); \
|
|
162
|
+
should_swap = column_norm_squared_0 < column_norm_squared_2; \
|
|
163
163
|
nk_conditional_negating_swap_##type##_(should_swap, &b[0], &b[2]); \
|
|
164
164
|
nk_conditional_negating_swap_##type##_(should_swap, &v[0], &v[2]); \
|
|
165
165
|
nk_conditional_negating_swap_##type##_(should_swap, &b[3], &b[5]); \
|
|
166
166
|
nk_conditional_negating_swap_##type##_(should_swap, &v[3], &v[5]); \
|
|
167
167
|
nk_conditional_negating_swap_##type##_(should_swap, &b[6], &b[8]); \
|
|
168
168
|
nk_conditional_negating_swap_##type##_(should_swap, &v[6], &v[8]); \
|
|
169
|
-
nk_cond_swap_##type##_(should_swap, &
|
|
170
|
-
should_swap =
|
|
169
|
+
nk_cond_swap_##type##_(should_swap, &column_norm_squared_0, &column_norm_squared_2); \
|
|
170
|
+
should_swap = column_norm_squared_1 < column_norm_squared_2; \
|
|
171
171
|
nk_conditional_negating_swap_##type##_(should_swap, &b[1], &b[2]); \
|
|
172
172
|
nk_conditional_negating_swap_##type##_(should_swap, &v[1], &v[2]); \
|
|
173
173
|
nk_conditional_negating_swap_##type##_(should_swap, &b[4], &b[5]); \
|
|
@@ -176,48 +176,32 @@ extern "C" {
|
|
|
176
176
|
nk_conditional_negating_swap_##type##_(should_swap, &v[7], &v[8]); \
|
|
177
177
|
}
|
|
178
178
|
|
|
179
|
-
|
|
180
|
-
|
|
179
|
+
/* Q-only QR via three Givens rotations. The R upper-triangular factor is unused by the SVD
|
|
180
|
+
* caller, so we skip computing it (saves the third cos_theta/sin_theta and 9 R stores per call).
|
|
181
|
+
*/
|
|
182
|
+
#define nk_define_qr_orthogonal_factor_(type) \
|
|
183
|
+
NK_INTERNAL void nk_qr_orthogonal_factor_##type##_(nk_##type##_t const *input, nk_##type##_t *q) { \
|
|
181
184
|
nk_##type##_t cos_half_1, sin_half_1; \
|
|
182
185
|
nk_##type##_t cos_half_2, sin_half_2; \
|
|
183
186
|
nk_##type##_t cos_half_3, sin_half_3; \
|
|
184
187
|
nk_##type##_t cos_theta, sin_theta; \
|
|
185
|
-
nk_##type##_t rotation_temp[9]
|
|
188
|
+
nk_##type##_t rotation_temp[9]; \
|
|
186
189
|
/* First Givens rotation (zero input[3]) */ \
|
|
187
190
|
nk_qr_givens_quaternion_##type##_(input[0], input[3], &cos_half_1, &sin_half_1); \
|
|
188
191
|
cos_theta = 1 - 2 * sin_half_1 * sin_half_1; \
|
|
189
192
|
sin_theta = 2 * cos_half_1 * sin_half_1; \
|
|
190
|
-
rotation_temp[0] = cos_theta * input[0] + sin_theta * input[3]; \
|
|
191
193
|
rotation_temp[1] = cos_theta * input[1] + sin_theta * input[4]; \
|
|
192
|
-
rotation_temp[2] = cos_theta * input[2] + sin_theta * input[5]; \
|
|
193
|
-
rotation_temp[3] = -sin_theta * input[0] + cos_theta * input[3]; \
|
|
194
194
|
rotation_temp[4] = -sin_theta * input[1] + cos_theta * input[4]; \
|
|
195
|
-
rotation_temp[
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
nk_qr_givens_quaternion_##type##_(rotation_temp[0], rotation_temp[6], &cos_half_2, &sin_half_2); \
|
|
195
|
+
rotation_temp[7] = input[7]; \
|
|
196
|
+
/* Second Givens rotation (only the [4]/[7] cells flow into the third Givens) */ \
|
|
197
|
+
nk_qr_givens_quaternion_##type##_(input[0] * cos_theta + sin_theta * input[3], input[6], &cos_half_2, \
|
|
198
|
+
&sin_half_2); \
|
|
200
199
|
cos_theta = 1 - 2 * sin_half_2 * sin_half_2; \
|
|
201
200
|
sin_theta = 2 * cos_half_2 * sin_half_2; \
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
matrix_temp[5] = rotation_temp[5]; \
|
|
207
|
-
matrix_temp[6] = -sin_theta * rotation_temp[0] + cos_theta * rotation_temp[6]; \
|
|
208
|
-
matrix_temp[7] = -sin_theta * rotation_temp[1] + cos_theta * rotation_temp[7]; \
|
|
209
|
-
matrix_temp[8] = -sin_theta * rotation_temp[2] + cos_theta * rotation_temp[8]; \
|
|
210
|
-
/* Third Givens rotation (zero matrix_temp[7]) */ \
|
|
211
|
-
nk_qr_givens_quaternion_##type##_(matrix_temp[4], matrix_temp[7], &cos_half_3, &sin_half_3); \
|
|
212
|
-
cos_theta = 1 - 2 * sin_half_3 * sin_half_3; \
|
|
213
|
-
sin_theta = 2 * cos_half_3 * sin_half_3; \
|
|
214
|
-
r[0] = matrix_temp[0], r[1] = matrix_temp[1], r[2] = matrix_temp[2]; \
|
|
215
|
-
r[3] = cos_theta * matrix_temp[3] + sin_theta * matrix_temp[6]; \
|
|
216
|
-
r[4] = cos_theta * matrix_temp[4] + sin_theta * matrix_temp[7]; \
|
|
217
|
-
r[5] = cos_theta * matrix_temp[5] + sin_theta * matrix_temp[8]; \
|
|
218
|
-
r[6] = -sin_theta * matrix_temp[3] + cos_theta * matrix_temp[6]; \
|
|
219
|
-
r[7] = -sin_theta * matrix_temp[4] + cos_theta * matrix_temp[7]; \
|
|
220
|
-
r[8] = -sin_theta * matrix_temp[5] + cos_theta * matrix_temp[8]; \
|
|
201
|
+
nk_##type##_t matrix_temp_4 = rotation_temp[4]; \
|
|
202
|
+
nk_##type##_t matrix_temp_7 = -sin_theta * rotation_temp[1] + cos_theta * rotation_temp[7]; \
|
|
203
|
+
/* Third Givens rotation (zero matrix_temp_7) — only sin_half_3 / cos_half_3 are needed for Q */ \
|
|
204
|
+
nk_qr_givens_quaternion_##type##_(matrix_temp_4, matrix_temp_7, &cos_half_3, &sin_half_3); \
|
|
221
205
|
/* Construct Q = Q1 * Q2 * Q3 (closed-form expressions) */ \
|
|
222
206
|
nk_##type##_t sin_half_1_sq = sin_half_1 * sin_half_1; \
|
|
223
207
|
nk_##type##_t sin_half_2_sq = sin_half_2 * sin_half_2; \
|
|
@@ -238,49 +222,48 @@ extern "C" {
|
|
|
238
222
|
q[8] = (-1 + 2 * sin_half_2_sq) * (-1 + 2 * sin_half_3_sq); \
|
|
239
223
|
}
|
|
240
224
|
|
|
241
|
-
#define nk_define_svd3x3_(type, compute_sqrt)
|
|
242
|
-
NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *
|
|
243
|
-
nk_##type##_t *
|
|
244
|
-
/* Compute Aᵀ * A (symmetric) */
|
|
245
|
-
nk_##type##_t ata[9];
|
|
246
|
-
ata[0] = nk_sum_three_squares_##type##_(a[0], a[3], a[6]);
|
|
247
|
-
ata[1] = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]);
|
|
248
|
-
ata[2] = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]);
|
|
249
|
-
ata[3] = ata[1];
|
|
250
|
-
ata[4] = nk_sum_three_squares_##type##_(a[1], a[4], a[7]);
|
|
251
|
-
ata[5] = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]);
|
|
252
|
-
ata[6] = ata[2];
|
|
253
|
-
ata[7] = ata[5];
|
|
254
|
-
ata[8] = nk_sum_three_squares_##type##_(a[2], a[5], a[8]);
|
|
255
|
-
/* Jacobi eigenanalysis of Aᵀ * A */
|
|
256
|
-
nk_##type##_t quaternion[4];
|
|
257
|
-
nk_jacobi_eigenanalysis_##type##_(&ata[0], &ata[1], &ata[4], &ata[2], &ata[5], &ata[8], quaternion);
|
|
258
|
-
nk_quaternion_to_mat3x3_##type##_(quaternion,
|
|
259
|
-
/* B = A * V */
|
|
260
|
-
nk_##type##_t product[9];
|
|
261
|
-
product[0] = nk_sum_three_products_##type##_(a[0],
|
|
262
|
-
product[1] = nk_sum_three_products_##type##_(a[0],
|
|
263
|
-
product[2] = nk_sum_three_products_##type##_(a[0],
|
|
264
|
-
product[3] = nk_sum_three_products_##type##_(a[3],
|
|
265
|
-
product[4] = nk_sum_three_products_##type##_(a[3],
|
|
266
|
-
product[5] = nk_sum_three_products_##type##_(a[3],
|
|
267
|
-
product[6] = nk_sum_three_products_##type##_(a[6],
|
|
268
|
-
product[7] = nk_sum_three_products_##type##_(a[6],
|
|
269
|
-
product[8] = nk_sum_three_products_##type##_(a[6],
|
|
270
|
-
/* Sort singular values and update V */
|
|
271
|
-
nk_sort_singular_values_##type##_(product,
|
|
272
|
-
/* Compute singular values from column norms of sorted B (before QR orthogonalizes them) */
|
|
273
|
-
/* These are the true singular values: √(‖colᵢ‖²) */
|
|
274
|
-
nk_##type##_t
|
|
275
|
-
nk_##type##_t
|
|
276
|
-
nk_##type##_t
|
|
277
|
-
/* QR
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
svd_s[6] = 0, svd_s[7] = 0, svd_s[8] = compute_sqrt(s3_sq); \
|
|
225
|
+
#define nk_define_svd3x3_(type, compute_sqrt) \
|
|
226
|
+
NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *svd_left, nk_##type##_t *svd_diagonal, \
|
|
227
|
+
nk_##type##_t *svd_right) { \
|
|
228
|
+
/* Compute Aᵀ * A (symmetric) */ \
|
|
229
|
+
nk_##type##_t ata[9]; \
|
|
230
|
+
ata[0] = nk_sum_three_squares_##type##_(a[0], a[3], a[6]); \
|
|
231
|
+
ata[1] = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]); \
|
|
232
|
+
ata[2] = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]); \
|
|
233
|
+
ata[3] = ata[1]; \
|
|
234
|
+
ata[4] = nk_sum_three_squares_##type##_(a[1], a[4], a[7]); \
|
|
235
|
+
ata[5] = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]); \
|
|
236
|
+
ata[6] = ata[2]; \
|
|
237
|
+
ata[7] = ata[5]; \
|
|
238
|
+
ata[8] = nk_sum_three_squares_##type##_(a[2], a[5], a[8]); \
|
|
239
|
+
/* Jacobi eigenanalysis of Aᵀ * A */ \
|
|
240
|
+
nk_##type##_t quaternion[4]; \
|
|
241
|
+
nk_jacobi_eigenanalysis_##type##_(&ata[0], &ata[1], &ata[4], &ata[2], &ata[5], &ata[8], quaternion); \
|
|
242
|
+
nk_quaternion_to_mat3x3_##type##_(quaternion, svd_right); \
|
|
243
|
+
/* B = A * V */ \
|
|
244
|
+
nk_##type##_t product[9]; \
|
|
245
|
+
product[0] = nk_sum_three_products_##type##_(a[0], svd_right[0], a[1], svd_right[3], a[2], svd_right[6]); \
|
|
246
|
+
product[1] = nk_sum_three_products_##type##_(a[0], svd_right[1], a[1], svd_right[4], a[2], svd_right[7]); \
|
|
247
|
+
product[2] = nk_sum_three_products_##type##_(a[0], svd_right[2], a[1], svd_right[5], a[2], svd_right[8]); \
|
|
248
|
+
product[3] = nk_sum_three_products_##type##_(a[3], svd_right[0], a[4], svd_right[3], a[5], svd_right[6]); \
|
|
249
|
+
product[4] = nk_sum_three_products_##type##_(a[3], svd_right[1], a[4], svd_right[4], a[5], svd_right[7]); \
|
|
250
|
+
product[5] = nk_sum_three_products_##type##_(a[3], svd_right[2], a[4], svd_right[5], a[5], svd_right[8]); \
|
|
251
|
+
product[6] = nk_sum_three_products_##type##_(a[6], svd_right[0], a[7], svd_right[3], a[8], svd_right[6]); \
|
|
252
|
+
product[7] = nk_sum_three_products_##type##_(a[6], svd_right[1], a[7], svd_right[4], a[8], svd_right[7]); \
|
|
253
|
+
product[8] = nk_sum_three_products_##type##_(a[6], svd_right[2], a[7], svd_right[5], a[8], svd_right[8]); \
|
|
254
|
+
/* Sort singular values and update V */ \
|
|
255
|
+
nk_sort_singular_values_##type##_(product, svd_right); \
|
|
256
|
+
/* Compute singular values from column norms of sorted B (before QR orthogonalizes them) */ \
|
|
257
|
+
/* These are the true singular values: √(‖colᵢ‖²) */ \
|
|
258
|
+
nk_##type##_t singular_value_squared_0 = nk_sum_three_squares_##type##_(product[0], product[3], product[6]); \
|
|
259
|
+
nk_##type##_t singular_value_squared_1 = nk_sum_three_squares_##type##_(product[1], product[4], product[7]); \
|
|
260
|
+
nk_##type##_t singular_value_squared_2 = nk_sum_three_squares_##type##_(product[2], product[5], product[8]); \
|
|
261
|
+
/* Q-only QR: extract U (the orthogonal factor); R is unused by SVD */ \
|
|
262
|
+
nk_qr_orthogonal_factor_##type##_(product, svd_left); \
|
|
263
|
+
/* Store singular values on diagonal positions [0], [4], [8]; off-diagonals never read by callers */ \
|
|
264
|
+
svd_diagonal[0] = compute_sqrt(singular_value_squared_0); \
|
|
265
|
+
svd_diagonal[4] = compute_sqrt(singular_value_squared_1); \
|
|
266
|
+
svd_diagonal[8] = compute_sqrt(singular_value_squared_2); \
|
|
284
267
|
}
|
|
285
268
|
|
|
286
269
|
#define nk_define_det3x3_(type) \
|
|
@@ -289,6 +272,29 @@ extern "C" {
|
|
|
289
272
|
m[2] * (m[3] * m[7] - m[4] * m[6]); \
|
|
290
273
|
}
|
|
291
274
|
|
|
275
|
+
/* Keep the serial instantiations below actually scalar, regardless of build type.
|
|
276
|
+
* Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
|
|
277
|
+
* callers in dispatch_*.c, which wastes binary and breaks the nk_*_serial-as-scalar-oracle
|
|
278
|
+
* contract that tests and numerical-stability docs rely on. See dots/serial.h. */
|
|
279
|
+
#if defined(__clang__)
|
|
280
|
+
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
|
|
281
|
+
#elif defined(__GNUC__)
|
|
282
|
+
#pragma GCC push_options
|
|
283
|
+
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
|
|
284
|
+
#endif
|
|
285
|
+
|
|
286
|
+
/* Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
|
|
287
|
+
#if defined(NDEBUG)
|
|
288
|
+
#if defined(_MSC_VER)
|
|
289
|
+
#pragma optimize("s", on)
|
|
290
|
+
#elif defined(__clang__)
|
|
291
|
+
#pragma clang attribute push(__attribute__((minsize)), apply_to = function)
|
|
292
|
+
#elif defined(__GNUC__)
|
|
293
|
+
#pragma GCC push_options
|
|
294
|
+
#pragma GCC optimize("Os")
|
|
295
|
+
#endif
|
|
296
|
+
#endif
|
|
297
|
+
|
|
292
298
|
NK_INTERNAL nk_f32_t nk_sum_three_products_f32_(nk_f32_t left_0, nk_f32_t right_0, nk_f32_t left_1, nk_f32_t right_1,
|
|
293
299
|
nk_f32_t left_2, nk_f32_t right_2) {
|
|
294
300
|
return left_0 * right_0 + left_1 * right_1 + left_2 * right_2;
|
|
@@ -340,27 +346,47 @@ NK_INTERNAL void nk_accumulate_square_f64_(nk_f64_t *sum, nk_f64_t *compensation
|
|
|
340
346
|
nk_f64_dot2_(sum, compensation, value, value);
|
|
341
347
|
}
|
|
342
348
|
|
|
343
|
-
NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *
|
|
344
|
-
|
|
345
|
-
rotation[
|
|
346
|
-
|
|
347
|
-
rotation[
|
|
348
|
-
|
|
349
|
-
rotation[
|
|
350
|
-
|
|
351
|
-
rotation[
|
|
352
|
-
|
|
349
|
+
NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *svd_left, nk_f32_t const *svd_right,
|
|
350
|
+
nk_f32_t *rotation) {
|
|
351
|
+
rotation[0] = nk_sum_three_products_f32_( //
|
|
352
|
+
svd_right[0], svd_left[0], svd_right[1], svd_left[1], svd_right[2], svd_left[2]);
|
|
353
|
+
rotation[1] = nk_sum_three_products_f32_( //
|
|
354
|
+
svd_right[0], svd_left[3], svd_right[1], svd_left[4], svd_right[2], svd_left[5]);
|
|
355
|
+
rotation[2] = nk_sum_three_products_f32_( //
|
|
356
|
+
svd_right[0], svd_left[6], svd_right[1], svd_left[7], svd_right[2], svd_left[8]);
|
|
357
|
+
rotation[3] = nk_sum_three_products_f32_( //
|
|
358
|
+
svd_right[3], svd_left[0], svd_right[4], svd_left[1], svd_right[5], svd_left[2]);
|
|
359
|
+
rotation[4] = nk_sum_three_products_f32_( //
|
|
360
|
+
svd_right[3], svd_left[3], svd_right[4], svd_left[4], svd_right[5], svd_left[5]);
|
|
361
|
+
rotation[5] = nk_sum_three_products_f32_( //
|
|
362
|
+
svd_right[3], svd_left[6], svd_right[4], svd_left[7], svd_right[5], svd_left[8]);
|
|
363
|
+
rotation[6] = nk_sum_three_products_f32_( //
|
|
364
|
+
svd_right[6], svd_left[0], svd_right[7], svd_left[1], svd_right[8], svd_left[2]);
|
|
365
|
+
rotation[7] = nk_sum_three_products_f32_( //
|
|
366
|
+
svd_right[6], svd_left[3], svd_right[7], svd_left[4], svd_right[8], svd_left[5]);
|
|
367
|
+
rotation[8] = nk_sum_three_products_f32_( //
|
|
368
|
+
svd_right[6], svd_left[6], svd_right[7], svd_left[7], svd_right[8], svd_left[8]);
|
|
353
369
|
}
|
|
354
|
-
NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *
|
|
355
|
-
|
|
356
|
-
rotation[
|
|
357
|
-
|
|
358
|
-
rotation[
|
|
359
|
-
|
|
360
|
-
rotation[
|
|
361
|
-
|
|
362
|
-
rotation[
|
|
363
|
-
|
|
370
|
+
NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *svd_left, nk_f64_t const *svd_right,
|
|
371
|
+
nk_f64_t *rotation) {
|
|
372
|
+
rotation[0] = nk_sum_three_products_f64_( //
|
|
373
|
+
svd_right[0], svd_left[0], svd_right[1], svd_left[1], svd_right[2], svd_left[2]);
|
|
374
|
+
rotation[1] = nk_sum_three_products_f64_( //
|
|
375
|
+
svd_right[0], svd_left[3], svd_right[1], svd_left[4], svd_right[2], svd_left[5]);
|
|
376
|
+
rotation[2] = nk_sum_three_products_f64_( //
|
|
377
|
+
svd_right[0], svd_left[6], svd_right[1], svd_left[7], svd_right[2], svd_left[8]);
|
|
378
|
+
rotation[3] = nk_sum_three_products_f64_( //
|
|
379
|
+
svd_right[3], svd_left[0], svd_right[4], svd_left[1], svd_right[5], svd_left[2]);
|
|
380
|
+
rotation[4] = nk_sum_three_products_f64_( //
|
|
381
|
+
svd_right[3], svd_left[3], svd_right[4], svd_left[4], svd_right[5], svd_left[5]);
|
|
382
|
+
rotation[5] = nk_sum_three_products_f64_( //
|
|
383
|
+
svd_right[3], svd_left[6], svd_right[4], svd_left[7], svd_right[5], svd_left[8]);
|
|
384
|
+
rotation[6] = nk_sum_three_products_f64_( //
|
|
385
|
+
svd_right[6], svd_left[0], svd_right[7], svd_left[1], svd_right[8], svd_left[2]);
|
|
386
|
+
rotation[7] = nk_sum_three_products_f64_( //
|
|
387
|
+
svd_right[6], svd_left[3], svd_right[7], svd_left[4], svd_right[8], svd_left[5]);
|
|
388
|
+
rotation[8] = nk_sum_three_products_f64_( //
|
|
389
|
+
svd_right[6], svd_left[6], svd_right[7], svd_left[7], svd_right[8], svd_left[8]);
|
|
364
390
|
}
|
|
365
391
|
|
|
366
392
|
nk_define_cond_swap_(f32)
|
|
@@ -372,7 +398,7 @@ nk_define_quaternion_to_mat3x3_(f32)
|
|
|
372
398
|
nk_define_jacobi_eigenanalysis_(f32, nk_f32_rsqrt_serial)
|
|
373
399
|
nk_define_qr_givens_quaternion_(f32, NK_F32_SVD_EPSILON_, nk_f32_rsqrt_serial)
|
|
374
400
|
nk_define_sort_singular_values_(f32)
|
|
375
|
-
|
|
401
|
+
nk_define_qr_orthogonal_factor_(f32)
|
|
376
402
|
nk_define_svd3x3_(f32, nk_f32_sqrt_serial)
|
|
377
403
|
nk_define_det3x3_(f32)
|
|
378
404
|
|
|
@@ -385,7 +411,7 @@ nk_define_quaternion_to_mat3x3_(f64)
|
|
|
385
411
|
nk_define_jacobi_eigenanalysis_(f64, nk_f64_rsqrt_serial)
|
|
386
412
|
nk_define_qr_givens_quaternion_(f64, NK_F64_SVD_EPSILON_, nk_f64_rsqrt_serial)
|
|
387
413
|
nk_define_sort_singular_values_(f64)
|
|
388
|
-
|
|
414
|
+
nk_define_qr_orthogonal_factor_(f64)
|
|
389
415
|
nk_define_svd3x3_(f64, nk_f64_sqrt_serial)
|
|
390
416
|
nk_define_det3x3_(f64)
|
|
391
417
|
|
|
@@ -460,42 +486,53 @@ nk_define_det3x3_(f64)
|
|
|
460
486
|
b_centroid[0] = (nk_##output_type##_t)centroid_b_x, b_centroid[1] = (nk_##output_type##_t)centroid_b_y, \
|
|
461
487
|
b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
|
|
462
488
|
/* Step 2: Build 3×3 covariance matrix H = (A - Ā)ᵀ × (B - B̄) */ \
|
|
463
|
-
nk_##accumulator_type##_t
|
|
464
|
-
nk_##accumulator_type##_t
|
|
489
|
+
nk_##accumulator_type##_t cross_covariance_kahan_sum[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
|
|
490
|
+
nk_##accumulator_type##_t cross_covariance_kahan_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
|
|
465
491
|
for (nk_size_t i = 0; i < n; ++i) { \
|
|
466
492
|
load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
|
|
467
493
|
load_and_convert(a + i * 3 + 1, &val_a_y), load_and_convert(b + i * 3 + 1, &val_b_y); \
|
|
468
494
|
load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 2, &val_b_z); \
|
|
469
495
|
val_a_x -= centroid_a_x, val_a_y -= centroid_a_y, val_a_z -= centroid_a_z; \
|
|
470
496
|
val_b_x -= centroid_b_x, val_b_y -= centroid_b_y, val_b_z -= centroid_b_z; \
|
|
471
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
472
|
-
|
|
473
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
474
|
-
|
|
475
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
476
|
-
|
|
477
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
478
|
-
|
|
479
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
497
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[0], \
|
|
498
|
+
&cross_covariance_kahan_compensation[0], val_a_x, val_b_x); \
|
|
499
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[1], \
|
|
500
|
+
&cross_covariance_kahan_compensation[1], val_a_x, val_b_y); \
|
|
501
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[2], \
|
|
502
|
+
&cross_covariance_kahan_compensation[2], val_a_x, val_b_z); \
|
|
503
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[3], \
|
|
504
|
+
&cross_covariance_kahan_compensation[3], val_a_y, val_b_x); \
|
|
505
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[4], \
|
|
506
|
+
&cross_covariance_kahan_compensation[4], val_a_y, val_b_y); \
|
|
507
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[5], \
|
|
508
|
+
&cross_covariance_kahan_compensation[5], val_a_y, val_b_z); \
|
|
509
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[6], \
|
|
510
|
+
&cross_covariance_kahan_compensation[6], val_a_z, val_b_x); \
|
|
511
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[7], \
|
|
512
|
+
&cross_covariance_kahan_compensation[7], val_a_z, val_b_y); \
|
|
513
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[8], \
|
|
514
|
+
&cross_covariance_kahan_compensation[8], val_a_z, val_b_z); \
|
|
480
515
|
} \
|
|
481
516
|
/* Convert to svd_type for SVD */ \
|
|
482
517
|
nk_##svd_type##_t cross_covariance[9]; \
|
|
483
|
-
for (int j = 0; j < 9; ++j)
|
|
518
|
+
for (int j = 0; j < 9; ++j) \
|
|
519
|
+
cross_covariance[j] = (nk_##svd_type##_t)(cross_covariance_kahan_sum[j] + \
|
|
520
|
+
cross_covariance_kahan_compensation[j]); \
|
|
484
521
|
/* Step 3: SVD of H = U * S * Vᵀ */ \
|
|
485
|
-
nk_##svd_type##_t
|
|
486
|
-
nk_svd3x3_##svd_type##_(cross_covariance,
|
|
522
|
+
nk_##svd_type##_t svd_left[9], svd_diagonal[9], svd_right[9]; \
|
|
523
|
+
nk_svd3x3_##svd_type##_(cross_covariance, svd_left, svd_diagonal, svd_right); \
|
|
487
524
|
/* Step 4: R = V * Uᵀ */ \
|
|
488
|
-
nk_##svd_type##_t
|
|
489
|
-
nk_rotation_from_svd_##svd_type##_serial_(
|
|
525
|
+
nk_##svd_type##_t optimal_rotation[9]; \
|
|
526
|
+
nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
|
|
490
527
|
/* Handle reflection: if det(R) < 0, negate third column of V and recompute R */ \
|
|
491
|
-
nk_##svd_type##_t
|
|
492
|
-
if (
|
|
493
|
-
|
|
494
|
-
nk_rotation_from_svd_##svd_type##_serial_(
|
|
528
|
+
nk_##svd_type##_t rotation_determinant = nk_det3x3_##svd_type##_(optimal_rotation); \
|
|
529
|
+
if (rotation_determinant < 0) { \
|
|
530
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8]; \
|
|
531
|
+
nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
|
|
495
532
|
} \
|
|
496
533
|
/* Output rotation matrix and scale=1.0 */ \
|
|
497
534
|
if (rotation) \
|
|
498
|
-
for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)
|
|
535
|
+
for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)optimal_rotation[j]; \
|
|
499
536
|
if (scale) *scale = (nk_##output_type##_t)1; \
|
|
500
537
|
/* Step 5: Compute RMSD after rotation */ \
|
|
501
538
|
nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
|
|
@@ -510,12 +547,12 @@ nk_define_det3x3_(f64)
|
|
|
510
547
|
point_b[0] = (nk_##svd_type##_t)(val_b_x - centroid_b_x), \
|
|
511
548
|
point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y), \
|
|
512
549
|
point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
|
|
513
|
-
rotated_point_a[0] =
|
|
514
|
-
|
|
515
|
-
rotated_point_a[1] =
|
|
516
|
-
|
|
517
|
-
rotated_point_a[2] =
|
|
518
|
-
|
|
550
|
+
rotated_point_a[0] = optimal_rotation[0] * point_a[0] + optimal_rotation[1] * point_a[1] + \
|
|
551
|
+
optimal_rotation[2] * point_a[2]; \
|
|
552
|
+
rotated_point_a[1] = optimal_rotation[3] * point_a[0] + optimal_rotation[4] * point_a[1] + \
|
|
553
|
+
optimal_rotation[5] * point_a[2]; \
|
|
554
|
+
rotated_point_a[2] = optimal_rotation[6] * point_a[0] + optimal_rotation[7] * point_a[1] + \
|
|
555
|
+
optimal_rotation[8] * point_a[2]; \
|
|
519
556
|
nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
|
|
520
557
|
nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
|
|
521
558
|
nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
|
|
@@ -571,8 +608,8 @@ nk_define_det3x3_(f64)
|
|
|
571
608
|
b_centroid[0] = (nk_##output_type##_t)centroid_b_x, b_centroid[1] = (nk_##output_type##_t)centroid_b_y, \
|
|
572
609
|
b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
|
|
573
610
|
/* Step 2: Build covariance matrix H and compute variance of A */ \
|
|
574
|
-
nk_##accumulator_type##_t
|
|
575
|
-
nk_##accumulator_type##_t
|
|
611
|
+
nk_##accumulator_type##_t cross_covariance_kahan_sum[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
|
|
612
|
+
nk_##accumulator_type##_t cross_covariance_kahan_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
|
|
576
613
|
nk_##accumulator_type##_t variance_a = 0, variance_a_compensation = 0; \
|
|
577
614
|
for (nk_size_t i = 0; i < n; ++i) { \
|
|
578
615
|
load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
|
|
@@ -583,41 +620,52 @@ nk_define_det3x3_(f64)
|
|
|
583
620
|
nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_x); \
|
|
584
621
|
nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_y); \
|
|
585
622
|
nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_z); \
|
|
586
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
587
|
-
|
|
588
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
589
|
-
|
|
590
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
591
|
-
|
|
592
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
593
|
-
|
|
594
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
623
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[0], \
|
|
624
|
+
&cross_covariance_kahan_compensation[0], val_a_x, val_b_x); \
|
|
625
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[1], \
|
|
626
|
+
&cross_covariance_kahan_compensation[1], val_a_x, val_b_y); \
|
|
627
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[2], \
|
|
628
|
+
&cross_covariance_kahan_compensation[2], val_a_x, val_b_z); \
|
|
629
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[3], \
|
|
630
|
+
&cross_covariance_kahan_compensation[3], val_a_y, val_b_x); \
|
|
631
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[4], \
|
|
632
|
+
&cross_covariance_kahan_compensation[4], val_a_y, val_b_y); \
|
|
633
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[5], \
|
|
634
|
+
&cross_covariance_kahan_compensation[5], val_a_y, val_b_z); \
|
|
635
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[6], \
|
|
636
|
+
&cross_covariance_kahan_compensation[6], val_a_z, val_b_x); \
|
|
637
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[7], \
|
|
638
|
+
&cross_covariance_kahan_compensation[7], val_a_z, val_b_y); \
|
|
639
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[8], \
|
|
640
|
+
&cross_covariance_kahan_compensation[8], val_a_z, val_b_z); \
|
|
595
641
|
} \
|
|
596
642
|
variance_a = (variance_a + variance_a_compensation) * inv_n; \
|
|
597
643
|
/* Convert to svd_type for SVD */ \
|
|
598
644
|
nk_##svd_type##_t cross_covariance[9]; \
|
|
599
|
-
for (int j = 0; j < 9; ++j)
|
|
645
|
+
for (int j = 0; j < 9; ++j) \
|
|
646
|
+
cross_covariance[j] = (nk_##svd_type##_t)(cross_covariance_kahan_sum[j] + \
|
|
647
|
+
cross_covariance_kahan_compensation[j]); \
|
|
600
648
|
/* Step 3: SVD of H = U * S * Vᵀ */ \
|
|
601
|
-
nk_##svd_type##_t
|
|
602
|
-
nk_svd3x3_##svd_type##_(cross_covariance,
|
|
649
|
+
nk_##svd_type##_t svd_left[9], svd_diagonal[9], svd_right[9]; \
|
|
650
|
+
nk_svd3x3_##svd_type##_(cross_covariance, svd_left, svd_diagonal, svd_right); \
|
|
603
651
|
/* Step 4: R = V * Uᵀ */ \
|
|
604
|
-
nk_##svd_type##_t
|
|
605
|
-
nk_rotation_from_svd_##svd_type##_serial_(
|
|
652
|
+
nk_##svd_type##_t optimal_rotation[9]; \
|
|
653
|
+
nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
|
|
606
654
|
/* Handle reflection and compute scale: c = trace(D × S) / variance(a) */ \
|
|
607
|
-
/* D = diag(1, 1, det(R)),
|
|
608
|
-
nk_##svd_type##_t
|
|
609
|
-
nk_##svd_type##_t sign_det =
|
|
610
|
-
nk_##svd_type##_t trace_scaled_s =
|
|
655
|
+
/* D = diag(1, 1, det(R)), svd_diagonal contains proper positive singular values on diagonal */ \
|
|
656
|
+
nk_##svd_type##_t rotation_determinant = nk_det3x3_##svd_type##_(optimal_rotation); \
|
|
657
|
+
nk_##svd_type##_t sign_det = rotation_determinant < 0 ? (nk_##svd_type##_t) - 1.0 : (nk_##svd_type##_t)1.0; \
|
|
658
|
+
nk_##svd_type##_t trace_scaled_s = svd_diagonal[0] + svd_diagonal[4] + sign_det * svd_diagonal[8]; \
|
|
611
659
|
nk_##accumulator_type##_t scale_factor = (nk_##accumulator_type##_t)trace_scaled_s / \
|
|
612
660
|
((nk_##accumulator_type##_t)n * variance_a); \
|
|
613
661
|
if (scale) *scale = (nk_##output_type##_t)scale_factor; \
|
|
614
|
-
if (
|
|
615
|
-
|
|
616
|
-
nk_rotation_from_svd_##svd_type##_serial_(
|
|
662
|
+
if (rotation_determinant < 0) { \
|
|
663
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8]; \
|
|
664
|
+
nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
|
|
617
665
|
} \
|
|
618
666
|
/* Output rotation matrix */ \
|
|
619
667
|
if (rotation) \
|
|
620
|
-
for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)
|
|
668
|
+
for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)optimal_rotation[j]; \
|
|
621
669
|
/* Step 5: Compute RMSD after similarity transform: ‖c × R × a - b‖ */ \
|
|
622
670
|
nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
|
|
623
671
|
for (nk_size_t i = 0; i < n; ++i) { \
|
|
@@ -632,14 +680,14 @@ nk_define_det3x3_(f64)
|
|
|
632
680
|
point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y), \
|
|
633
681
|
point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
|
|
634
682
|
rotated_point_a[0] = (nk_##svd_type##_t)scale_factor * \
|
|
635
|
-
(
|
|
636
|
-
|
|
683
|
+
(optimal_rotation[0] * point_a[0] + optimal_rotation[1] * point_a[1] + \
|
|
684
|
+
optimal_rotation[2] * point_a[2]); \
|
|
637
685
|
rotated_point_a[1] = (nk_##svd_type##_t)scale_factor * \
|
|
638
|
-
(
|
|
639
|
-
|
|
686
|
+
(optimal_rotation[3] * point_a[0] + optimal_rotation[4] * point_a[1] + \
|
|
687
|
+
optimal_rotation[5] * point_a[2]); \
|
|
640
688
|
rotated_point_a[2] = (nk_##svd_type##_t)scale_factor * \
|
|
641
|
-
(
|
|
642
|
-
|
|
689
|
+
(optimal_rotation[6] * point_a[0] + optimal_rotation[7] * point_a[1] + \
|
|
690
|
+
optimal_rotation[8] * point_a[2]); \
|
|
643
691
|
nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
|
|
644
692
|
nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
|
|
645
693
|
nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
|
|
@@ -685,13 +733,29 @@ nk_define_umeyama_(bf16, f32, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_
|
|
|
685
733
|
#undef nk_define_jacobi_eigenanalysis_
|
|
686
734
|
#undef nk_define_qr_givens_quaternion_
|
|
687
735
|
#undef nk_define_sort_singular_values_
|
|
688
|
-
#undef
|
|
736
|
+
#undef nk_define_qr_orthogonal_factor_
|
|
689
737
|
#undef nk_define_svd3x3_
|
|
690
738
|
#undef nk_define_det3x3_
|
|
691
739
|
#undef nk_define_rmsd_
|
|
692
740
|
#undef nk_define_kabsch_
|
|
693
741
|
#undef nk_define_umeyama_
|
|
694
742
|
|
|
743
|
+
#if defined(NDEBUG)
|
|
744
|
+
#if defined(_MSC_VER)
|
|
745
|
+
#pragma optimize("", on)
|
|
746
|
+
#elif defined(__clang__)
|
|
747
|
+
#pragma clang attribute pop
|
|
748
|
+
#elif defined(__GNUC__)
|
|
749
|
+
#pragma GCC pop_options
|
|
750
|
+
#endif
|
|
751
|
+
#endif
|
|
752
|
+
|
|
753
|
+
#if defined(__clang__)
|
|
754
|
+
#pragma clang attribute pop
|
|
755
|
+
#elif defined(__GNUC__)
|
|
756
|
+
#pragma GCC pop_options
|
|
757
|
+
#endif
|
|
758
|
+
|
|
695
759
|
#if defined(__cplusplus)
|
|
696
760
|
} // extern "C"
|
|
697
761
|
#endif
|