numkong 7.5.0 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +18 -0
- package/c/dispatch_e5m2.c +23 -3
- package/include/numkong/capabilities.h +1 -1
- package/include/numkong/cast/README.md +3 -0
- package/include/numkong/cast/haswell.h +28 -64
- package/include/numkong/cast/serial.h +17 -0
- package/include/numkong/cast/skylake.h +67 -52
- package/include/numkong/cast.h +1 -0
- package/include/numkong/dot/README.md +1 -0
- package/include/numkong/dot/haswell.h +92 -13
- package/include/numkong/dot/serial.h +15 -0
- package/include/numkong/dot/skylake.h +61 -14
- package/include/numkong/dots/README.md +2 -0
- package/include/numkong/dots/graniteamx.h +434 -0
- package/include/numkong/dots/haswell.h +28 -28
- package/include/numkong/dots/sapphireamx.h +1 -1
- package/include/numkong/dots/serial.h +23 -8
- package/include/numkong/dots/skylake.h +28 -23
- package/include/numkong/dots.h +12 -0
- package/include/numkong/each/serial.h +18 -1
- package/include/numkong/geospatial/serial.h +14 -3
- package/include/numkong/maxsim/serial.h +15 -0
- package/include/numkong/mesh/README.md +50 -44
- package/include/numkong/mesh/genoa.h +462 -0
- package/include/numkong/mesh/haswell.h +806 -933
- package/include/numkong/mesh/neon.h +871 -943
- package/include/numkong/mesh/neonbfdot.h +382 -522
- package/include/numkong/mesh/neonfhm.h +676 -0
- package/include/numkong/mesh/rvv.h +404 -319
- package/include/numkong/mesh/serial.h +204 -162
- package/include/numkong/mesh/skylake.h +1029 -1585
- package/include/numkong/mesh/v128relaxed.h +403 -377
- package/include/numkong/mesh.h +38 -0
- package/include/numkong/reduce/serial.h +15 -1
- package/include/numkong/sparse/serial.h +17 -2
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +98 -56
- package/include/numkong/spatial/serial.h +15 -0
- package/include/numkong/spatial/skylake.h +114 -54
- package/include/numkong/spatial.h +0 -12
- package/include/numkong/spatials/graniteamx.h +128 -0
- package/include/numkong/spatials/serial.h +18 -1
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials.h +17 -0
- package/include/numkong/tensor.hpp +107 -23
- package/javascript/numkong.c +3 -2
- package/package.json +7 -7
- package/wasm/numkong.wasm +0 -0
|
@@ -146,28 +146,28 @@ extern "C" {
|
|
|
146
146
|
|
|
147
147
|
#define nk_define_sort_singular_values_(type) \
|
|
148
148
|
NK_INTERNAL void nk_sort_singular_values_##type##_(nk_##type##_t *b, nk_##type##_t *v) { \
|
|
149
|
-
nk_##type##_t
|
|
150
|
-
nk_##type##_t
|
|
151
|
-
nk_##type##_t
|
|
149
|
+
nk_##type##_t column_norm_squared_0 = b[0] * b[0] + b[3] * b[3] + b[6] * b[6]; \
|
|
150
|
+
nk_##type##_t column_norm_squared_1 = b[1] * b[1] + b[4] * b[4] + b[7] * b[7]; \
|
|
151
|
+
nk_##type##_t column_norm_squared_2 = b[2] * b[2] + b[5] * b[5] + b[8] * b[8]; \
|
|
152
152
|
int should_swap; \
|
|
153
153
|
/* Sort columns by descending singular value magnitude */ \
|
|
154
|
-
should_swap =
|
|
154
|
+
should_swap = column_norm_squared_0 < column_norm_squared_1; \
|
|
155
155
|
nk_conditional_negating_swap_##type##_(should_swap, &b[0], &b[1]); \
|
|
156
156
|
nk_conditional_negating_swap_##type##_(should_swap, &v[0], &v[1]); \
|
|
157
157
|
nk_conditional_negating_swap_##type##_(should_swap, &b[3], &b[4]); \
|
|
158
158
|
nk_conditional_negating_swap_##type##_(should_swap, &v[3], &v[4]); \
|
|
159
159
|
nk_conditional_negating_swap_##type##_(should_swap, &b[6], &b[7]); \
|
|
160
160
|
nk_conditional_negating_swap_##type##_(should_swap, &v[6], &v[7]); \
|
|
161
|
-
nk_cond_swap_##type##_(should_swap, &
|
|
162
|
-
should_swap =
|
|
161
|
+
nk_cond_swap_##type##_(should_swap, &column_norm_squared_0, &column_norm_squared_1); \
|
|
162
|
+
should_swap = column_norm_squared_0 < column_norm_squared_2; \
|
|
163
163
|
nk_conditional_negating_swap_##type##_(should_swap, &b[0], &b[2]); \
|
|
164
164
|
nk_conditional_negating_swap_##type##_(should_swap, &v[0], &v[2]); \
|
|
165
165
|
nk_conditional_negating_swap_##type##_(should_swap, &b[3], &b[5]); \
|
|
166
166
|
nk_conditional_negating_swap_##type##_(should_swap, &v[3], &v[5]); \
|
|
167
167
|
nk_conditional_negating_swap_##type##_(should_swap, &b[6], &b[8]); \
|
|
168
168
|
nk_conditional_negating_swap_##type##_(should_swap, &v[6], &v[8]); \
|
|
169
|
-
nk_cond_swap_##type##_(should_swap, &
|
|
170
|
-
should_swap =
|
|
169
|
+
nk_cond_swap_##type##_(should_swap, &column_norm_squared_0, &column_norm_squared_2); \
|
|
170
|
+
should_swap = column_norm_squared_1 < column_norm_squared_2; \
|
|
171
171
|
nk_conditional_negating_swap_##type##_(should_swap, &b[1], &b[2]); \
|
|
172
172
|
nk_conditional_negating_swap_##type##_(should_swap, &v[1], &v[2]); \
|
|
173
173
|
nk_conditional_negating_swap_##type##_(should_swap, &b[4], &b[5]); \
|
|
@@ -176,48 +176,32 @@ extern "C" {
|
|
|
176
176
|
nk_conditional_negating_swap_##type##_(should_swap, &v[7], &v[8]); \
|
|
177
177
|
}
|
|
178
178
|
|
|
179
|
-
|
|
180
|
-
|
|
179
|
+
/* Q-only QR via three Givens rotations. The R upper-triangular factor is unused by the SVD
|
|
180
|
+
* caller, so we skip computing it (saves the third cos_theta/sin_theta and 9 R stores per call).
|
|
181
|
+
*/
|
|
182
|
+
#define nk_define_qr_orthogonal_factor_(type) \
|
|
183
|
+
NK_INTERNAL void nk_qr_orthogonal_factor_##type##_(nk_##type##_t const *input, nk_##type##_t *q) { \
|
|
181
184
|
nk_##type##_t cos_half_1, sin_half_1; \
|
|
182
185
|
nk_##type##_t cos_half_2, sin_half_2; \
|
|
183
186
|
nk_##type##_t cos_half_3, sin_half_3; \
|
|
184
187
|
nk_##type##_t cos_theta, sin_theta; \
|
|
185
|
-
nk_##type##_t rotation_temp[9]
|
|
188
|
+
nk_##type##_t rotation_temp[9]; \
|
|
186
189
|
/* First Givens rotation (zero input[3]) */ \
|
|
187
190
|
nk_qr_givens_quaternion_##type##_(input[0], input[3], &cos_half_1, &sin_half_1); \
|
|
188
191
|
cos_theta = 1 - 2 * sin_half_1 * sin_half_1; \
|
|
189
192
|
sin_theta = 2 * cos_half_1 * sin_half_1; \
|
|
190
|
-
rotation_temp[0] = cos_theta * input[0] + sin_theta * input[3]; \
|
|
191
193
|
rotation_temp[1] = cos_theta * input[1] + sin_theta * input[4]; \
|
|
192
|
-
rotation_temp[2] = cos_theta * input[2] + sin_theta * input[5]; \
|
|
193
|
-
rotation_temp[3] = -sin_theta * input[0] + cos_theta * input[3]; \
|
|
194
194
|
rotation_temp[4] = -sin_theta * input[1] + cos_theta * input[4]; \
|
|
195
|
-
rotation_temp[
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
nk_qr_givens_quaternion_##type##_(rotation_temp[0], rotation_temp[6], &cos_half_2, &sin_half_2); \
|
|
195
|
+
rotation_temp[7] = input[7]; \
|
|
196
|
+
/* Second Givens rotation (only the [4]/[7] cells flow into the third Givens) */ \
|
|
197
|
+
nk_qr_givens_quaternion_##type##_(input[0] * cos_theta + sin_theta * input[3], input[6], &cos_half_2, \
|
|
198
|
+
&sin_half_2); \
|
|
200
199
|
cos_theta = 1 - 2 * sin_half_2 * sin_half_2; \
|
|
201
200
|
sin_theta = 2 * cos_half_2 * sin_half_2; \
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
matrix_temp[5] = rotation_temp[5]; \
|
|
207
|
-
matrix_temp[6] = -sin_theta * rotation_temp[0] + cos_theta * rotation_temp[6]; \
|
|
208
|
-
matrix_temp[7] = -sin_theta * rotation_temp[1] + cos_theta * rotation_temp[7]; \
|
|
209
|
-
matrix_temp[8] = -sin_theta * rotation_temp[2] + cos_theta * rotation_temp[8]; \
|
|
210
|
-
/* Third Givens rotation (zero matrix_temp[7]) */ \
|
|
211
|
-
nk_qr_givens_quaternion_##type##_(matrix_temp[4], matrix_temp[7], &cos_half_3, &sin_half_3); \
|
|
212
|
-
cos_theta = 1 - 2 * sin_half_3 * sin_half_3; \
|
|
213
|
-
sin_theta = 2 * cos_half_3 * sin_half_3; \
|
|
214
|
-
r[0] = matrix_temp[0], r[1] = matrix_temp[1], r[2] = matrix_temp[2]; \
|
|
215
|
-
r[3] = cos_theta * matrix_temp[3] + sin_theta * matrix_temp[6]; \
|
|
216
|
-
r[4] = cos_theta * matrix_temp[4] + sin_theta * matrix_temp[7]; \
|
|
217
|
-
r[5] = cos_theta * matrix_temp[5] + sin_theta * matrix_temp[8]; \
|
|
218
|
-
r[6] = -sin_theta * matrix_temp[3] + cos_theta * matrix_temp[6]; \
|
|
219
|
-
r[7] = -sin_theta * matrix_temp[4] + cos_theta * matrix_temp[7]; \
|
|
220
|
-
r[8] = -sin_theta * matrix_temp[5] + cos_theta * matrix_temp[8]; \
|
|
201
|
+
nk_##type##_t matrix_temp_4 = rotation_temp[4]; \
|
|
202
|
+
nk_##type##_t matrix_temp_7 = -sin_theta * rotation_temp[1] + cos_theta * rotation_temp[7]; \
|
|
203
|
+
/* Third Givens rotation (zero matrix_temp_7) — only sin_half_3 / cos_half_3 are needed for Q */ \
|
|
204
|
+
nk_qr_givens_quaternion_##type##_(matrix_temp_4, matrix_temp_7, &cos_half_3, &sin_half_3); \
|
|
221
205
|
/* Construct Q = Q1 * Q2 * Q3 (closed-form expressions) */ \
|
|
222
206
|
nk_##type##_t sin_half_1_sq = sin_half_1 * sin_half_1; \
|
|
223
207
|
nk_##type##_t sin_half_2_sq = sin_half_2 * sin_half_2; \
|
|
@@ -238,49 +222,48 @@ extern "C" {
|
|
|
238
222
|
q[8] = (-1 + 2 * sin_half_2_sq) * (-1 + 2 * sin_half_3_sq); \
|
|
239
223
|
}
|
|
240
224
|
|
|
241
|
-
#define nk_define_svd3x3_(type, compute_sqrt)
|
|
242
|
-
NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *
|
|
243
|
-
nk_##type##_t *
|
|
244
|
-
/* Compute Aᵀ * A (symmetric) */
|
|
245
|
-
nk_##type##_t ata[9];
|
|
246
|
-
ata[0] = nk_sum_three_squares_##type##_(a[0], a[3], a[6]);
|
|
247
|
-
ata[1] = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]);
|
|
248
|
-
ata[2] = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]);
|
|
249
|
-
ata[3] = ata[1];
|
|
250
|
-
ata[4] = nk_sum_three_squares_##type##_(a[1], a[4], a[7]);
|
|
251
|
-
ata[5] = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]);
|
|
252
|
-
ata[6] = ata[2];
|
|
253
|
-
ata[7] = ata[5];
|
|
254
|
-
ata[8] = nk_sum_three_squares_##type##_(a[2], a[5], a[8]);
|
|
255
|
-
/* Jacobi eigenanalysis of Aᵀ * A */
|
|
256
|
-
nk_##type##_t quaternion[4];
|
|
257
|
-
nk_jacobi_eigenanalysis_##type##_(&ata[0], &ata[1], &ata[4], &ata[2], &ata[5], &ata[8], quaternion);
|
|
258
|
-
nk_quaternion_to_mat3x3_##type##_(quaternion,
|
|
259
|
-
/* B = A * V */
|
|
260
|
-
nk_##type##_t product[9];
|
|
261
|
-
product[0] = nk_sum_three_products_##type##_(a[0],
|
|
262
|
-
product[1] = nk_sum_three_products_##type##_(a[0],
|
|
263
|
-
product[2] = nk_sum_three_products_##type##_(a[0],
|
|
264
|
-
product[3] = nk_sum_three_products_##type##_(a[3],
|
|
265
|
-
product[4] = nk_sum_three_products_##type##_(a[3],
|
|
266
|
-
product[5] = nk_sum_three_products_##type##_(a[3],
|
|
267
|
-
product[6] = nk_sum_three_products_##type##_(a[6],
|
|
268
|
-
product[7] = nk_sum_three_products_##type##_(a[6],
|
|
269
|
-
product[8] = nk_sum_three_products_##type##_(a[6],
|
|
270
|
-
/* Sort singular values and update V */
|
|
271
|
-
nk_sort_singular_values_##type##_(product,
|
|
272
|
-
/* Compute singular values from column norms of sorted B (before QR orthogonalizes them) */
|
|
273
|
-
/* These are the true singular values: √(‖colᵢ‖²) */
|
|
274
|
-
nk_##type##_t
|
|
275
|
-
nk_##type##_t
|
|
276
|
-
nk_##type##_t
|
|
277
|
-
/* QR
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
svd_s[6] = 0, svd_s[7] = 0, svd_s[8] = compute_sqrt(s3_sq); \
|
|
225
|
+
#define nk_define_svd3x3_(type, compute_sqrt) \
|
|
226
|
+
NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *svd_left, nk_##type##_t *svd_diagonal, \
|
|
227
|
+
nk_##type##_t *svd_right) { \
|
|
228
|
+
/* Compute Aᵀ * A (symmetric) */ \
|
|
229
|
+
nk_##type##_t ata[9]; \
|
|
230
|
+
ata[0] = nk_sum_three_squares_##type##_(a[0], a[3], a[6]); \
|
|
231
|
+
ata[1] = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]); \
|
|
232
|
+
ata[2] = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]); \
|
|
233
|
+
ata[3] = ata[1]; \
|
|
234
|
+
ata[4] = nk_sum_three_squares_##type##_(a[1], a[4], a[7]); \
|
|
235
|
+
ata[5] = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]); \
|
|
236
|
+
ata[6] = ata[2]; \
|
|
237
|
+
ata[7] = ata[5]; \
|
|
238
|
+
ata[8] = nk_sum_three_squares_##type##_(a[2], a[5], a[8]); \
|
|
239
|
+
/* Jacobi eigenanalysis of Aᵀ * A */ \
|
|
240
|
+
nk_##type##_t quaternion[4]; \
|
|
241
|
+
nk_jacobi_eigenanalysis_##type##_(&ata[0], &ata[1], &ata[4], &ata[2], &ata[5], &ata[8], quaternion); \
|
|
242
|
+
nk_quaternion_to_mat3x3_##type##_(quaternion, svd_right); \
|
|
243
|
+
/* B = A * V */ \
|
|
244
|
+
nk_##type##_t product[9]; \
|
|
245
|
+
product[0] = nk_sum_three_products_##type##_(a[0], svd_right[0], a[1], svd_right[3], a[2], svd_right[6]); \
|
|
246
|
+
product[1] = nk_sum_three_products_##type##_(a[0], svd_right[1], a[1], svd_right[4], a[2], svd_right[7]); \
|
|
247
|
+
product[2] = nk_sum_three_products_##type##_(a[0], svd_right[2], a[1], svd_right[5], a[2], svd_right[8]); \
|
|
248
|
+
product[3] = nk_sum_three_products_##type##_(a[3], svd_right[0], a[4], svd_right[3], a[5], svd_right[6]); \
|
|
249
|
+
product[4] = nk_sum_three_products_##type##_(a[3], svd_right[1], a[4], svd_right[4], a[5], svd_right[7]); \
|
|
250
|
+
product[5] = nk_sum_three_products_##type##_(a[3], svd_right[2], a[4], svd_right[5], a[5], svd_right[8]); \
|
|
251
|
+
product[6] = nk_sum_three_products_##type##_(a[6], svd_right[0], a[7], svd_right[3], a[8], svd_right[6]); \
|
|
252
|
+
product[7] = nk_sum_three_products_##type##_(a[6], svd_right[1], a[7], svd_right[4], a[8], svd_right[7]); \
|
|
253
|
+
product[8] = nk_sum_three_products_##type##_(a[6], svd_right[2], a[7], svd_right[5], a[8], svd_right[8]); \
|
|
254
|
+
/* Sort singular values and update V */ \
|
|
255
|
+
nk_sort_singular_values_##type##_(product, svd_right); \
|
|
256
|
+
/* Compute singular values from column norms of sorted B (before QR orthogonalizes them) */ \
|
|
257
|
+
/* These are the true singular values: √(‖colᵢ‖²) */ \
|
|
258
|
+
nk_##type##_t singular_value_squared_0 = nk_sum_three_squares_##type##_(product[0], product[3], product[6]); \
|
|
259
|
+
nk_##type##_t singular_value_squared_1 = nk_sum_three_squares_##type##_(product[1], product[4], product[7]); \
|
|
260
|
+
nk_##type##_t singular_value_squared_2 = nk_sum_three_squares_##type##_(product[2], product[5], product[8]); \
|
|
261
|
+
/* Q-only QR: extract U (the orthogonal factor); R is unused by SVD */ \
|
|
262
|
+
nk_qr_orthogonal_factor_##type##_(product, svd_left); \
|
|
263
|
+
/* Store singular values on diagonal positions [0], [4], [8]; off-diagonals never read by callers */ \
|
|
264
|
+
svd_diagonal[0] = compute_sqrt(singular_value_squared_0); \
|
|
265
|
+
svd_diagonal[4] = compute_sqrt(singular_value_squared_1); \
|
|
266
|
+
svd_diagonal[8] = compute_sqrt(singular_value_squared_2); \
|
|
284
267
|
}
|
|
285
268
|
|
|
286
269
|
#define nk_define_det3x3_(type) \
|
|
@@ -289,7 +272,18 @@ extern "C" {
|
|
|
289
272
|
m[2] * (m[3] * m[7] - m[4] * m[6]); \
|
|
290
273
|
}
|
|
291
274
|
|
|
292
|
-
/*
|
|
275
|
+
/* Keep the serial instantiations below actually scalar, regardless of build type.
|
|
276
|
+
* Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
|
|
277
|
+
* callers in dispatch_*.c, which wastes binary and breaks the nk_*_serial-as-scalar-oracle
|
|
278
|
+
* contract that tests and numerical-stability docs rely on. See dots/serial.h. */
|
|
279
|
+
#if defined(__clang__)
|
|
280
|
+
#pragma clang attribute push(__attribute__((noinline)), apply_to = function)
|
|
281
|
+
#elif defined(__GNUC__)
|
|
282
|
+
#pragma GCC push_options
|
|
283
|
+
#pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
|
|
284
|
+
#endif
|
|
285
|
+
|
|
286
|
+
/* Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
|
|
293
287
|
#if defined(NDEBUG)
|
|
294
288
|
#if defined(_MSC_VER)
|
|
295
289
|
#pragma optimize("s", on)
|
|
@@ -352,27 +346,47 @@ NK_INTERNAL void nk_accumulate_square_f64_(nk_f64_t *sum, nk_f64_t *compensation
|
|
|
352
346
|
nk_f64_dot2_(sum, compensation, value, value);
|
|
353
347
|
}
|
|
354
348
|
|
|
355
|
-
NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *
|
|
356
|
-
|
|
357
|
-
rotation[
|
|
358
|
-
|
|
359
|
-
rotation[
|
|
360
|
-
|
|
361
|
-
rotation[
|
|
362
|
-
|
|
363
|
-
rotation[
|
|
364
|
-
|
|
349
|
+
NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *svd_left, nk_f32_t const *svd_right,
|
|
350
|
+
nk_f32_t *rotation) {
|
|
351
|
+
rotation[0] = nk_sum_three_products_f32_( //
|
|
352
|
+
svd_right[0], svd_left[0], svd_right[1], svd_left[1], svd_right[2], svd_left[2]);
|
|
353
|
+
rotation[1] = nk_sum_three_products_f32_( //
|
|
354
|
+
svd_right[0], svd_left[3], svd_right[1], svd_left[4], svd_right[2], svd_left[5]);
|
|
355
|
+
rotation[2] = nk_sum_three_products_f32_( //
|
|
356
|
+
svd_right[0], svd_left[6], svd_right[1], svd_left[7], svd_right[2], svd_left[8]);
|
|
357
|
+
rotation[3] = nk_sum_three_products_f32_( //
|
|
358
|
+
svd_right[3], svd_left[0], svd_right[4], svd_left[1], svd_right[5], svd_left[2]);
|
|
359
|
+
rotation[4] = nk_sum_three_products_f32_( //
|
|
360
|
+
svd_right[3], svd_left[3], svd_right[4], svd_left[4], svd_right[5], svd_left[5]);
|
|
361
|
+
rotation[5] = nk_sum_three_products_f32_( //
|
|
362
|
+
svd_right[3], svd_left[6], svd_right[4], svd_left[7], svd_right[5], svd_left[8]);
|
|
363
|
+
rotation[6] = nk_sum_three_products_f32_( //
|
|
364
|
+
svd_right[6], svd_left[0], svd_right[7], svd_left[1], svd_right[8], svd_left[2]);
|
|
365
|
+
rotation[7] = nk_sum_three_products_f32_( //
|
|
366
|
+
svd_right[6], svd_left[3], svd_right[7], svd_left[4], svd_right[8], svd_left[5]);
|
|
367
|
+
rotation[8] = nk_sum_three_products_f32_( //
|
|
368
|
+
svd_right[6], svd_left[6], svd_right[7], svd_left[7], svd_right[8], svd_left[8]);
|
|
365
369
|
}
|
|
366
|
-
NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *
|
|
367
|
-
|
|
368
|
-
rotation[
|
|
369
|
-
|
|
370
|
-
rotation[
|
|
371
|
-
|
|
372
|
-
rotation[
|
|
373
|
-
|
|
374
|
-
rotation[
|
|
375
|
-
|
|
370
|
+
NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *svd_left, nk_f64_t const *svd_right,
|
|
371
|
+
nk_f64_t *rotation) {
|
|
372
|
+
rotation[0] = nk_sum_three_products_f64_( //
|
|
373
|
+
svd_right[0], svd_left[0], svd_right[1], svd_left[1], svd_right[2], svd_left[2]);
|
|
374
|
+
rotation[1] = nk_sum_three_products_f64_( //
|
|
375
|
+
svd_right[0], svd_left[3], svd_right[1], svd_left[4], svd_right[2], svd_left[5]);
|
|
376
|
+
rotation[2] = nk_sum_three_products_f64_( //
|
|
377
|
+
svd_right[0], svd_left[6], svd_right[1], svd_left[7], svd_right[2], svd_left[8]);
|
|
378
|
+
rotation[3] = nk_sum_three_products_f64_( //
|
|
379
|
+
svd_right[3], svd_left[0], svd_right[4], svd_left[1], svd_right[5], svd_left[2]);
|
|
380
|
+
rotation[4] = nk_sum_three_products_f64_( //
|
|
381
|
+
svd_right[3], svd_left[3], svd_right[4], svd_left[4], svd_right[5], svd_left[5]);
|
|
382
|
+
rotation[5] = nk_sum_three_products_f64_( //
|
|
383
|
+
svd_right[3], svd_left[6], svd_right[4], svd_left[7], svd_right[5], svd_left[8]);
|
|
384
|
+
rotation[6] = nk_sum_three_products_f64_( //
|
|
385
|
+
svd_right[6], svd_left[0], svd_right[7], svd_left[1], svd_right[8], svd_left[2]);
|
|
386
|
+
rotation[7] = nk_sum_three_products_f64_( //
|
|
387
|
+
svd_right[6], svd_left[3], svd_right[7], svd_left[4], svd_right[8], svd_left[5]);
|
|
388
|
+
rotation[8] = nk_sum_three_products_f64_( //
|
|
389
|
+
svd_right[6], svd_left[6], svd_right[7], svd_left[7], svd_right[8], svd_left[8]);
|
|
376
390
|
}
|
|
377
391
|
|
|
378
392
|
nk_define_cond_swap_(f32)
|
|
@@ -384,7 +398,7 @@ nk_define_quaternion_to_mat3x3_(f32)
|
|
|
384
398
|
nk_define_jacobi_eigenanalysis_(f32, nk_f32_rsqrt_serial)
|
|
385
399
|
nk_define_qr_givens_quaternion_(f32, NK_F32_SVD_EPSILON_, nk_f32_rsqrt_serial)
|
|
386
400
|
nk_define_sort_singular_values_(f32)
|
|
387
|
-
|
|
401
|
+
nk_define_qr_orthogonal_factor_(f32)
|
|
388
402
|
nk_define_svd3x3_(f32, nk_f32_sqrt_serial)
|
|
389
403
|
nk_define_det3x3_(f32)
|
|
390
404
|
|
|
@@ -397,7 +411,7 @@ nk_define_quaternion_to_mat3x3_(f64)
|
|
|
397
411
|
nk_define_jacobi_eigenanalysis_(f64, nk_f64_rsqrt_serial)
|
|
398
412
|
nk_define_qr_givens_quaternion_(f64, NK_F64_SVD_EPSILON_, nk_f64_rsqrt_serial)
|
|
399
413
|
nk_define_sort_singular_values_(f64)
|
|
400
|
-
|
|
414
|
+
nk_define_qr_orthogonal_factor_(f64)
|
|
401
415
|
nk_define_svd3x3_(f64, nk_f64_sqrt_serial)
|
|
402
416
|
nk_define_det3x3_(f64)
|
|
403
417
|
|
|
@@ -472,42 +486,53 @@ nk_define_det3x3_(f64)
|
|
|
472
486
|
b_centroid[0] = (nk_##output_type##_t)centroid_b_x, b_centroid[1] = (nk_##output_type##_t)centroid_b_y, \
|
|
473
487
|
b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
|
|
474
488
|
/* Step 2: Build 3×3 covariance matrix H = (A - Ā)ᵀ × (B - B̄) */ \
|
|
475
|
-
nk_##accumulator_type##_t
|
|
476
|
-
nk_##accumulator_type##_t
|
|
489
|
+
nk_##accumulator_type##_t cross_covariance_kahan_sum[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
|
|
490
|
+
nk_##accumulator_type##_t cross_covariance_kahan_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
|
|
477
491
|
for (nk_size_t i = 0; i < n; ++i) { \
|
|
478
492
|
load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
|
|
479
493
|
load_and_convert(a + i * 3 + 1, &val_a_y), load_and_convert(b + i * 3 + 1, &val_b_y); \
|
|
480
494
|
load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 2, &val_b_z); \
|
|
481
495
|
val_a_x -= centroid_a_x, val_a_y -= centroid_a_y, val_a_z -= centroid_a_z; \
|
|
482
496
|
val_b_x -= centroid_b_x, val_b_y -= centroid_b_y, val_b_z -= centroid_b_z; \
|
|
483
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
484
|
-
|
|
485
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
486
|
-
|
|
487
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
488
|
-
|
|
489
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
490
|
-
|
|
491
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
497
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[0], \
|
|
498
|
+
&cross_covariance_kahan_compensation[0], val_a_x, val_b_x); \
|
|
499
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[1], \
|
|
500
|
+
&cross_covariance_kahan_compensation[1], val_a_x, val_b_y); \
|
|
501
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[2], \
|
|
502
|
+
&cross_covariance_kahan_compensation[2], val_a_x, val_b_z); \
|
|
503
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[3], \
|
|
504
|
+
&cross_covariance_kahan_compensation[3], val_a_y, val_b_x); \
|
|
505
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[4], \
|
|
506
|
+
&cross_covariance_kahan_compensation[4], val_a_y, val_b_y); \
|
|
507
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[5], \
|
|
508
|
+
&cross_covariance_kahan_compensation[5], val_a_y, val_b_z); \
|
|
509
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[6], \
|
|
510
|
+
&cross_covariance_kahan_compensation[6], val_a_z, val_b_x); \
|
|
511
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[7], \
|
|
512
|
+
&cross_covariance_kahan_compensation[7], val_a_z, val_b_y); \
|
|
513
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[8], \
|
|
514
|
+
&cross_covariance_kahan_compensation[8], val_a_z, val_b_z); \
|
|
492
515
|
} \
|
|
493
516
|
/* Convert to svd_type for SVD */ \
|
|
494
517
|
nk_##svd_type##_t cross_covariance[9]; \
|
|
495
|
-
for (int j = 0; j < 9; ++j)
|
|
518
|
+
for (int j = 0; j < 9; ++j) \
|
|
519
|
+
cross_covariance[j] = (nk_##svd_type##_t)(cross_covariance_kahan_sum[j] + \
|
|
520
|
+
cross_covariance_kahan_compensation[j]); \
|
|
496
521
|
/* Step 3: SVD of H = U * S * Vᵀ */ \
|
|
497
|
-
nk_##svd_type##_t
|
|
498
|
-
nk_svd3x3_##svd_type##_(cross_covariance,
|
|
522
|
+
nk_##svd_type##_t svd_left[9], svd_diagonal[9], svd_right[9]; \
|
|
523
|
+
nk_svd3x3_##svd_type##_(cross_covariance, svd_left, svd_diagonal, svd_right); \
|
|
499
524
|
/* Step 4: R = V * Uᵀ */ \
|
|
500
|
-
nk_##svd_type##_t
|
|
501
|
-
nk_rotation_from_svd_##svd_type##_serial_(
|
|
525
|
+
nk_##svd_type##_t optimal_rotation[9]; \
|
|
526
|
+
nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
|
|
502
527
|
/* Handle reflection: if det(R) < 0, negate third column of V and recompute R */ \
|
|
503
|
-
nk_##svd_type##_t
|
|
504
|
-
if (
|
|
505
|
-
|
|
506
|
-
nk_rotation_from_svd_##svd_type##_serial_(
|
|
528
|
+
nk_##svd_type##_t rotation_determinant = nk_det3x3_##svd_type##_(optimal_rotation); \
|
|
529
|
+
if (rotation_determinant < 0) { \
|
|
530
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8]; \
|
|
531
|
+
nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
|
|
507
532
|
} \
|
|
508
533
|
/* Output rotation matrix and scale=1.0 */ \
|
|
509
534
|
if (rotation) \
|
|
510
|
-
for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)
|
|
535
|
+
for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)optimal_rotation[j]; \
|
|
511
536
|
if (scale) *scale = (nk_##output_type##_t)1; \
|
|
512
537
|
/* Step 5: Compute RMSD after rotation */ \
|
|
513
538
|
nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
|
|
@@ -522,12 +547,12 @@ nk_define_det3x3_(f64)
|
|
|
522
547
|
point_b[0] = (nk_##svd_type##_t)(val_b_x - centroid_b_x), \
|
|
523
548
|
point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y), \
|
|
524
549
|
point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
|
|
525
|
-
rotated_point_a[0] =
|
|
526
|
-
|
|
527
|
-
rotated_point_a[1] =
|
|
528
|
-
|
|
529
|
-
rotated_point_a[2] =
|
|
530
|
-
|
|
550
|
+
rotated_point_a[0] = optimal_rotation[0] * point_a[0] + optimal_rotation[1] * point_a[1] + \
|
|
551
|
+
optimal_rotation[2] * point_a[2]; \
|
|
552
|
+
rotated_point_a[1] = optimal_rotation[3] * point_a[0] + optimal_rotation[4] * point_a[1] + \
|
|
553
|
+
optimal_rotation[5] * point_a[2]; \
|
|
554
|
+
rotated_point_a[2] = optimal_rotation[6] * point_a[0] + optimal_rotation[7] * point_a[1] + \
|
|
555
|
+
optimal_rotation[8] * point_a[2]; \
|
|
531
556
|
nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
|
|
532
557
|
nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
|
|
533
558
|
nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
|
|
@@ -583,8 +608,8 @@ nk_define_det3x3_(f64)
|
|
|
583
608
|
b_centroid[0] = (nk_##output_type##_t)centroid_b_x, b_centroid[1] = (nk_##output_type##_t)centroid_b_y, \
|
|
584
609
|
b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
|
|
585
610
|
/* Step 2: Build covariance matrix H and compute variance of A */ \
|
|
586
|
-
nk_##accumulator_type##_t
|
|
587
|
-
nk_##accumulator_type##_t
|
|
611
|
+
nk_##accumulator_type##_t cross_covariance_kahan_sum[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
|
|
612
|
+
nk_##accumulator_type##_t cross_covariance_kahan_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
|
|
588
613
|
nk_##accumulator_type##_t variance_a = 0, variance_a_compensation = 0; \
|
|
589
614
|
for (nk_size_t i = 0; i < n; ++i) { \
|
|
590
615
|
load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
|
|
@@ -595,41 +620,52 @@ nk_define_det3x3_(f64)
|
|
|
595
620
|
nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_x); \
|
|
596
621
|
nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_y); \
|
|
597
622
|
nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_z); \
|
|
598
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
599
|
-
|
|
600
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
601
|
-
|
|
602
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
603
|
-
|
|
604
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
605
|
-
|
|
606
|
-
nk_accumulate_product_##accumulator_type##_(&
|
|
623
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[0], \
|
|
624
|
+
&cross_covariance_kahan_compensation[0], val_a_x, val_b_x); \
|
|
625
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[1], \
|
|
626
|
+
&cross_covariance_kahan_compensation[1], val_a_x, val_b_y); \
|
|
627
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[2], \
|
|
628
|
+
&cross_covariance_kahan_compensation[2], val_a_x, val_b_z); \
|
|
629
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[3], \
|
|
630
|
+
&cross_covariance_kahan_compensation[3], val_a_y, val_b_x); \
|
|
631
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[4], \
|
|
632
|
+
&cross_covariance_kahan_compensation[4], val_a_y, val_b_y); \
|
|
633
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[5], \
|
|
634
|
+
&cross_covariance_kahan_compensation[5], val_a_y, val_b_z); \
|
|
635
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[6], \
|
|
636
|
+
&cross_covariance_kahan_compensation[6], val_a_z, val_b_x); \
|
|
637
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[7], \
|
|
638
|
+
&cross_covariance_kahan_compensation[7], val_a_z, val_b_y); \
|
|
639
|
+
nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[8], \
|
|
640
|
+
&cross_covariance_kahan_compensation[8], val_a_z, val_b_z); \
|
|
607
641
|
} \
|
|
608
642
|
variance_a = (variance_a + variance_a_compensation) * inv_n; \
|
|
609
643
|
/* Convert to svd_type for SVD */ \
|
|
610
644
|
nk_##svd_type##_t cross_covariance[9]; \
|
|
611
|
-
for (int j = 0; j < 9; ++j)
|
|
645
|
+
for (int j = 0; j < 9; ++j) \
|
|
646
|
+
cross_covariance[j] = (nk_##svd_type##_t)(cross_covariance_kahan_sum[j] + \
|
|
647
|
+
cross_covariance_kahan_compensation[j]); \
|
|
612
648
|
/* Step 3: SVD of H = U * S * Vᵀ */ \
|
|
613
|
-
nk_##svd_type##_t
|
|
614
|
-
nk_svd3x3_##svd_type##_(cross_covariance,
|
|
649
|
+
nk_##svd_type##_t svd_left[9], svd_diagonal[9], svd_right[9]; \
|
|
650
|
+
nk_svd3x3_##svd_type##_(cross_covariance, svd_left, svd_diagonal, svd_right); \
|
|
615
651
|
/* Step 4: R = V * Uᵀ */ \
|
|
616
|
-
nk_##svd_type##_t
|
|
617
|
-
nk_rotation_from_svd_##svd_type##_serial_(
|
|
652
|
+
nk_##svd_type##_t optimal_rotation[9]; \
|
|
653
|
+
nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
|
|
618
654
|
/* Handle reflection and compute scale: c = trace(D × S) / variance(a) */ \
|
|
619
|
-
/* D = diag(1, 1, det(R)),
|
|
620
|
-
nk_##svd_type##_t
|
|
621
|
-
nk_##svd_type##_t sign_det =
|
|
622
|
-
nk_##svd_type##_t trace_scaled_s =
|
|
655
|
+
/* D = diag(1, 1, det(R)), svd_diagonal contains proper positive singular values on diagonal */ \
|
|
656
|
+
nk_##svd_type##_t rotation_determinant = nk_det3x3_##svd_type##_(optimal_rotation); \
|
|
657
|
+
nk_##svd_type##_t sign_det = rotation_determinant < 0 ? (nk_##svd_type##_t) - 1.0 : (nk_##svd_type##_t)1.0; \
|
|
658
|
+
nk_##svd_type##_t trace_scaled_s = svd_diagonal[0] + svd_diagonal[4] + sign_det * svd_diagonal[8]; \
|
|
623
659
|
nk_##accumulator_type##_t scale_factor = (nk_##accumulator_type##_t)trace_scaled_s / \
|
|
624
660
|
((nk_##accumulator_type##_t)n * variance_a); \
|
|
625
661
|
if (scale) *scale = (nk_##output_type##_t)scale_factor; \
|
|
626
|
-
if (
|
|
627
|
-
|
|
628
|
-
nk_rotation_from_svd_##svd_type##_serial_(
|
|
662
|
+
if (rotation_determinant < 0) { \
|
|
663
|
+
svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8]; \
|
|
664
|
+
nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
|
|
629
665
|
} \
|
|
630
666
|
/* Output rotation matrix */ \
|
|
631
667
|
if (rotation) \
|
|
632
|
-
for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)
|
|
668
|
+
for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)optimal_rotation[j]; \
|
|
633
669
|
/* Step 5: Compute RMSD after similarity transform: ‖c × R × a - b‖ */ \
|
|
634
670
|
nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
|
|
635
671
|
for (nk_size_t i = 0; i < n; ++i) { \
|
|
@@ -644,14 +680,14 @@ nk_define_det3x3_(f64)
|
|
|
644
680
|
point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y), \
|
|
645
681
|
point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
|
|
646
682
|
rotated_point_a[0] = (nk_##svd_type##_t)scale_factor * \
|
|
647
|
-
(
|
|
648
|
-
|
|
683
|
+
(optimal_rotation[0] * point_a[0] + optimal_rotation[1] * point_a[1] + \
|
|
684
|
+
optimal_rotation[2] * point_a[2]); \
|
|
649
685
|
rotated_point_a[1] = (nk_##svd_type##_t)scale_factor * \
|
|
650
|
-
(
|
|
651
|
-
|
|
686
|
+
(optimal_rotation[3] * point_a[0] + optimal_rotation[4] * point_a[1] + \
|
|
687
|
+
optimal_rotation[5] * point_a[2]); \
|
|
652
688
|
rotated_point_a[2] = (nk_##svd_type##_t)scale_factor * \
|
|
653
|
-
(
|
|
654
|
-
|
|
689
|
+
(optimal_rotation[6] * point_a[0] + optimal_rotation[7] * point_a[1] + \
|
|
690
|
+
optimal_rotation[8] * point_a[2]); \
|
|
655
691
|
nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
|
|
656
692
|
nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
|
|
657
693
|
nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
|
|
@@ -697,7 +733,7 @@ nk_define_umeyama_(bf16, f32, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_
|
|
|
697
733
|
#undef nk_define_jacobi_eigenanalysis_
|
|
698
734
|
#undef nk_define_qr_givens_quaternion_
|
|
699
735
|
#undef nk_define_sort_singular_values_
|
|
700
|
-
#undef
|
|
736
|
+
#undef nk_define_qr_orthogonal_factor_
|
|
701
737
|
#undef nk_define_svd3x3_
|
|
702
738
|
#undef nk_define_det3x3_
|
|
703
739
|
#undef nk_define_rmsd_
|
|
@@ -714,6 +750,12 @@ nk_define_umeyama_(bf16, f32, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_
|
|
|
714
750
|
#endif
|
|
715
751
|
#endif
|
|
716
752
|
|
|
753
|
+
#if defined(__clang__)
|
|
754
|
+
#pragma clang attribute pop
|
|
755
|
+
#elif defined(__GNUC__)
|
|
756
|
+
#pragma GCC pop_options
|
|
757
|
+
#endif
|
|
758
|
+
|
|
717
759
|
#if defined(__cplusplus)
|
|
718
760
|
} // extern "C"
|
|
719
761
|
#endif
|