numkong 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/binding.gyp +18 -0
  2. package/c/dispatch_e5m2.c +23 -3
  3. package/include/numkong/capabilities.h +1 -1
  4. package/include/numkong/cast/README.md +3 -0
  5. package/include/numkong/cast/haswell.h +28 -64
  6. package/include/numkong/cast/serial.h +17 -0
  7. package/include/numkong/cast/skylake.h +67 -52
  8. package/include/numkong/cast.h +1 -0
  9. package/include/numkong/dot/README.md +1 -0
  10. package/include/numkong/dot/haswell.h +92 -13
  11. package/include/numkong/dot/serial.h +15 -0
  12. package/include/numkong/dot/skylake.h +61 -14
  13. package/include/numkong/dots/README.md +2 -0
  14. package/include/numkong/dots/graniteamx.h +434 -0
  15. package/include/numkong/dots/haswell.h +28 -28
  16. package/include/numkong/dots/sapphireamx.h +1 -1
  17. package/include/numkong/dots/serial.h +23 -8
  18. package/include/numkong/dots/skylake.h +28 -23
  19. package/include/numkong/dots.h +12 -0
  20. package/include/numkong/each/serial.h +18 -1
  21. package/include/numkong/geospatial/serial.h +14 -3
  22. package/include/numkong/maxsim/serial.h +15 -0
  23. package/include/numkong/mesh/README.md +50 -44
  24. package/include/numkong/mesh/genoa.h +462 -0
  25. package/include/numkong/mesh/haswell.h +806 -933
  26. package/include/numkong/mesh/neon.h +871 -943
  27. package/include/numkong/mesh/neonbfdot.h +382 -522
  28. package/include/numkong/mesh/neonfhm.h +676 -0
  29. package/include/numkong/mesh/rvv.h +404 -319
  30. package/include/numkong/mesh/serial.h +204 -162
  31. package/include/numkong/mesh/skylake.h +1029 -1585
  32. package/include/numkong/mesh/v128relaxed.h +403 -377
  33. package/include/numkong/mesh.h +38 -0
  34. package/include/numkong/reduce/serial.h +15 -1
  35. package/include/numkong/sparse/serial.h +17 -2
  36. package/include/numkong/spatial/genoa.h +0 -68
  37. package/include/numkong/spatial/haswell.h +98 -56
  38. package/include/numkong/spatial/serial.h +15 -0
  39. package/include/numkong/spatial/skylake.h +114 -54
  40. package/include/numkong/spatial.h +0 -12
  41. package/include/numkong/spatials/graniteamx.h +128 -0
  42. package/include/numkong/spatials/serial.h +18 -1
  43. package/include/numkong/spatials/skylake.h +2 -2
  44. package/include/numkong/spatials.h +17 -0
  45. package/include/numkong/tensor.hpp +107 -23
  46. package/javascript/numkong.c +3 -2
  47. package/package.json +7 -7
  48. package/wasm/numkong.wasm +0 -0
@@ -146,28 +146,28 @@ extern "C" {
146
146
 
147
147
  #define nk_define_sort_singular_values_(type) \
148
148
  NK_INTERNAL void nk_sort_singular_values_##type##_(nk_##type##_t *b, nk_##type##_t *v) { \
149
- nk_##type##_t rho1 = b[0] * b[0] + b[3] * b[3] + b[6] * b[6]; \
150
- nk_##type##_t rho2 = b[1] * b[1] + b[4] * b[4] + b[7] * b[7]; \
151
- nk_##type##_t rho3 = b[2] * b[2] + b[5] * b[5] + b[8] * b[8]; \
149
+ nk_##type##_t column_norm_squared_0 = b[0] * b[0] + b[3] * b[3] + b[6] * b[6]; \
150
+ nk_##type##_t column_norm_squared_1 = b[1] * b[1] + b[4] * b[4] + b[7] * b[7]; \
151
+ nk_##type##_t column_norm_squared_2 = b[2] * b[2] + b[5] * b[5] + b[8] * b[8]; \
152
152
  int should_swap; \
153
153
  /* Sort columns by descending singular value magnitude */ \
154
- should_swap = rho1 < rho2; \
154
+ should_swap = column_norm_squared_0 < column_norm_squared_1; \
155
155
  nk_conditional_negating_swap_##type##_(should_swap, &b[0], &b[1]); \
156
156
  nk_conditional_negating_swap_##type##_(should_swap, &v[0], &v[1]); \
157
157
  nk_conditional_negating_swap_##type##_(should_swap, &b[3], &b[4]); \
158
158
  nk_conditional_negating_swap_##type##_(should_swap, &v[3], &v[4]); \
159
159
  nk_conditional_negating_swap_##type##_(should_swap, &b[6], &b[7]); \
160
160
  nk_conditional_negating_swap_##type##_(should_swap, &v[6], &v[7]); \
161
- nk_cond_swap_##type##_(should_swap, &rho1, &rho2); \
162
- should_swap = rho1 < rho3; \
161
+ nk_cond_swap_##type##_(should_swap, &column_norm_squared_0, &column_norm_squared_1); \
162
+ should_swap = column_norm_squared_0 < column_norm_squared_2; \
163
163
  nk_conditional_negating_swap_##type##_(should_swap, &b[0], &b[2]); \
164
164
  nk_conditional_negating_swap_##type##_(should_swap, &v[0], &v[2]); \
165
165
  nk_conditional_negating_swap_##type##_(should_swap, &b[3], &b[5]); \
166
166
  nk_conditional_negating_swap_##type##_(should_swap, &v[3], &v[5]); \
167
167
  nk_conditional_negating_swap_##type##_(should_swap, &b[6], &b[8]); \
168
168
  nk_conditional_negating_swap_##type##_(should_swap, &v[6], &v[8]); \
169
- nk_cond_swap_##type##_(should_swap, &rho1, &rho3); \
170
- should_swap = rho2 < rho3; \
169
+ nk_cond_swap_##type##_(should_swap, &column_norm_squared_0, &column_norm_squared_2); \
170
+ should_swap = column_norm_squared_1 < column_norm_squared_2; \
171
171
  nk_conditional_negating_swap_##type##_(should_swap, &b[1], &b[2]); \
172
172
  nk_conditional_negating_swap_##type##_(should_swap, &v[1], &v[2]); \
173
173
  nk_conditional_negating_swap_##type##_(should_swap, &b[4], &b[5]); \
@@ -176,48 +176,32 @@ extern "C" {
176
176
  nk_conditional_negating_swap_##type##_(should_swap, &v[7], &v[8]); \
177
177
  }
178
178
 
179
- #define nk_define_qr_decomposition_(type) \
180
- NK_INTERNAL void nk_qr_decomposition_##type##_(nk_##type##_t const *input, nk_##type##_t *q, nk_##type##_t *r) { \
179
+ /* Q-only QR via three Givens rotations. The R upper-triangular factor is unused by the SVD
180
+ * caller, so we skip computing it (saves the third cos_theta/sin_theta and 9 R stores per call).
181
+ */
182
+ #define nk_define_qr_orthogonal_factor_(type) \
183
+ NK_INTERNAL void nk_qr_orthogonal_factor_##type##_(nk_##type##_t const *input, nk_##type##_t *q) { \
181
184
  nk_##type##_t cos_half_1, sin_half_1; \
182
185
  nk_##type##_t cos_half_2, sin_half_2; \
183
186
  nk_##type##_t cos_half_3, sin_half_3; \
184
187
  nk_##type##_t cos_theta, sin_theta; \
185
- nk_##type##_t rotation_temp[9], matrix_temp[9]; \
188
+ nk_##type##_t rotation_temp[9]; \
186
189
  /* First Givens rotation (zero input[3]) */ \
187
190
  nk_qr_givens_quaternion_##type##_(input[0], input[3], &cos_half_1, &sin_half_1); \
188
191
  cos_theta = 1 - 2 * sin_half_1 * sin_half_1; \
189
192
  sin_theta = 2 * cos_half_1 * sin_half_1; \
190
- rotation_temp[0] = cos_theta * input[0] + sin_theta * input[3]; \
191
193
  rotation_temp[1] = cos_theta * input[1] + sin_theta * input[4]; \
192
- rotation_temp[2] = cos_theta * input[2] + sin_theta * input[5]; \
193
- rotation_temp[3] = -sin_theta * input[0] + cos_theta * input[3]; \
194
194
  rotation_temp[4] = -sin_theta * input[1] + cos_theta * input[4]; \
195
- rotation_temp[5] = -sin_theta * input[2] + cos_theta * input[5]; \
196
- rotation_temp[6] = input[6], rotation_temp[7] = input[7]; \
197
- rotation_temp[8] = input[8]; \
198
- /* Second Givens rotation (zero rotation_temp[6]) */ \
199
- nk_qr_givens_quaternion_##type##_(rotation_temp[0], rotation_temp[6], &cos_half_2, &sin_half_2); \
195
+ rotation_temp[7] = input[7]; \
196
+ /* Second Givens rotation (only the [4]/[7] cells flow into the third Givens) */ \
197
+ nk_qr_givens_quaternion_##type##_(input[0] * cos_theta + sin_theta * input[3], input[6], &cos_half_2, \
198
+ &sin_half_2); \
200
199
  cos_theta = 1 - 2 * sin_half_2 * sin_half_2; \
201
200
  sin_theta = 2 * cos_half_2 * sin_half_2; \
202
- matrix_temp[0] = cos_theta * rotation_temp[0] + sin_theta * rotation_temp[6]; \
203
- matrix_temp[1] = cos_theta * rotation_temp[1] + sin_theta * rotation_temp[7]; \
204
- matrix_temp[2] = cos_theta * rotation_temp[2] + sin_theta * rotation_temp[8]; \
205
- matrix_temp[3] = rotation_temp[3], matrix_temp[4] = rotation_temp[4]; \
206
- matrix_temp[5] = rotation_temp[5]; \
207
- matrix_temp[6] = -sin_theta * rotation_temp[0] + cos_theta * rotation_temp[6]; \
208
- matrix_temp[7] = -sin_theta * rotation_temp[1] + cos_theta * rotation_temp[7]; \
209
- matrix_temp[8] = -sin_theta * rotation_temp[2] + cos_theta * rotation_temp[8]; \
210
- /* Third Givens rotation (zero matrix_temp[7]) */ \
211
- nk_qr_givens_quaternion_##type##_(matrix_temp[4], matrix_temp[7], &cos_half_3, &sin_half_3); \
212
- cos_theta = 1 - 2 * sin_half_3 * sin_half_3; \
213
- sin_theta = 2 * cos_half_3 * sin_half_3; \
214
- r[0] = matrix_temp[0], r[1] = matrix_temp[1], r[2] = matrix_temp[2]; \
215
- r[3] = cos_theta * matrix_temp[3] + sin_theta * matrix_temp[6]; \
216
- r[4] = cos_theta * matrix_temp[4] + sin_theta * matrix_temp[7]; \
217
- r[5] = cos_theta * matrix_temp[5] + sin_theta * matrix_temp[8]; \
218
- r[6] = -sin_theta * matrix_temp[3] + cos_theta * matrix_temp[6]; \
219
- r[7] = -sin_theta * matrix_temp[4] + cos_theta * matrix_temp[7]; \
220
- r[8] = -sin_theta * matrix_temp[5] + cos_theta * matrix_temp[8]; \
201
+ nk_##type##_t matrix_temp_4 = rotation_temp[4]; \
202
+ nk_##type##_t matrix_temp_7 = -sin_theta * rotation_temp[1] + cos_theta * rotation_temp[7]; \
203
+ /* Third Givens rotation (zero matrix_temp_7) only sin_half_3 / cos_half_3 are needed for Q */ \
204
+ nk_qr_givens_quaternion_##type##_(matrix_temp_4, matrix_temp_7, &cos_half_3, &sin_half_3); \
221
205
  /* Construct Q = Q1 * Q2 * Q3 (closed-form expressions) */ \
222
206
  nk_##type##_t sin_half_1_sq = sin_half_1 * sin_half_1; \
223
207
  nk_##type##_t sin_half_2_sq = sin_half_2 * sin_half_2; \
@@ -238,49 +222,48 @@ extern "C" {
238
222
  q[8] = (-1 + 2 * sin_half_2_sq) * (-1 + 2 * sin_half_3_sq); \
239
223
  }
240
224
 
241
- #define nk_define_svd3x3_(type, compute_sqrt) \
242
- NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *svd_u, nk_##type##_t *svd_s, \
243
- nk_##type##_t *svd_v) { \
244
- /* Compute Aᵀ * A (symmetric) */ \
245
- nk_##type##_t ata[9]; \
246
- ata[0] = nk_sum_three_squares_##type##_(a[0], a[3], a[6]); \
247
- ata[1] = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]); \
248
- ata[2] = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]); \
249
- ata[3] = ata[1]; \
250
- ata[4] = nk_sum_three_squares_##type##_(a[1], a[4], a[7]); \
251
- ata[5] = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]); \
252
- ata[6] = ata[2]; \
253
- ata[7] = ata[5]; \
254
- ata[8] = nk_sum_three_squares_##type##_(a[2], a[5], a[8]); \
255
- /* Jacobi eigenanalysis of Aᵀ * A */ \
256
- nk_##type##_t quaternion[4]; \
257
- nk_jacobi_eigenanalysis_##type##_(&ata[0], &ata[1], &ata[4], &ata[2], &ata[5], &ata[8], quaternion); \
258
- nk_quaternion_to_mat3x3_##type##_(quaternion, svd_v); \
259
- /* B = A * V */ \
260
- nk_##type##_t product[9]; \
261
- product[0] = nk_sum_three_products_##type##_(a[0], svd_v[0], a[1], svd_v[3], a[2], svd_v[6]); \
262
- product[1] = nk_sum_three_products_##type##_(a[0], svd_v[1], a[1], svd_v[4], a[2], svd_v[7]); \
263
- product[2] = nk_sum_three_products_##type##_(a[0], svd_v[2], a[1], svd_v[5], a[2], svd_v[8]); \
264
- product[3] = nk_sum_three_products_##type##_(a[3], svd_v[0], a[4], svd_v[3], a[5], svd_v[6]); \
265
- product[4] = nk_sum_three_products_##type##_(a[3], svd_v[1], a[4], svd_v[4], a[5], svd_v[7]); \
266
- product[5] = nk_sum_three_products_##type##_(a[3], svd_v[2], a[4], svd_v[5], a[5], svd_v[8]); \
267
- product[6] = nk_sum_three_products_##type##_(a[6], svd_v[0], a[7], svd_v[3], a[8], svd_v[6]); \
268
- product[7] = nk_sum_three_products_##type##_(a[6], svd_v[1], a[7], svd_v[4], a[8], svd_v[7]); \
269
- product[8] = nk_sum_three_products_##type##_(a[6], svd_v[2], a[7], svd_v[5], a[8], svd_v[8]); \
270
- /* Sort singular values and update V */ \
271
- nk_sort_singular_values_##type##_(product, svd_v); \
272
- /* Compute singular values from column norms of sorted B (before QR orthogonalizes them) */ \
273
- /* These are the true singular values: √(‖colᵢ‖²) */ \
274
- nk_##type##_t s1_sq = nk_sum_three_squares_##type##_(product[0], product[3], product[6]); \
275
- nk_##type##_t s2_sq = nk_sum_three_squares_##type##_(product[1], product[4], product[7]); \
276
- nk_##type##_t s3_sq = nk_sum_three_squares_##type##_(product[2], product[5], product[8]); \
277
- /* QR decomposition: B = U * R (we only need U for the rotation) */ \
278
- nk_##type##_t qr_r[9]; \
279
- nk_qr_decomposition_##type##_(product, svd_u, qr_r); \
280
- /* Store singular values in diagonal of svd_s (rest is zero for compatibility) */ \
281
- svd_s[0] = compute_sqrt(s1_sq), svd_s[1] = 0, svd_s[2] = 0; \
282
- svd_s[3] = 0, svd_s[4] = compute_sqrt(s2_sq), svd_s[5] = 0; \
283
- svd_s[6] = 0, svd_s[7] = 0, svd_s[8] = compute_sqrt(s3_sq); \
225
+ #define nk_define_svd3x3_(type, compute_sqrt) \
226
+ NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *svd_left, nk_##type##_t *svd_diagonal, \
227
+ nk_##type##_t *svd_right) { \
228
+ /* Compute Aᵀ * A (symmetric) */ \
229
+ nk_##type##_t ata[9]; \
230
+ ata[0] = nk_sum_three_squares_##type##_(a[0], a[3], a[6]); \
231
+ ata[1] = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]); \
232
+ ata[2] = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]); \
233
+ ata[3] = ata[1]; \
234
+ ata[4] = nk_sum_three_squares_##type##_(a[1], a[4], a[7]); \
235
+ ata[5] = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]); \
236
+ ata[6] = ata[2]; \
237
+ ata[7] = ata[5]; \
238
+ ata[8] = nk_sum_three_squares_##type##_(a[2], a[5], a[8]); \
239
+ /* Jacobi eigenanalysis of Aᵀ * A */ \
240
+ nk_##type##_t quaternion[4]; \
241
+ nk_jacobi_eigenanalysis_##type##_(&ata[0], &ata[1], &ata[4], &ata[2], &ata[5], &ata[8], quaternion); \
242
+ nk_quaternion_to_mat3x3_##type##_(quaternion, svd_right); \
243
+ /* B = A * V */ \
244
+ nk_##type##_t product[9]; \
245
+ product[0] = nk_sum_three_products_##type##_(a[0], svd_right[0], a[1], svd_right[3], a[2], svd_right[6]); \
246
+ product[1] = nk_sum_three_products_##type##_(a[0], svd_right[1], a[1], svd_right[4], a[2], svd_right[7]); \
247
+ product[2] = nk_sum_three_products_##type##_(a[0], svd_right[2], a[1], svd_right[5], a[2], svd_right[8]); \
248
+ product[3] = nk_sum_three_products_##type##_(a[3], svd_right[0], a[4], svd_right[3], a[5], svd_right[6]); \
249
+ product[4] = nk_sum_three_products_##type##_(a[3], svd_right[1], a[4], svd_right[4], a[5], svd_right[7]); \
250
+ product[5] = nk_sum_three_products_##type##_(a[3], svd_right[2], a[4], svd_right[5], a[5], svd_right[8]); \
251
+ product[6] = nk_sum_three_products_##type##_(a[6], svd_right[0], a[7], svd_right[3], a[8], svd_right[6]); \
252
+ product[7] = nk_sum_three_products_##type##_(a[6], svd_right[1], a[7], svd_right[4], a[8], svd_right[7]); \
253
+ product[8] = nk_sum_three_products_##type##_(a[6], svd_right[2], a[7], svd_right[5], a[8], svd_right[8]); \
254
+ /* Sort singular values and update V */ \
255
+ nk_sort_singular_values_##type##_(product, svd_right); \
256
+ /* Compute singular values from column norms of sorted B (before QR orthogonalizes them) */ \
257
+ /* These are the true singular values: √(‖colᵢ‖²) */ \
258
+ nk_##type##_t singular_value_squared_0 = nk_sum_three_squares_##type##_(product[0], product[3], product[6]); \
259
+ nk_##type##_t singular_value_squared_1 = nk_sum_three_squares_##type##_(product[1], product[4], product[7]); \
260
+ nk_##type##_t singular_value_squared_2 = nk_sum_three_squares_##type##_(product[2], product[5], product[8]); \
261
+ /* Q-only QR: extract U (the orthogonal factor); R is unused by SVD */ \
262
+ nk_qr_orthogonal_factor_##type##_(product, svd_left); \
263
+ /* Store singular values on diagonal positions [0], [4], [8]; off-diagonals never read by callers */ \
264
+ svd_diagonal[0] = compute_sqrt(singular_value_squared_0); \
265
+ svd_diagonal[4] = compute_sqrt(singular_value_squared_1); \
266
+ svd_diagonal[8] = compute_sqrt(singular_value_squared_2); \
284
267
  }
285
268
 
286
269
  #define nk_define_det3x3_(type) \
@@ -289,7 +272,18 @@ extern "C" {
289
272
  m[2] * (m[3] * m[7] - m[4] * m[6]); \
290
273
  }
291
274
 
292
- /* Optimize serial fallbacks for size see dots/serial.h for rationale. */
275
+ /* Keep the serial instantiations below actually scalar, regardless of build type.
276
+ * Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
277
+ * callers in dispatch_*.c, which wastes binary and breaks the nk_*_serial-as-scalar-oracle
278
+ * contract that tests and numerical-stability docs rely on. See dots/serial.h. */
279
+ #if defined(__clang__)
280
+ #pragma clang attribute push(__attribute__((noinline)), apply_to = function)
281
+ #elif defined(__GNUC__)
282
+ #pragma GCC push_options
283
+ #pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
284
+ #endif
285
+
286
+ /* Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
293
287
  #if defined(NDEBUG)
294
288
  #if defined(_MSC_VER)
295
289
  #pragma optimize("s", on)
@@ -352,27 +346,47 @@ NK_INTERNAL void nk_accumulate_square_f64_(nk_f64_t *sum, nk_f64_t *compensation
352
346
  nk_f64_dot2_(sum, compensation, value, value);
353
347
  }
354
348
 
355
- NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *svd_u, nk_f32_t const *svd_v, nk_f32_t *rotation) {
356
- rotation[0] = nk_sum_three_products_f32_(svd_v[0], svd_u[0], svd_v[1], svd_u[1], svd_v[2], svd_u[2]);
357
- rotation[1] = nk_sum_three_products_f32_(svd_v[0], svd_u[3], svd_v[1], svd_u[4], svd_v[2], svd_u[5]);
358
- rotation[2] = nk_sum_three_products_f32_(svd_v[0], svd_u[6], svd_v[1], svd_u[7], svd_v[2], svd_u[8]);
359
- rotation[3] = nk_sum_three_products_f32_(svd_v[3], svd_u[0], svd_v[4], svd_u[1], svd_v[5], svd_u[2]);
360
- rotation[4] = nk_sum_three_products_f32_(svd_v[3], svd_u[3], svd_v[4], svd_u[4], svd_v[5], svd_u[5]);
361
- rotation[5] = nk_sum_three_products_f32_(svd_v[3], svd_u[6], svd_v[4], svd_u[7], svd_v[5], svd_u[8]);
362
- rotation[6] = nk_sum_three_products_f32_(svd_v[6], svd_u[0], svd_v[7], svd_u[1], svd_v[8], svd_u[2]);
363
- rotation[7] = nk_sum_three_products_f32_(svd_v[6], svd_u[3], svd_v[7], svd_u[4], svd_v[8], svd_u[5]);
364
- rotation[8] = nk_sum_three_products_f32_(svd_v[6], svd_u[6], svd_v[7], svd_u[7], svd_v[8], svd_u[8]);
349
+ NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *svd_left, nk_f32_t const *svd_right,
350
+ nk_f32_t *rotation) {
351
+ rotation[0] = nk_sum_three_products_f32_( //
352
+ svd_right[0], svd_left[0], svd_right[1], svd_left[1], svd_right[2], svd_left[2]);
353
+ rotation[1] = nk_sum_three_products_f32_( //
354
+ svd_right[0], svd_left[3], svd_right[1], svd_left[4], svd_right[2], svd_left[5]);
355
+ rotation[2] = nk_sum_three_products_f32_( //
356
+ svd_right[0], svd_left[6], svd_right[1], svd_left[7], svd_right[2], svd_left[8]);
357
+ rotation[3] = nk_sum_three_products_f32_( //
358
+ svd_right[3], svd_left[0], svd_right[4], svd_left[1], svd_right[5], svd_left[2]);
359
+ rotation[4] = nk_sum_three_products_f32_( //
360
+ svd_right[3], svd_left[3], svd_right[4], svd_left[4], svd_right[5], svd_left[5]);
361
+ rotation[5] = nk_sum_three_products_f32_( //
362
+ svd_right[3], svd_left[6], svd_right[4], svd_left[7], svd_right[5], svd_left[8]);
363
+ rotation[6] = nk_sum_three_products_f32_( //
364
+ svd_right[6], svd_left[0], svd_right[7], svd_left[1], svd_right[8], svd_left[2]);
365
+ rotation[7] = nk_sum_three_products_f32_( //
366
+ svd_right[6], svd_left[3], svd_right[7], svd_left[4], svd_right[8], svd_left[5]);
367
+ rotation[8] = nk_sum_three_products_f32_( //
368
+ svd_right[6], svd_left[6], svd_right[7], svd_left[7], svd_right[8], svd_left[8]);
365
369
  }
366
- NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *svd_u, nk_f64_t const *svd_v, nk_f64_t *rotation) {
367
- rotation[0] = nk_sum_three_products_f64_(svd_v[0], svd_u[0], svd_v[1], svd_u[1], svd_v[2], svd_u[2]);
368
- rotation[1] = nk_sum_three_products_f64_(svd_v[0], svd_u[3], svd_v[1], svd_u[4], svd_v[2], svd_u[5]);
369
- rotation[2] = nk_sum_three_products_f64_(svd_v[0], svd_u[6], svd_v[1], svd_u[7], svd_v[2], svd_u[8]);
370
- rotation[3] = nk_sum_three_products_f64_(svd_v[3], svd_u[0], svd_v[4], svd_u[1], svd_v[5], svd_u[2]);
371
- rotation[4] = nk_sum_three_products_f64_(svd_v[3], svd_u[3], svd_v[4], svd_u[4], svd_v[5], svd_u[5]);
372
- rotation[5] = nk_sum_three_products_f64_(svd_v[3], svd_u[6], svd_v[4], svd_u[7], svd_v[5], svd_u[8]);
373
- rotation[6] = nk_sum_three_products_f64_(svd_v[6], svd_u[0], svd_v[7], svd_u[1], svd_v[8], svd_u[2]);
374
- rotation[7] = nk_sum_three_products_f64_(svd_v[6], svd_u[3], svd_v[7], svd_u[4], svd_v[8], svd_u[5]);
375
- rotation[8] = nk_sum_three_products_f64_(svd_v[6], svd_u[6], svd_v[7], svd_u[7], svd_v[8], svd_u[8]);
370
+ NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *svd_left, nk_f64_t const *svd_right,
371
+ nk_f64_t *rotation) {
372
+ rotation[0] = nk_sum_three_products_f64_( //
373
+ svd_right[0], svd_left[0], svd_right[1], svd_left[1], svd_right[2], svd_left[2]);
374
+ rotation[1] = nk_sum_three_products_f64_( //
375
+ svd_right[0], svd_left[3], svd_right[1], svd_left[4], svd_right[2], svd_left[5]);
376
+ rotation[2] = nk_sum_three_products_f64_( //
377
+ svd_right[0], svd_left[6], svd_right[1], svd_left[7], svd_right[2], svd_left[8]);
378
+ rotation[3] = nk_sum_three_products_f64_( //
379
+ svd_right[3], svd_left[0], svd_right[4], svd_left[1], svd_right[5], svd_left[2]);
380
+ rotation[4] = nk_sum_three_products_f64_( //
381
+ svd_right[3], svd_left[3], svd_right[4], svd_left[4], svd_right[5], svd_left[5]);
382
+ rotation[5] = nk_sum_three_products_f64_( //
383
+ svd_right[3], svd_left[6], svd_right[4], svd_left[7], svd_right[5], svd_left[8]);
384
+ rotation[6] = nk_sum_three_products_f64_( //
385
+ svd_right[6], svd_left[0], svd_right[7], svd_left[1], svd_right[8], svd_left[2]);
386
+ rotation[7] = nk_sum_three_products_f64_( //
387
+ svd_right[6], svd_left[3], svd_right[7], svd_left[4], svd_right[8], svd_left[5]);
388
+ rotation[8] = nk_sum_three_products_f64_( //
389
+ svd_right[6], svd_left[6], svd_right[7], svd_left[7], svd_right[8], svd_left[8]);
376
390
  }
377
391
 
378
392
  nk_define_cond_swap_(f32)
@@ -384,7 +398,7 @@ nk_define_quaternion_to_mat3x3_(f32)
384
398
  nk_define_jacobi_eigenanalysis_(f32, nk_f32_rsqrt_serial)
385
399
  nk_define_qr_givens_quaternion_(f32, NK_F32_SVD_EPSILON_, nk_f32_rsqrt_serial)
386
400
  nk_define_sort_singular_values_(f32)
387
- nk_define_qr_decomposition_(f32)
401
+ nk_define_qr_orthogonal_factor_(f32)
388
402
  nk_define_svd3x3_(f32, nk_f32_sqrt_serial)
389
403
  nk_define_det3x3_(f32)
390
404
 
@@ -397,7 +411,7 @@ nk_define_quaternion_to_mat3x3_(f64)
397
411
  nk_define_jacobi_eigenanalysis_(f64, nk_f64_rsqrt_serial)
398
412
  nk_define_qr_givens_quaternion_(f64, NK_F64_SVD_EPSILON_, nk_f64_rsqrt_serial)
399
413
  nk_define_sort_singular_values_(f64)
400
- nk_define_qr_decomposition_(f64)
414
+ nk_define_qr_orthogonal_factor_(f64)
401
415
  nk_define_svd3x3_(f64, nk_f64_sqrt_serial)
402
416
  nk_define_det3x3_(f64)
403
417
 
@@ -472,42 +486,53 @@ nk_define_det3x3_(f64)
472
486
  b_centroid[0] = (nk_##output_type##_t)centroid_b_x, b_centroid[1] = (nk_##output_type##_t)centroid_b_y, \
473
487
  b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
474
488
  /* Step 2: Build 3×3 covariance matrix H = (A - Ā)ᵀ × (B - B̄) */ \
475
- nk_##accumulator_type##_t h[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
476
- nk_##accumulator_type##_t h_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
489
+ nk_##accumulator_type##_t cross_covariance_kahan_sum[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
490
+ nk_##accumulator_type##_t cross_covariance_kahan_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
477
491
  for (nk_size_t i = 0; i < n; ++i) { \
478
492
  load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
479
493
  load_and_convert(a + i * 3 + 1, &val_a_y), load_and_convert(b + i * 3 + 1, &val_b_y); \
480
494
  load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 2, &val_b_z); \
481
495
  val_a_x -= centroid_a_x, val_a_y -= centroid_a_y, val_a_z -= centroid_a_z; \
482
496
  val_b_x -= centroid_b_x, val_b_y -= centroid_b_y, val_b_z -= centroid_b_z; \
483
- nk_accumulate_product_##accumulator_type##_(&h[0], &h_compensation[0], val_a_x, val_b_x); \
484
- nk_accumulate_product_##accumulator_type##_(&h[1], &h_compensation[1], val_a_x, val_b_y); \
485
- nk_accumulate_product_##accumulator_type##_(&h[2], &h_compensation[2], val_a_x, val_b_z); \
486
- nk_accumulate_product_##accumulator_type##_(&h[3], &h_compensation[3], val_a_y, val_b_x); \
487
- nk_accumulate_product_##accumulator_type##_(&h[4], &h_compensation[4], val_a_y, val_b_y); \
488
- nk_accumulate_product_##accumulator_type##_(&h[5], &h_compensation[5], val_a_y, val_b_z); \
489
- nk_accumulate_product_##accumulator_type##_(&h[6], &h_compensation[6], val_a_z, val_b_x); \
490
- nk_accumulate_product_##accumulator_type##_(&h[7], &h_compensation[7], val_a_z, val_b_y); \
491
- nk_accumulate_product_##accumulator_type##_(&h[8], &h_compensation[8], val_a_z, val_b_z); \
497
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[0], \
498
+ &cross_covariance_kahan_compensation[0], val_a_x, val_b_x); \
499
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[1], \
500
+ &cross_covariance_kahan_compensation[1], val_a_x, val_b_y); \
501
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[2], \
502
+ &cross_covariance_kahan_compensation[2], val_a_x, val_b_z); \
503
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[3], \
504
+ &cross_covariance_kahan_compensation[3], val_a_y, val_b_x); \
505
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[4], \
506
+ &cross_covariance_kahan_compensation[4], val_a_y, val_b_y); \
507
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[5], \
508
+ &cross_covariance_kahan_compensation[5], val_a_y, val_b_z); \
509
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[6], \
510
+ &cross_covariance_kahan_compensation[6], val_a_z, val_b_x); \
511
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[7], \
512
+ &cross_covariance_kahan_compensation[7], val_a_z, val_b_y); \
513
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[8], \
514
+ &cross_covariance_kahan_compensation[8], val_a_z, val_b_z); \
492
515
  } \
493
516
  /* Convert to svd_type for SVD */ \
494
517
  nk_##svd_type##_t cross_covariance[9]; \
495
- for (int j = 0; j < 9; ++j) cross_covariance[j] = (nk_##svd_type##_t)(h[j] + h_compensation[j]); \
518
+ for (int j = 0; j < 9; ++j) \
519
+ cross_covariance[j] = (nk_##svd_type##_t)(cross_covariance_kahan_sum[j] + \
520
+ cross_covariance_kahan_compensation[j]); \
496
521
  /* Step 3: SVD of H = U * S * Vᵀ */ \
497
- nk_##svd_type##_t svd_u[9], svd_s[9], svd_v[9]; \
498
- nk_svd3x3_##svd_type##_(cross_covariance, svd_u, svd_s, svd_v); \
522
+ nk_##svd_type##_t svd_left[9], svd_diagonal[9], svd_right[9]; \
523
+ nk_svd3x3_##svd_type##_(cross_covariance, svd_left, svd_diagonal, svd_right); \
499
524
  /* Step 4: R = V * Uᵀ */ \
500
- nk_##svd_type##_t rotation_matrix[9]; \
501
- nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
525
+ nk_##svd_type##_t optimal_rotation[9]; \
526
+ nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
502
527
  /* Handle reflection: if det(R) < 0, negate third column of V and recompute R */ \
503
- nk_##svd_type##_t rotation_det = nk_det3x3_##svd_type##_(rotation_matrix); \
504
- if (rotation_det < 0) { \
505
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8]; \
506
- nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
528
+ nk_##svd_type##_t rotation_determinant = nk_det3x3_##svd_type##_(optimal_rotation); \
529
+ if (rotation_determinant < 0) { \
530
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8]; \
531
+ nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
507
532
  } \
508
533
  /* Output rotation matrix and scale=1.0 */ \
509
534
  if (rotation) \
510
- for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)rotation_matrix[j]; \
535
+ for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)optimal_rotation[j]; \
511
536
  if (scale) *scale = (nk_##output_type##_t)1; \
512
537
  /* Step 5: Compute RMSD after rotation */ \
513
538
  nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
@@ -522,12 +547,12 @@ nk_define_det3x3_(f64)
522
547
  point_b[0] = (nk_##svd_type##_t)(val_b_x - centroid_b_x), \
523
548
  point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y), \
524
549
  point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
525
- rotated_point_a[0] = rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] + \
526
- rotation_matrix[2] * point_a[2]; \
527
- rotated_point_a[1] = rotation_matrix[3] * point_a[0] + rotation_matrix[4] * point_a[1] + \
528
- rotation_matrix[5] * point_a[2]; \
529
- rotated_point_a[2] = rotation_matrix[6] * point_a[0] + rotation_matrix[7] * point_a[1] + \
530
- rotation_matrix[8] * point_a[2]; \
550
+ rotated_point_a[0] = optimal_rotation[0] * point_a[0] + optimal_rotation[1] * point_a[1] + \
551
+ optimal_rotation[2] * point_a[2]; \
552
+ rotated_point_a[1] = optimal_rotation[3] * point_a[0] + optimal_rotation[4] * point_a[1] + \
553
+ optimal_rotation[5] * point_a[2]; \
554
+ rotated_point_a[2] = optimal_rotation[6] * point_a[0] + optimal_rotation[7] * point_a[1] + \
555
+ optimal_rotation[8] * point_a[2]; \
531
556
  nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
532
557
  nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
533
558
  nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
@@ -583,8 +608,8 @@ nk_define_det3x3_(f64)
583
608
  b_centroid[0] = (nk_##output_type##_t)centroid_b_x, b_centroid[1] = (nk_##output_type##_t)centroid_b_y, \
584
609
  b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
585
610
  /* Step 2: Build covariance matrix H and compute variance of A */ \
586
- nk_##accumulator_type##_t h[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
587
- nk_##accumulator_type##_t h_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
611
+ nk_##accumulator_type##_t cross_covariance_kahan_sum[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
612
+ nk_##accumulator_type##_t cross_covariance_kahan_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
588
613
  nk_##accumulator_type##_t variance_a = 0, variance_a_compensation = 0; \
589
614
  for (nk_size_t i = 0; i < n; ++i) { \
590
615
  load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
@@ -595,41 +620,52 @@ nk_define_det3x3_(f64)
595
620
  nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_x); \
596
621
  nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_y); \
597
622
  nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_z); \
598
- nk_accumulate_product_##accumulator_type##_(&h[0], &h_compensation[0], val_a_x, val_b_x); \
599
- nk_accumulate_product_##accumulator_type##_(&h[1], &h_compensation[1], val_a_x, val_b_y); \
600
- nk_accumulate_product_##accumulator_type##_(&h[2], &h_compensation[2], val_a_x, val_b_z); \
601
- nk_accumulate_product_##accumulator_type##_(&h[3], &h_compensation[3], val_a_y, val_b_x); \
602
- nk_accumulate_product_##accumulator_type##_(&h[4], &h_compensation[4], val_a_y, val_b_y); \
603
- nk_accumulate_product_##accumulator_type##_(&h[5], &h_compensation[5], val_a_y, val_b_z); \
604
- nk_accumulate_product_##accumulator_type##_(&h[6], &h_compensation[6], val_a_z, val_b_x); \
605
- nk_accumulate_product_##accumulator_type##_(&h[7], &h_compensation[7], val_a_z, val_b_y); \
606
- nk_accumulate_product_##accumulator_type##_(&h[8], &h_compensation[8], val_a_z, val_b_z); \
623
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[0], \
624
+ &cross_covariance_kahan_compensation[0], val_a_x, val_b_x); \
625
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[1], \
626
+ &cross_covariance_kahan_compensation[1], val_a_x, val_b_y); \
627
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[2], \
628
+ &cross_covariance_kahan_compensation[2], val_a_x, val_b_z); \
629
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[3], \
630
+ &cross_covariance_kahan_compensation[3], val_a_y, val_b_x); \
631
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[4], \
632
+ &cross_covariance_kahan_compensation[4], val_a_y, val_b_y); \
633
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[5], \
634
+ &cross_covariance_kahan_compensation[5], val_a_y, val_b_z); \
635
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[6], \
636
+ &cross_covariance_kahan_compensation[6], val_a_z, val_b_x); \
637
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[7], \
638
+ &cross_covariance_kahan_compensation[7], val_a_z, val_b_y); \
639
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[8], \
640
+ &cross_covariance_kahan_compensation[8], val_a_z, val_b_z); \
607
641
  } \
608
642
  variance_a = (variance_a + variance_a_compensation) * inv_n; \
609
643
  /* Convert to svd_type for SVD */ \
610
644
  nk_##svd_type##_t cross_covariance[9]; \
611
- for (int j = 0; j < 9; ++j) cross_covariance[j] = (nk_##svd_type##_t)(h[j] + h_compensation[j]); \
645
+ for (int j = 0; j < 9; ++j) \
646
+ cross_covariance[j] = (nk_##svd_type##_t)(cross_covariance_kahan_sum[j] + \
647
+ cross_covariance_kahan_compensation[j]); \
612
648
  /* Step 3: SVD of H = U * S * Vᵀ */ \
613
- nk_##svd_type##_t svd_u[9], svd_s[9], svd_v[9]; \
614
- nk_svd3x3_##svd_type##_(cross_covariance, svd_u, svd_s, svd_v); \
649
+ nk_##svd_type##_t svd_left[9], svd_diagonal[9], svd_right[9]; \
650
+ nk_svd3x3_##svd_type##_(cross_covariance, svd_left, svd_diagonal, svd_right); \
615
651
  /* Step 4: R = V * Uᵀ */ \
616
- nk_##svd_type##_t rotation_matrix[9]; \
617
- nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
652
+ nk_##svd_type##_t optimal_rotation[9]; \
653
+ nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
618
654
  /* Handle reflection and compute scale: c = trace(D × S) / variance(a) */ \
619
- /* D = diag(1, 1, det(R)), svd_s contains proper positive singular values on diagonal */ \
620
- nk_##svd_type##_t rotation_det = nk_det3x3_##svd_type##_(rotation_matrix); \
621
- nk_##svd_type##_t sign_det = rotation_det < 0 ? (nk_##svd_type##_t) - 1.0 : (nk_##svd_type##_t)1.0; \
622
- nk_##svd_type##_t trace_scaled_s = svd_s[0] + svd_s[4] + sign_det * svd_s[8]; \
655
+ /* D = diag(1, 1, det(R)), svd_diagonal contains proper positive singular values on diagonal */ \
656
+ nk_##svd_type##_t rotation_determinant = nk_det3x3_##svd_type##_(optimal_rotation); \
657
+ nk_##svd_type##_t sign_det = rotation_determinant < 0 ? (nk_##svd_type##_t) - 1.0 : (nk_##svd_type##_t)1.0; \
658
+ nk_##svd_type##_t trace_scaled_s = svd_diagonal[0] + svd_diagonal[4] + sign_det * svd_diagonal[8]; \
623
659
  nk_##accumulator_type##_t scale_factor = (nk_##accumulator_type##_t)trace_scaled_s / \
624
660
  ((nk_##accumulator_type##_t)n * variance_a); \
625
661
  if (scale) *scale = (nk_##output_type##_t)scale_factor; \
626
- if (rotation_det < 0) { \
627
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8]; \
628
- nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
662
+ if (rotation_determinant < 0) { \
663
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8]; \
664
+ nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
629
665
  } \
630
666
  /* Output rotation matrix */ \
631
667
  if (rotation) \
632
- for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)rotation_matrix[j]; \
668
+ for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)optimal_rotation[j]; \
633
669
  /* Step 5: Compute RMSD after similarity transform: ‖c × R × a - b‖ */ \
634
670
  nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
635
671
  for (nk_size_t i = 0; i < n; ++i) { \
@@ -644,14 +680,14 @@ nk_define_det3x3_(f64)
644
680
  point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y), \
645
681
  point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
646
682
  rotated_point_a[0] = (nk_##svd_type##_t)scale_factor * \
647
- (rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] + \
648
- rotation_matrix[2] * point_a[2]); \
683
+ (optimal_rotation[0] * point_a[0] + optimal_rotation[1] * point_a[1] + \
684
+ optimal_rotation[2] * point_a[2]); \
649
685
  rotated_point_a[1] = (nk_##svd_type##_t)scale_factor * \
650
- (rotation_matrix[3] * point_a[0] + rotation_matrix[4] * point_a[1] + \
651
- rotation_matrix[5] * point_a[2]); \
686
+ (optimal_rotation[3] * point_a[0] + optimal_rotation[4] * point_a[1] + \
687
+ optimal_rotation[5] * point_a[2]); \
652
688
  rotated_point_a[2] = (nk_##svd_type##_t)scale_factor * \
653
- (rotation_matrix[6] * point_a[0] + rotation_matrix[7] * point_a[1] + \
654
- rotation_matrix[8] * point_a[2]); \
689
+ (optimal_rotation[6] * point_a[0] + optimal_rotation[7] * point_a[1] + \
690
+ optimal_rotation[8] * point_a[2]); \
655
691
  nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
656
692
  nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
657
693
  nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
@@ -697,7 +733,7 @@ nk_define_umeyama_(bf16, f32, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_
697
733
  #undef nk_define_jacobi_eigenanalysis_
698
734
  #undef nk_define_qr_givens_quaternion_
699
735
  #undef nk_define_sort_singular_values_
700
- #undef nk_define_qr_decomposition_
736
+ #undef nk_define_qr_orthogonal_factor_
701
737
  #undef nk_define_svd3x3_
702
738
  #undef nk_define_det3x3_
703
739
  #undef nk_define_rmsd_
@@ -714,6 +750,12 @@ nk_define_umeyama_(bf16, f32, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_
714
750
  #endif
715
751
  #endif
716
752
 
753
+ #if defined(__clang__)
754
+ #pragma clang attribute pop
755
+ #elif defined(__GNUC__)
756
+ #pragma GCC pop_options
757
+ #endif
758
+
717
759
  #if defined(__cplusplus)
718
760
  } // extern "C"
719
761
  #endif