numkong 7.4.5 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +1 -0
  2. package/binding.gyp +99 -5
  3. package/c/dispatch_e5m2.c +23 -3
  4. package/c/dispatch_f16.c +23 -0
  5. package/c/numkong.c +0 -13
  6. package/include/numkong/attention/sme.h +34 -31
  7. package/include/numkong/capabilities.h +2 -15
  8. package/include/numkong/cast/README.md +3 -0
  9. package/include/numkong/cast/haswell.h +28 -64
  10. package/include/numkong/cast/neon.h +15 -0
  11. package/include/numkong/cast/serial.h +17 -0
  12. package/include/numkong/cast/skylake.h +67 -52
  13. package/include/numkong/cast.h +1 -0
  14. package/include/numkong/curved/smef64.h +82 -62
  15. package/include/numkong/dot/README.md +1 -0
  16. package/include/numkong/dot/haswell.h +92 -13
  17. package/include/numkong/dot/rvvbf16.h +1 -1
  18. package/include/numkong/dot/rvvhalf.h +1 -1
  19. package/include/numkong/dot/serial.h +15 -0
  20. package/include/numkong/dot/skylake.h +61 -14
  21. package/include/numkong/dot/sve.h +6 -5
  22. package/include/numkong/dot/svebfdot.h +2 -1
  23. package/include/numkong/dot/svehalf.h +6 -5
  24. package/include/numkong/dot/svesdot.h +3 -2
  25. package/include/numkong/dots/README.md +2 -0
  26. package/include/numkong/dots/graniteamx.h +1167 -0
  27. package/include/numkong/dots/haswell.h +28 -28
  28. package/include/numkong/dots/sapphireamx.h +1 -1
  29. package/include/numkong/dots/serial.h +33 -11
  30. package/include/numkong/dots/skylake.h +28 -23
  31. package/include/numkong/dots/sme.h +172 -140
  32. package/include/numkong/dots/smebi32.h +14 -11
  33. package/include/numkong/dots/smef64.h +31 -26
  34. package/include/numkong/dots.h +41 -3
  35. package/include/numkong/each/serial.h +39 -0
  36. package/include/numkong/geospatial/haswell.h +1 -1
  37. package/include/numkong/geospatial/neon.h +1 -1
  38. package/include/numkong/geospatial/serial.h +15 -4
  39. package/include/numkong/geospatial/skylake.h +1 -1
  40. package/include/numkong/maxsim/serial.h +15 -0
  41. package/include/numkong/maxsim/sme.h +34 -33
  42. package/include/numkong/mesh/README.md +50 -44
  43. package/include/numkong/mesh/genoa.h +462 -0
  44. package/include/numkong/mesh/haswell.h +806 -933
  45. package/include/numkong/mesh/neon.h +871 -943
  46. package/include/numkong/mesh/neonbfdot.h +382 -522
  47. package/include/numkong/mesh/neonfhm.h +676 -0
  48. package/include/numkong/mesh/rvv.h +404 -319
  49. package/include/numkong/mesh/serial.h +225 -161
  50. package/include/numkong/mesh/skylake.h +1029 -1585
  51. package/include/numkong/mesh/v128relaxed.h +403 -377
  52. package/include/numkong/mesh.h +38 -0
  53. package/include/numkong/reduce/neon.h +29 -0
  54. package/include/numkong/reduce/neonbfdot.h +2 -2
  55. package/include/numkong/reduce/neonfhm.h +4 -4
  56. package/include/numkong/reduce/serial.h +15 -1
  57. package/include/numkong/reduce/sve.h +52 -0
  58. package/include/numkong/reduce.h +4 -0
  59. package/include/numkong/set/sve.h +6 -5
  60. package/include/numkong/sets/smebi32.h +35 -30
  61. package/include/numkong/sparse/serial.h +17 -2
  62. package/include/numkong/sparse/sve2.h +3 -2
  63. package/include/numkong/spatial/genoa.h +0 -68
  64. package/include/numkong/spatial/haswell.h +98 -56
  65. package/include/numkong/spatial/serial.h +15 -0
  66. package/include/numkong/spatial/skylake.h +114 -54
  67. package/include/numkong/spatial/sve.h +7 -6
  68. package/include/numkong/spatial/svebfdot.h +7 -4
  69. package/include/numkong/spatial/svehalf.h +5 -4
  70. package/include/numkong/spatial/svesdot.h +9 -8
  71. package/include/numkong/spatial.h +0 -12
  72. package/include/numkong/spatials/graniteamx.h +301 -0
  73. package/include/numkong/spatials/serial.h +39 -0
  74. package/include/numkong/spatials/skylake.h +2 -2
  75. package/include/numkong/spatials/sme.h +391 -350
  76. package/include/numkong/spatials/smef64.h +79 -70
  77. package/include/numkong/spatials.h +54 -4
  78. package/include/numkong/tensor.hpp +107 -23
  79. package/include/numkong/types.h +59 -0
  80. package/javascript/dist/cjs/numkong.js +13 -0
  81. package/javascript/dist/esm/numkong.js +13 -0
  82. package/javascript/numkong.c +59 -14
  83. package/javascript/numkong.ts +13 -0
  84. package/package.json +7 -7
  85. package/probes/probe.js +2 -2
  86. package/wasm/numkong.wasm +0 -0
@@ -146,28 +146,28 @@ extern "C" {
146
146
 
147
147
  #define nk_define_sort_singular_values_(type) \
148
148
  NK_INTERNAL void nk_sort_singular_values_##type##_(nk_##type##_t *b, nk_##type##_t *v) { \
149
- nk_##type##_t rho1 = b[0] * b[0] + b[3] * b[3] + b[6] * b[6]; \
150
- nk_##type##_t rho2 = b[1] * b[1] + b[4] * b[4] + b[7] * b[7]; \
151
- nk_##type##_t rho3 = b[2] * b[2] + b[5] * b[5] + b[8] * b[8]; \
149
+ nk_##type##_t column_norm_squared_0 = b[0] * b[0] + b[3] * b[3] + b[6] * b[6]; \
150
+ nk_##type##_t column_norm_squared_1 = b[1] * b[1] + b[4] * b[4] + b[7] * b[7]; \
151
+ nk_##type##_t column_norm_squared_2 = b[2] * b[2] + b[5] * b[5] + b[8] * b[8]; \
152
152
  int should_swap; \
153
153
  /* Sort columns by descending singular value magnitude */ \
154
- should_swap = rho1 < rho2; \
154
+ should_swap = column_norm_squared_0 < column_norm_squared_1; \
155
155
  nk_conditional_negating_swap_##type##_(should_swap, &b[0], &b[1]); \
156
156
  nk_conditional_negating_swap_##type##_(should_swap, &v[0], &v[1]); \
157
157
  nk_conditional_negating_swap_##type##_(should_swap, &b[3], &b[4]); \
158
158
  nk_conditional_negating_swap_##type##_(should_swap, &v[3], &v[4]); \
159
159
  nk_conditional_negating_swap_##type##_(should_swap, &b[6], &b[7]); \
160
160
  nk_conditional_negating_swap_##type##_(should_swap, &v[6], &v[7]); \
161
- nk_cond_swap_##type##_(should_swap, &rho1, &rho2); \
162
- should_swap = rho1 < rho3; \
161
+ nk_cond_swap_##type##_(should_swap, &column_norm_squared_0, &column_norm_squared_1); \
162
+ should_swap = column_norm_squared_0 < column_norm_squared_2; \
163
163
  nk_conditional_negating_swap_##type##_(should_swap, &b[0], &b[2]); \
164
164
  nk_conditional_negating_swap_##type##_(should_swap, &v[0], &v[2]); \
165
165
  nk_conditional_negating_swap_##type##_(should_swap, &b[3], &b[5]); \
166
166
  nk_conditional_negating_swap_##type##_(should_swap, &v[3], &v[5]); \
167
167
  nk_conditional_negating_swap_##type##_(should_swap, &b[6], &b[8]); \
168
168
  nk_conditional_negating_swap_##type##_(should_swap, &v[6], &v[8]); \
169
- nk_cond_swap_##type##_(should_swap, &rho1, &rho3); \
170
- should_swap = rho2 < rho3; \
169
+ nk_cond_swap_##type##_(should_swap, &column_norm_squared_0, &column_norm_squared_2); \
170
+ should_swap = column_norm_squared_1 < column_norm_squared_2; \
171
171
  nk_conditional_negating_swap_##type##_(should_swap, &b[1], &b[2]); \
172
172
  nk_conditional_negating_swap_##type##_(should_swap, &v[1], &v[2]); \
173
173
  nk_conditional_negating_swap_##type##_(should_swap, &b[4], &b[5]); \
@@ -176,48 +176,32 @@ extern "C" {
176
176
  nk_conditional_negating_swap_##type##_(should_swap, &v[7], &v[8]); \
177
177
  }
178
178
 
179
- #define nk_define_qr_decomposition_(type) \
180
- NK_INTERNAL void nk_qr_decomposition_##type##_(nk_##type##_t const *input, nk_##type##_t *q, nk_##type##_t *r) { \
179
+ /* Q-only QR via three Givens rotations. The R upper-triangular factor is unused by the SVD
180
+ * caller, so we skip computing it (saves the third cos_theta/sin_theta and 9 R stores per call).
181
+ */
182
+ #define nk_define_qr_orthogonal_factor_(type) \
183
+ NK_INTERNAL void nk_qr_orthogonal_factor_##type##_(nk_##type##_t const *input, nk_##type##_t *q) { \
181
184
  nk_##type##_t cos_half_1, sin_half_1; \
182
185
  nk_##type##_t cos_half_2, sin_half_2; \
183
186
  nk_##type##_t cos_half_3, sin_half_3; \
184
187
  nk_##type##_t cos_theta, sin_theta; \
185
- nk_##type##_t rotation_temp[9], matrix_temp[9]; \
188
+ nk_##type##_t rotation_temp[9]; \
186
189
  /* First Givens rotation (zero input[3]) */ \
187
190
  nk_qr_givens_quaternion_##type##_(input[0], input[3], &cos_half_1, &sin_half_1); \
188
191
  cos_theta = 1 - 2 * sin_half_1 * sin_half_1; \
189
192
  sin_theta = 2 * cos_half_1 * sin_half_1; \
190
- rotation_temp[0] = cos_theta * input[0] + sin_theta * input[3]; \
191
193
  rotation_temp[1] = cos_theta * input[1] + sin_theta * input[4]; \
192
- rotation_temp[2] = cos_theta * input[2] + sin_theta * input[5]; \
193
- rotation_temp[3] = -sin_theta * input[0] + cos_theta * input[3]; \
194
194
  rotation_temp[4] = -sin_theta * input[1] + cos_theta * input[4]; \
195
- rotation_temp[5] = -sin_theta * input[2] + cos_theta * input[5]; \
196
- rotation_temp[6] = input[6], rotation_temp[7] = input[7]; \
197
- rotation_temp[8] = input[8]; \
198
- /* Second Givens rotation (zero rotation_temp[6]) */ \
199
- nk_qr_givens_quaternion_##type##_(rotation_temp[0], rotation_temp[6], &cos_half_2, &sin_half_2); \
195
+ rotation_temp[7] = input[7]; \
196
+ /* Second Givens rotation (only the [4]/[7] cells flow into the third Givens) */ \
197
+ nk_qr_givens_quaternion_##type##_(input[0] * cos_theta + sin_theta * input[3], input[6], &cos_half_2, \
198
+ &sin_half_2); \
200
199
  cos_theta = 1 - 2 * sin_half_2 * sin_half_2; \
201
200
  sin_theta = 2 * cos_half_2 * sin_half_2; \
202
- matrix_temp[0] = cos_theta * rotation_temp[0] + sin_theta * rotation_temp[6]; \
203
- matrix_temp[1] = cos_theta * rotation_temp[1] + sin_theta * rotation_temp[7]; \
204
- matrix_temp[2] = cos_theta * rotation_temp[2] + sin_theta * rotation_temp[8]; \
205
- matrix_temp[3] = rotation_temp[3], matrix_temp[4] = rotation_temp[4]; \
206
- matrix_temp[5] = rotation_temp[5]; \
207
- matrix_temp[6] = -sin_theta * rotation_temp[0] + cos_theta * rotation_temp[6]; \
208
- matrix_temp[7] = -sin_theta * rotation_temp[1] + cos_theta * rotation_temp[7]; \
209
- matrix_temp[8] = -sin_theta * rotation_temp[2] + cos_theta * rotation_temp[8]; \
210
- /* Third Givens rotation (zero matrix_temp[7]) */ \
211
- nk_qr_givens_quaternion_##type##_(matrix_temp[4], matrix_temp[7], &cos_half_3, &sin_half_3); \
212
- cos_theta = 1 - 2 * sin_half_3 * sin_half_3; \
213
- sin_theta = 2 * cos_half_3 * sin_half_3; \
214
- r[0] = matrix_temp[0], r[1] = matrix_temp[1], r[2] = matrix_temp[2]; \
215
- r[3] = cos_theta * matrix_temp[3] + sin_theta * matrix_temp[6]; \
216
- r[4] = cos_theta * matrix_temp[4] + sin_theta * matrix_temp[7]; \
217
- r[5] = cos_theta * matrix_temp[5] + sin_theta * matrix_temp[8]; \
218
- r[6] = -sin_theta * matrix_temp[3] + cos_theta * matrix_temp[6]; \
219
- r[7] = -sin_theta * matrix_temp[4] + cos_theta * matrix_temp[7]; \
220
- r[8] = -sin_theta * matrix_temp[5] + cos_theta * matrix_temp[8]; \
201
+ nk_##type##_t matrix_temp_4 = rotation_temp[4]; \
202
+ nk_##type##_t matrix_temp_7 = -sin_theta * rotation_temp[1] + cos_theta * rotation_temp[7]; \
203
+ /* Third Givens rotation (zero matrix_temp_7) only sin_half_3 / cos_half_3 are needed for Q */ \
204
+ nk_qr_givens_quaternion_##type##_(matrix_temp_4, matrix_temp_7, &cos_half_3, &sin_half_3); \
221
205
  /* Construct Q = Q1 * Q2 * Q3 (closed-form expressions) */ \
222
206
  nk_##type##_t sin_half_1_sq = sin_half_1 * sin_half_1; \
223
207
  nk_##type##_t sin_half_2_sq = sin_half_2 * sin_half_2; \
@@ -238,49 +222,48 @@ extern "C" {
238
222
  q[8] = (-1 + 2 * sin_half_2_sq) * (-1 + 2 * sin_half_3_sq); \
239
223
  }
240
224
 
241
- #define nk_define_svd3x3_(type, compute_sqrt) \
242
- NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *svd_u, nk_##type##_t *svd_s, \
243
- nk_##type##_t *svd_v) { \
244
- /* Compute Aᵀ * A (symmetric) */ \
245
- nk_##type##_t ata[9]; \
246
- ata[0] = nk_sum_three_squares_##type##_(a[0], a[3], a[6]); \
247
- ata[1] = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]); \
248
- ata[2] = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]); \
249
- ata[3] = ata[1]; \
250
- ata[4] = nk_sum_three_squares_##type##_(a[1], a[4], a[7]); \
251
- ata[5] = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]); \
252
- ata[6] = ata[2]; \
253
- ata[7] = ata[5]; \
254
- ata[8] = nk_sum_three_squares_##type##_(a[2], a[5], a[8]); \
255
- /* Jacobi eigenanalysis of Aᵀ * A */ \
256
- nk_##type##_t quaternion[4]; \
257
- nk_jacobi_eigenanalysis_##type##_(&ata[0], &ata[1], &ata[4], &ata[2], &ata[5], &ata[8], quaternion); \
258
- nk_quaternion_to_mat3x3_##type##_(quaternion, svd_v); \
259
- /* B = A * V */ \
260
- nk_##type##_t product[9]; \
261
- product[0] = nk_sum_three_products_##type##_(a[0], svd_v[0], a[1], svd_v[3], a[2], svd_v[6]); \
262
- product[1] = nk_sum_three_products_##type##_(a[0], svd_v[1], a[1], svd_v[4], a[2], svd_v[7]); \
263
- product[2] = nk_sum_three_products_##type##_(a[0], svd_v[2], a[1], svd_v[5], a[2], svd_v[8]); \
264
- product[3] = nk_sum_three_products_##type##_(a[3], svd_v[0], a[4], svd_v[3], a[5], svd_v[6]); \
265
- product[4] = nk_sum_three_products_##type##_(a[3], svd_v[1], a[4], svd_v[4], a[5], svd_v[7]); \
266
- product[5] = nk_sum_three_products_##type##_(a[3], svd_v[2], a[4], svd_v[5], a[5], svd_v[8]); \
267
- product[6] = nk_sum_three_products_##type##_(a[6], svd_v[0], a[7], svd_v[3], a[8], svd_v[6]); \
268
- product[7] = nk_sum_three_products_##type##_(a[6], svd_v[1], a[7], svd_v[4], a[8], svd_v[7]); \
269
- product[8] = nk_sum_three_products_##type##_(a[6], svd_v[2], a[7], svd_v[5], a[8], svd_v[8]); \
270
- /* Sort singular values and update V */ \
271
- nk_sort_singular_values_##type##_(product, svd_v); \
272
- /* Compute singular values from column norms of sorted B (before QR orthogonalizes them) */ \
273
- /* These are the true singular values: √(‖colᵢ‖²) */ \
274
- nk_##type##_t s1_sq = nk_sum_three_squares_##type##_(product[0], product[3], product[6]); \
275
- nk_##type##_t s2_sq = nk_sum_three_squares_##type##_(product[1], product[4], product[7]); \
276
- nk_##type##_t s3_sq = nk_sum_three_squares_##type##_(product[2], product[5], product[8]); \
277
- /* QR decomposition: B = U * R (we only need U for the rotation) */ \
278
- nk_##type##_t qr_r[9]; \
279
- nk_qr_decomposition_##type##_(product, svd_u, qr_r); \
280
- /* Store singular values in diagonal of svd_s (rest is zero for compatibility) */ \
281
- svd_s[0] = compute_sqrt(s1_sq), svd_s[1] = 0, svd_s[2] = 0; \
282
- svd_s[3] = 0, svd_s[4] = compute_sqrt(s2_sq), svd_s[5] = 0; \
283
- svd_s[6] = 0, svd_s[7] = 0, svd_s[8] = compute_sqrt(s3_sq); \
225
+ #define nk_define_svd3x3_(type, compute_sqrt) \
226
+ NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *svd_left, nk_##type##_t *svd_diagonal, \
227
+ nk_##type##_t *svd_right) { \
228
+ /* Compute Aᵀ * A (symmetric) */ \
229
+ nk_##type##_t ata[9]; \
230
+ ata[0] = nk_sum_three_squares_##type##_(a[0], a[3], a[6]); \
231
+ ata[1] = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]); \
232
+ ata[2] = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]); \
233
+ ata[3] = ata[1]; \
234
+ ata[4] = nk_sum_three_squares_##type##_(a[1], a[4], a[7]); \
235
+ ata[5] = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]); \
236
+ ata[6] = ata[2]; \
237
+ ata[7] = ata[5]; \
238
+ ata[8] = nk_sum_three_squares_##type##_(a[2], a[5], a[8]); \
239
+ /* Jacobi eigenanalysis of Aᵀ * A */ \
240
+ nk_##type##_t quaternion[4]; \
241
+ nk_jacobi_eigenanalysis_##type##_(&ata[0], &ata[1], &ata[4], &ata[2], &ata[5], &ata[8], quaternion); \
242
+ nk_quaternion_to_mat3x3_##type##_(quaternion, svd_right); \
243
+ /* B = A * V */ \
244
+ nk_##type##_t product[9]; \
245
+ product[0] = nk_sum_three_products_##type##_(a[0], svd_right[0], a[1], svd_right[3], a[2], svd_right[6]); \
246
+ product[1] = nk_sum_three_products_##type##_(a[0], svd_right[1], a[1], svd_right[4], a[2], svd_right[7]); \
247
+ product[2] = nk_sum_three_products_##type##_(a[0], svd_right[2], a[1], svd_right[5], a[2], svd_right[8]); \
248
+ product[3] = nk_sum_three_products_##type##_(a[3], svd_right[0], a[4], svd_right[3], a[5], svd_right[6]); \
249
+ product[4] = nk_sum_three_products_##type##_(a[3], svd_right[1], a[4], svd_right[4], a[5], svd_right[7]); \
250
+ product[5] = nk_sum_three_products_##type##_(a[3], svd_right[2], a[4], svd_right[5], a[5], svd_right[8]); \
251
+ product[6] = nk_sum_three_products_##type##_(a[6], svd_right[0], a[7], svd_right[3], a[8], svd_right[6]); \
252
+ product[7] = nk_sum_three_products_##type##_(a[6], svd_right[1], a[7], svd_right[4], a[8], svd_right[7]); \
253
+ product[8] = nk_sum_three_products_##type##_(a[6], svd_right[2], a[7], svd_right[5], a[8], svd_right[8]); \
254
+ /* Sort singular values and update V */ \
255
+ nk_sort_singular_values_##type##_(product, svd_right); \
256
+ /* Compute singular values from column norms of sorted B (before QR orthogonalizes them) */ \
257
+ /* These are the true singular values: √(‖colᵢ‖²) */ \
258
+ nk_##type##_t singular_value_squared_0 = nk_sum_three_squares_##type##_(product[0], product[3], product[6]); \
259
+ nk_##type##_t singular_value_squared_1 = nk_sum_three_squares_##type##_(product[1], product[4], product[7]); \
260
+ nk_##type##_t singular_value_squared_2 = nk_sum_three_squares_##type##_(product[2], product[5], product[8]); \
261
+ /* Q-only QR: extract U (the orthogonal factor); R is unused by SVD */ \
262
+ nk_qr_orthogonal_factor_##type##_(product, svd_left); \
263
+ /* Store singular values on diagonal positions [0], [4], [8]; off-diagonals never read by callers */ \
264
+ svd_diagonal[0] = compute_sqrt(singular_value_squared_0); \
265
+ svd_diagonal[4] = compute_sqrt(singular_value_squared_1); \
266
+ svd_diagonal[8] = compute_sqrt(singular_value_squared_2); \
284
267
  }
285
268
 
286
269
  #define nk_define_det3x3_(type) \
@@ -289,6 +272,29 @@ extern "C" {
289
272
  m[2] * (m[3] * m[7] - m[4] * m[6]); \
290
273
  }
291
274
 
275
+ /* Keep the serial instantiations below actually scalar, regardless of build type.
276
+ * Without this, -O3 + LTO can vectorize or clone the serial kernels under AVX-512
277
+ * callers in dispatch_*.c, which wastes binary and breaks the nk_*_serial-as-scalar-oracle
278
+ * contract that tests and numerical-stability docs rely on. See dots/serial.h. */
279
+ #if defined(__clang__)
280
+ #pragma clang attribute push(__attribute__((noinline)), apply_to = function)
281
+ #elif defined(__GNUC__)
282
+ #pragma GCC push_options
283
+ #pragma GCC optimize("no-tree-vectorize", "no-tree-slp-vectorize", "no-ipa-cp-clone", "no-inline")
284
+ #endif
285
+
286
+ /* Size bias for release. Gated on NDEBUG so Debug builds keep -O0 for stepping. */
287
+ #if defined(NDEBUG)
288
+ #if defined(_MSC_VER)
289
+ #pragma optimize("s", on)
290
+ #elif defined(__clang__)
291
+ #pragma clang attribute push(__attribute__((minsize)), apply_to = function)
292
+ #elif defined(__GNUC__)
293
+ #pragma GCC push_options
294
+ #pragma GCC optimize("Os")
295
+ #endif
296
+ #endif
297
+
292
298
  NK_INTERNAL nk_f32_t nk_sum_three_products_f32_(nk_f32_t left_0, nk_f32_t right_0, nk_f32_t left_1, nk_f32_t right_1,
293
299
  nk_f32_t left_2, nk_f32_t right_2) {
294
300
  return left_0 * right_0 + left_1 * right_1 + left_2 * right_2;
@@ -340,27 +346,47 @@ NK_INTERNAL void nk_accumulate_square_f64_(nk_f64_t *sum, nk_f64_t *compensation
340
346
  nk_f64_dot2_(sum, compensation, value, value);
341
347
  }
342
348
 
343
- NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *svd_u, nk_f32_t const *svd_v, nk_f32_t *rotation) {
344
- rotation[0] = nk_sum_three_products_f32_(svd_v[0], svd_u[0], svd_v[1], svd_u[1], svd_v[2], svd_u[2]);
345
- rotation[1] = nk_sum_three_products_f32_(svd_v[0], svd_u[3], svd_v[1], svd_u[4], svd_v[2], svd_u[5]);
346
- rotation[2] = nk_sum_three_products_f32_(svd_v[0], svd_u[6], svd_v[1], svd_u[7], svd_v[2], svd_u[8]);
347
- rotation[3] = nk_sum_three_products_f32_(svd_v[3], svd_u[0], svd_v[4], svd_u[1], svd_v[5], svd_u[2]);
348
- rotation[4] = nk_sum_three_products_f32_(svd_v[3], svd_u[3], svd_v[4], svd_u[4], svd_v[5], svd_u[5]);
349
- rotation[5] = nk_sum_three_products_f32_(svd_v[3], svd_u[6], svd_v[4], svd_u[7], svd_v[5], svd_u[8]);
350
- rotation[6] = nk_sum_three_products_f32_(svd_v[6], svd_u[0], svd_v[7], svd_u[1], svd_v[8], svd_u[2]);
351
- rotation[7] = nk_sum_three_products_f32_(svd_v[6], svd_u[3], svd_v[7], svd_u[4], svd_v[8], svd_u[5]);
352
- rotation[8] = nk_sum_three_products_f32_(svd_v[6], svd_u[6], svd_v[7], svd_u[7], svd_v[8], svd_u[8]);
349
+ NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *svd_left, nk_f32_t const *svd_right,
350
+ nk_f32_t *rotation) {
351
+ rotation[0] = nk_sum_three_products_f32_( //
352
+ svd_right[0], svd_left[0], svd_right[1], svd_left[1], svd_right[2], svd_left[2]);
353
+ rotation[1] = nk_sum_three_products_f32_( //
354
+ svd_right[0], svd_left[3], svd_right[1], svd_left[4], svd_right[2], svd_left[5]);
355
+ rotation[2] = nk_sum_three_products_f32_( //
356
+ svd_right[0], svd_left[6], svd_right[1], svd_left[7], svd_right[2], svd_left[8]);
357
+ rotation[3] = nk_sum_three_products_f32_( //
358
+ svd_right[3], svd_left[0], svd_right[4], svd_left[1], svd_right[5], svd_left[2]);
359
+ rotation[4] = nk_sum_three_products_f32_( //
360
+ svd_right[3], svd_left[3], svd_right[4], svd_left[4], svd_right[5], svd_left[5]);
361
+ rotation[5] = nk_sum_three_products_f32_( //
362
+ svd_right[3], svd_left[6], svd_right[4], svd_left[7], svd_right[5], svd_left[8]);
363
+ rotation[6] = nk_sum_three_products_f32_( //
364
+ svd_right[6], svd_left[0], svd_right[7], svd_left[1], svd_right[8], svd_left[2]);
365
+ rotation[7] = nk_sum_three_products_f32_( //
366
+ svd_right[6], svd_left[3], svd_right[7], svd_left[4], svd_right[8], svd_left[5]);
367
+ rotation[8] = nk_sum_three_products_f32_( //
368
+ svd_right[6], svd_left[6], svd_right[7], svd_left[7], svd_right[8], svd_left[8]);
353
369
  }
354
- NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *svd_u, nk_f64_t const *svd_v, nk_f64_t *rotation) {
355
- rotation[0] = nk_sum_three_products_f64_(svd_v[0], svd_u[0], svd_v[1], svd_u[1], svd_v[2], svd_u[2]);
356
- rotation[1] = nk_sum_three_products_f64_(svd_v[0], svd_u[3], svd_v[1], svd_u[4], svd_v[2], svd_u[5]);
357
- rotation[2] = nk_sum_three_products_f64_(svd_v[0], svd_u[6], svd_v[1], svd_u[7], svd_v[2], svd_u[8]);
358
- rotation[3] = nk_sum_three_products_f64_(svd_v[3], svd_u[0], svd_v[4], svd_u[1], svd_v[5], svd_u[2]);
359
- rotation[4] = nk_sum_three_products_f64_(svd_v[3], svd_u[3], svd_v[4], svd_u[4], svd_v[5], svd_u[5]);
360
- rotation[5] = nk_sum_three_products_f64_(svd_v[3], svd_u[6], svd_v[4], svd_u[7], svd_v[5], svd_u[8]);
361
- rotation[6] = nk_sum_three_products_f64_(svd_v[6], svd_u[0], svd_v[7], svd_u[1], svd_v[8], svd_u[2]);
362
- rotation[7] = nk_sum_three_products_f64_(svd_v[6], svd_u[3], svd_v[7], svd_u[4], svd_v[8], svd_u[5]);
363
- rotation[8] = nk_sum_three_products_f64_(svd_v[6], svd_u[6], svd_v[7], svd_u[7], svd_v[8], svd_u[8]);
370
+ NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *svd_left, nk_f64_t const *svd_right,
371
+ nk_f64_t *rotation) {
372
+ rotation[0] = nk_sum_three_products_f64_( //
373
+ svd_right[0], svd_left[0], svd_right[1], svd_left[1], svd_right[2], svd_left[2]);
374
+ rotation[1] = nk_sum_three_products_f64_( //
375
+ svd_right[0], svd_left[3], svd_right[1], svd_left[4], svd_right[2], svd_left[5]);
376
+ rotation[2] = nk_sum_three_products_f64_( //
377
+ svd_right[0], svd_left[6], svd_right[1], svd_left[7], svd_right[2], svd_left[8]);
378
+ rotation[3] = nk_sum_three_products_f64_( //
379
+ svd_right[3], svd_left[0], svd_right[4], svd_left[1], svd_right[5], svd_left[2]);
380
+ rotation[4] = nk_sum_three_products_f64_( //
381
+ svd_right[3], svd_left[3], svd_right[4], svd_left[4], svd_right[5], svd_left[5]);
382
+ rotation[5] = nk_sum_three_products_f64_( //
383
+ svd_right[3], svd_left[6], svd_right[4], svd_left[7], svd_right[5], svd_left[8]);
384
+ rotation[6] = nk_sum_three_products_f64_( //
385
+ svd_right[6], svd_left[0], svd_right[7], svd_left[1], svd_right[8], svd_left[2]);
386
+ rotation[7] = nk_sum_three_products_f64_( //
387
+ svd_right[6], svd_left[3], svd_right[7], svd_left[4], svd_right[8], svd_left[5]);
388
+ rotation[8] = nk_sum_three_products_f64_( //
389
+ svd_right[6], svd_left[6], svd_right[7], svd_left[7], svd_right[8], svd_left[8]);
364
390
  }
365
391
 
366
392
  nk_define_cond_swap_(f32)
@@ -372,7 +398,7 @@ nk_define_quaternion_to_mat3x3_(f32)
372
398
  nk_define_jacobi_eigenanalysis_(f32, nk_f32_rsqrt_serial)
373
399
  nk_define_qr_givens_quaternion_(f32, NK_F32_SVD_EPSILON_, nk_f32_rsqrt_serial)
374
400
  nk_define_sort_singular_values_(f32)
375
- nk_define_qr_decomposition_(f32)
401
+ nk_define_qr_orthogonal_factor_(f32)
376
402
  nk_define_svd3x3_(f32, nk_f32_sqrt_serial)
377
403
  nk_define_det3x3_(f32)
378
404
 
@@ -385,7 +411,7 @@ nk_define_quaternion_to_mat3x3_(f64)
385
411
  nk_define_jacobi_eigenanalysis_(f64, nk_f64_rsqrt_serial)
386
412
  nk_define_qr_givens_quaternion_(f64, NK_F64_SVD_EPSILON_, nk_f64_rsqrt_serial)
387
413
  nk_define_sort_singular_values_(f64)
388
- nk_define_qr_decomposition_(f64)
414
+ nk_define_qr_orthogonal_factor_(f64)
389
415
  nk_define_svd3x3_(f64, nk_f64_sqrt_serial)
390
416
  nk_define_det3x3_(f64)
391
417
 
@@ -460,42 +486,53 @@ nk_define_det3x3_(f64)
460
486
  b_centroid[0] = (nk_##output_type##_t)centroid_b_x, b_centroid[1] = (nk_##output_type##_t)centroid_b_y, \
461
487
  b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
462
488
  /* Step 2: Build 3×3 covariance matrix H = (A - Ā)ᵀ × (B - B̄) */ \
463
- nk_##accumulator_type##_t h[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
464
- nk_##accumulator_type##_t h_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
489
+ nk_##accumulator_type##_t cross_covariance_kahan_sum[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
490
+ nk_##accumulator_type##_t cross_covariance_kahan_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
465
491
  for (nk_size_t i = 0; i < n; ++i) { \
466
492
  load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
467
493
  load_and_convert(a + i * 3 + 1, &val_a_y), load_and_convert(b + i * 3 + 1, &val_b_y); \
468
494
  load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 2, &val_b_z); \
469
495
  val_a_x -= centroid_a_x, val_a_y -= centroid_a_y, val_a_z -= centroid_a_z; \
470
496
  val_b_x -= centroid_b_x, val_b_y -= centroid_b_y, val_b_z -= centroid_b_z; \
471
- nk_accumulate_product_##accumulator_type##_(&h[0], &h_compensation[0], val_a_x, val_b_x); \
472
- nk_accumulate_product_##accumulator_type##_(&h[1], &h_compensation[1], val_a_x, val_b_y); \
473
- nk_accumulate_product_##accumulator_type##_(&h[2], &h_compensation[2], val_a_x, val_b_z); \
474
- nk_accumulate_product_##accumulator_type##_(&h[3], &h_compensation[3], val_a_y, val_b_x); \
475
- nk_accumulate_product_##accumulator_type##_(&h[4], &h_compensation[4], val_a_y, val_b_y); \
476
- nk_accumulate_product_##accumulator_type##_(&h[5], &h_compensation[5], val_a_y, val_b_z); \
477
- nk_accumulate_product_##accumulator_type##_(&h[6], &h_compensation[6], val_a_z, val_b_x); \
478
- nk_accumulate_product_##accumulator_type##_(&h[7], &h_compensation[7], val_a_z, val_b_y); \
479
- nk_accumulate_product_##accumulator_type##_(&h[8], &h_compensation[8], val_a_z, val_b_z); \
497
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[0], \
498
+ &cross_covariance_kahan_compensation[0], val_a_x, val_b_x); \
499
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[1], \
500
+ &cross_covariance_kahan_compensation[1], val_a_x, val_b_y); \
501
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[2], \
502
+ &cross_covariance_kahan_compensation[2], val_a_x, val_b_z); \
503
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[3], \
504
+ &cross_covariance_kahan_compensation[3], val_a_y, val_b_x); \
505
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[4], \
506
+ &cross_covariance_kahan_compensation[4], val_a_y, val_b_y); \
507
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[5], \
508
+ &cross_covariance_kahan_compensation[5], val_a_y, val_b_z); \
509
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[6], \
510
+ &cross_covariance_kahan_compensation[6], val_a_z, val_b_x); \
511
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[7], \
512
+ &cross_covariance_kahan_compensation[7], val_a_z, val_b_y); \
513
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[8], \
514
+ &cross_covariance_kahan_compensation[8], val_a_z, val_b_z); \
480
515
  } \
481
516
  /* Convert to svd_type for SVD */ \
482
517
  nk_##svd_type##_t cross_covariance[9]; \
483
- for (int j = 0; j < 9; ++j) cross_covariance[j] = (nk_##svd_type##_t)(h[j] + h_compensation[j]); \
518
+ for (int j = 0; j < 9; ++j) \
519
+ cross_covariance[j] = (nk_##svd_type##_t)(cross_covariance_kahan_sum[j] + \
520
+ cross_covariance_kahan_compensation[j]); \
484
521
  /* Step 3: SVD of H = U * S * Vᵀ */ \
485
- nk_##svd_type##_t svd_u[9], svd_s[9], svd_v[9]; \
486
- nk_svd3x3_##svd_type##_(cross_covariance, svd_u, svd_s, svd_v); \
522
+ nk_##svd_type##_t svd_left[9], svd_diagonal[9], svd_right[9]; \
523
+ nk_svd3x3_##svd_type##_(cross_covariance, svd_left, svd_diagonal, svd_right); \
487
524
  /* Step 4: R = V * Uᵀ */ \
488
- nk_##svd_type##_t rotation_matrix[9]; \
489
- nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
525
+ nk_##svd_type##_t optimal_rotation[9]; \
526
+ nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
490
527
  /* Handle reflection: if det(R) < 0, negate third column of V and recompute R */ \
491
- nk_##svd_type##_t rotation_det = nk_det3x3_##svd_type##_(rotation_matrix); \
492
- if (rotation_det < 0) { \
493
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8]; \
494
- nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
528
+ nk_##svd_type##_t rotation_determinant = nk_det3x3_##svd_type##_(optimal_rotation); \
529
+ if (rotation_determinant < 0) { \
530
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8]; \
531
+ nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
495
532
  } \
496
533
  /* Output rotation matrix and scale=1.0 */ \
497
534
  if (rotation) \
498
- for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)rotation_matrix[j]; \
535
+ for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)optimal_rotation[j]; \
499
536
  if (scale) *scale = (nk_##output_type##_t)1; \
500
537
  /* Step 5: Compute RMSD after rotation */ \
501
538
  nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
@@ -510,12 +547,12 @@ nk_define_det3x3_(f64)
510
547
  point_b[0] = (nk_##svd_type##_t)(val_b_x - centroid_b_x), \
511
548
  point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y), \
512
549
  point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
513
- rotated_point_a[0] = rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] + \
514
- rotation_matrix[2] * point_a[2]; \
515
- rotated_point_a[1] = rotation_matrix[3] * point_a[0] + rotation_matrix[4] * point_a[1] + \
516
- rotation_matrix[5] * point_a[2]; \
517
- rotated_point_a[2] = rotation_matrix[6] * point_a[0] + rotation_matrix[7] * point_a[1] + \
518
- rotation_matrix[8] * point_a[2]; \
550
+ rotated_point_a[0] = optimal_rotation[0] * point_a[0] + optimal_rotation[1] * point_a[1] + \
551
+ optimal_rotation[2] * point_a[2]; \
552
+ rotated_point_a[1] = optimal_rotation[3] * point_a[0] + optimal_rotation[4] * point_a[1] + \
553
+ optimal_rotation[5] * point_a[2]; \
554
+ rotated_point_a[2] = optimal_rotation[6] * point_a[0] + optimal_rotation[7] * point_a[1] + \
555
+ optimal_rotation[8] * point_a[2]; \
519
556
  nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
520
557
  nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
521
558
  nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
@@ -571,8 +608,8 @@ nk_define_det3x3_(f64)
571
608
  b_centroid[0] = (nk_##output_type##_t)centroid_b_x, b_centroid[1] = (nk_##output_type##_t)centroid_b_y, \
572
609
  b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
573
610
  /* Step 2: Build covariance matrix H and compute variance of A */ \
574
- nk_##accumulator_type##_t h[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
575
- nk_##accumulator_type##_t h_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
611
+ nk_##accumulator_type##_t cross_covariance_kahan_sum[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
612
+ nk_##accumulator_type##_t cross_covariance_kahan_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
576
613
  nk_##accumulator_type##_t variance_a = 0, variance_a_compensation = 0; \
577
614
  for (nk_size_t i = 0; i < n; ++i) { \
578
615
  load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
@@ -583,41 +620,52 @@ nk_define_det3x3_(f64)
583
620
  nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_x); \
584
621
  nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_y); \
585
622
  nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_z); \
586
- nk_accumulate_product_##accumulator_type##_(&h[0], &h_compensation[0], val_a_x, val_b_x); \
587
- nk_accumulate_product_##accumulator_type##_(&h[1], &h_compensation[1], val_a_x, val_b_y); \
588
- nk_accumulate_product_##accumulator_type##_(&h[2], &h_compensation[2], val_a_x, val_b_z); \
589
- nk_accumulate_product_##accumulator_type##_(&h[3], &h_compensation[3], val_a_y, val_b_x); \
590
- nk_accumulate_product_##accumulator_type##_(&h[4], &h_compensation[4], val_a_y, val_b_y); \
591
- nk_accumulate_product_##accumulator_type##_(&h[5], &h_compensation[5], val_a_y, val_b_z); \
592
- nk_accumulate_product_##accumulator_type##_(&h[6], &h_compensation[6], val_a_z, val_b_x); \
593
- nk_accumulate_product_##accumulator_type##_(&h[7], &h_compensation[7], val_a_z, val_b_y); \
594
- nk_accumulate_product_##accumulator_type##_(&h[8], &h_compensation[8], val_a_z, val_b_z); \
623
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[0], \
624
+ &cross_covariance_kahan_compensation[0], val_a_x, val_b_x); \
625
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[1], \
626
+ &cross_covariance_kahan_compensation[1], val_a_x, val_b_y); \
627
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[2], \
628
+ &cross_covariance_kahan_compensation[2], val_a_x, val_b_z); \
629
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[3], \
630
+ &cross_covariance_kahan_compensation[3], val_a_y, val_b_x); \
631
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[4], \
632
+ &cross_covariance_kahan_compensation[4], val_a_y, val_b_y); \
633
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[5], \
634
+ &cross_covariance_kahan_compensation[5], val_a_y, val_b_z); \
635
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[6], \
636
+ &cross_covariance_kahan_compensation[6], val_a_z, val_b_x); \
637
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[7], \
638
+ &cross_covariance_kahan_compensation[7], val_a_z, val_b_y); \
639
+ nk_accumulate_product_##accumulator_type##_(&cross_covariance_kahan_sum[8], \
640
+ &cross_covariance_kahan_compensation[8], val_a_z, val_b_z); \
595
641
  } \
596
642
  variance_a = (variance_a + variance_a_compensation) * inv_n; \
597
643
  /* Convert to svd_type for SVD */ \
598
644
  nk_##svd_type##_t cross_covariance[9]; \
599
- for (int j = 0; j < 9; ++j) cross_covariance[j] = (nk_##svd_type##_t)(h[j] + h_compensation[j]); \
645
+ for (int j = 0; j < 9; ++j) \
646
+ cross_covariance[j] = (nk_##svd_type##_t)(cross_covariance_kahan_sum[j] + \
647
+ cross_covariance_kahan_compensation[j]); \
600
648
  /* Step 3: SVD of H = U * S * Vᵀ */ \
601
- nk_##svd_type##_t svd_u[9], svd_s[9], svd_v[9]; \
602
- nk_svd3x3_##svd_type##_(cross_covariance, svd_u, svd_s, svd_v); \
649
+ nk_##svd_type##_t svd_left[9], svd_diagonal[9], svd_right[9]; \
650
+ nk_svd3x3_##svd_type##_(cross_covariance, svd_left, svd_diagonal, svd_right); \
603
651
  /* Step 4: R = V * Uᵀ */ \
604
- nk_##svd_type##_t rotation_matrix[9]; \
605
- nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
652
+ nk_##svd_type##_t optimal_rotation[9]; \
653
+ nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
606
654
  /* Handle reflection and compute scale: c = trace(D × S) / variance(a) */ \
607
- /* D = diag(1, 1, det(R)), svd_s contains proper positive singular values on diagonal */ \
608
- nk_##svd_type##_t rotation_det = nk_det3x3_##svd_type##_(rotation_matrix); \
609
- nk_##svd_type##_t sign_det = rotation_det < 0 ? (nk_##svd_type##_t) - 1.0 : (nk_##svd_type##_t)1.0; \
610
- nk_##svd_type##_t trace_scaled_s = svd_s[0] + svd_s[4] + sign_det * svd_s[8]; \
655
+ /* D = diag(1, 1, det(R)), svd_diagonal contains proper positive singular values on diagonal */ \
656
+ nk_##svd_type##_t rotation_determinant = nk_det3x3_##svd_type##_(optimal_rotation); \
657
+ nk_##svd_type##_t sign_det = rotation_determinant < 0 ? (nk_##svd_type##_t) - 1.0 : (nk_##svd_type##_t)1.0; \
658
+ nk_##svd_type##_t trace_scaled_s = svd_diagonal[0] + svd_diagonal[4] + sign_det * svd_diagonal[8]; \
611
659
  nk_##accumulator_type##_t scale_factor = (nk_##accumulator_type##_t)trace_scaled_s / \
612
660
  ((nk_##accumulator_type##_t)n * variance_a); \
613
661
  if (scale) *scale = (nk_##output_type##_t)scale_factor; \
614
- if (rotation_det < 0) { \
615
- svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8]; \
616
- nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
662
+ if (rotation_determinant < 0) { \
663
+ svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8]; \
664
+ nk_rotation_from_svd_##svd_type##_serial_(svd_left, svd_right, optimal_rotation); \
617
665
  } \
618
666
  /* Output rotation matrix */ \
619
667
  if (rotation) \
620
- for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)rotation_matrix[j]; \
668
+ for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)optimal_rotation[j]; \
621
669
  /* Step 5: Compute RMSD after similarity transform: ‖c × R × a - b‖ */ \
622
670
  nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
623
671
  for (nk_size_t i = 0; i < n; ++i) { \
@@ -632,14 +680,14 @@ nk_define_det3x3_(f64)
632
680
  point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y), \
633
681
  point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
634
682
  rotated_point_a[0] = (nk_##svd_type##_t)scale_factor * \
635
- (rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] + \
636
- rotation_matrix[2] * point_a[2]); \
683
+ (optimal_rotation[0] * point_a[0] + optimal_rotation[1] * point_a[1] + \
684
+ optimal_rotation[2] * point_a[2]); \
637
685
  rotated_point_a[1] = (nk_##svd_type##_t)scale_factor * \
638
- (rotation_matrix[3] * point_a[0] + rotation_matrix[4] * point_a[1] + \
639
- rotation_matrix[5] * point_a[2]); \
686
+ (optimal_rotation[3] * point_a[0] + optimal_rotation[4] * point_a[1] + \
687
+ optimal_rotation[5] * point_a[2]); \
640
688
  rotated_point_a[2] = (nk_##svd_type##_t)scale_factor * \
641
- (rotation_matrix[6] * point_a[0] + rotation_matrix[7] * point_a[1] + \
642
- rotation_matrix[8] * point_a[2]); \
689
+ (optimal_rotation[6] * point_a[0] + optimal_rotation[7] * point_a[1] + \
690
+ optimal_rotation[8] * point_a[2]); \
643
691
  nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
644
692
  nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
645
693
  nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
@@ -685,13 +733,29 @@ nk_define_umeyama_(bf16, f32, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_
685
733
  #undef nk_define_jacobi_eigenanalysis_
686
734
  #undef nk_define_qr_givens_quaternion_
687
735
  #undef nk_define_sort_singular_values_
688
- #undef nk_define_qr_decomposition_
736
+ #undef nk_define_qr_orthogonal_factor_
689
737
  #undef nk_define_svd3x3_
690
738
  #undef nk_define_det3x3_
691
739
  #undef nk_define_rmsd_
692
740
  #undef nk_define_kabsch_
693
741
  #undef nk_define_umeyama_
694
742
 
743
+ #if defined(NDEBUG)
744
+ #if defined(_MSC_VER)
745
+ #pragma optimize("", on)
746
+ #elif defined(__clang__)
747
+ #pragma clang attribute pop
748
+ #elif defined(__GNUC__)
749
+ #pragma GCC pop_options
750
+ #endif
751
+ #endif
752
+
753
+ #if defined(__clang__)
754
+ #pragma clang attribute pop
755
+ #elif defined(__GNUC__)
756
+ #pragma GCC pop_options
757
+ #endif
758
+
695
759
  #if defined(__cplusplus)
696
760
  } // extern "C"
697
761
  #endif