numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294)
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,742 @@
1
+ /**
2
+ * @brief SWAR-accelerated Point Cloud Alignment for SIMD-free CPUs.
3
+ * @file include/numkong/mesh/serial.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/mesh.h
8
+ */
9
+ #ifndef NK_MESH_SERIAL_H
10
+ #define NK_MESH_SERIAL_H
11
+
12
+ #include "numkong/types.h"
13
+ #include "numkong/scalar/serial.h"
14
+ #include "numkong/spatial/serial.h" // `nk_f32_sqrt_serial`, `nk_f64_sqrt_serial`
15
+
16
+ #if defined(__cplusplus)
17
+ extern "C" {
18
+ #endif
19
+
20
+ /* Constants for the McAdams 3×3 SVD algorithm.
21
+ * γ = √8 + 3 = 3 + 2√2 ≈ 5.828427124
22
+ * cstar = cos(π/8), sstar = sin(π/8)
23
+ */
24
+ #define NK_F32_SVD_GAMMA_ 5.828427124f /* 3 + 2√2; presumably the `gamma` argument of the approximate-Givens macro - confirm at the instantiation site */
24
+ #define NK_F32_SVD_CSTAR_ 0.923879532f /* cos(π/8) in single precision */
25
+ #define NK_F32_SVD_SSTAR_ 0.3826834323f /* sin(π/8) in single precision */
26
+ #define NK_F32_SVD_EPSILON_ 1e-6f /* presumably the `epsilon` argument of the QR-Givens macro - confirm at the instantiation site */
28
+
29
+ #define NK_F64_SVD_GAMMA_ 5.828427124746190 /* 3 + 2√2; presumably the `gamma` argument of the approximate-Givens macro - confirm at the instantiation site */
29
+ #define NK_F64_SVD_CSTAR_ 0.9238795325112867 /* cos(π/8) in double precision */
30
+ #define NK_F64_SVD_SSTAR_ 0.3826834323650898 /* sin(π/8) in double precision */
31
+ #define NK_F64_SVD_EPSILON_ 1e-12 /* presumably the `epsilon` argument of the QR-Givens macro - confirm at the instantiation site */
33
+
34
+ /* Type-Generic SVD Helper Macros
35
+ * These macros generate f32 and f64 versions of SVD helper functions
36
+ * used by the Kabsch and Umeyama algorithms.
37
+ */
38
+
39
/**
 *  @brief Emits `nk_cond_swap_<type>_`, which exchanges `*x` and `*y` when `c` is non-zero.
 *
 *  Both locations are always rewritten through a select on `c`; when `c` is zero
 *  each one simply receives its own previous value.
 */
#define nk_define_cond_swap_(type) \
    NK_INTERNAL void nk_cond_swap_##type##_(int c, nk_##type##_t *x, nk_##type##_t *y) { \
        nk_##type##_t const old_x = *x; \
        nk_##type##_t const old_y = *y; \
        *x = c ? old_y : old_x; \
        *y = c ? old_x : old_y; \
    }
45
+
46
/**
 *  @brief Emits `nk_conditional_negating_swap_<type>_`.
 *
 *  When `c` is non-zero the pair `(*x, *y)` becomes `(*y, -*x)`;
 *  when `c` is zero both values are left as they were.
 */
#define nk_define_conditional_negating_swap_(type) \
    NK_INTERNAL void nk_conditional_negating_swap_##type##_(int c, nk_##type##_t *x, nk_##type##_t *y) { \
        nk_##type##_t const old_x = *x; \
        nk_##type##_t const old_y = *y; \
        *x = c ? old_y : old_x; \
        *y = c ? -old_x : old_y; \
    }
52
+
53
/**
 *  @brief Emits `nk_approximate_givens_quaternion_<type>_`.
 *
 *  Builds the candidate pair `(2 * (a11 - a22), a12)` and scales it by `compute_rsqrt`
 *  of its squared length. The scaled pair is returned when
 *  `gamma * a12² < (2 * (a11 - a22))²`; otherwise the fixed pair `(cstar, sstar)` is returned.
 *
 *  @param a11,a12,a22  Entries of the symmetric 2x2 block being diagonalised.
 *  @param cos_half     Receives the first component of the resulting pair.
 *  @param sin_half     Receives the second component of the resulting pair.
 */
#define nk_define_approximate_givens_quaternion_(type, gamma, cstar, sstar, compute_rsqrt) \
    NK_INTERNAL void nk_approximate_givens_quaternion_##type##_( \
        nk_##type##_t a11, nk_##type##_t a12, nk_##type##_t a22, nk_##type##_t *cos_half, nk_##type##_t *sin_half) { \
        nk_##type##_t const raw_cos = 2 * (a11 - a22); \
        nk_##type##_t const raw_sin = a12; \
        int const keep_computed = gamma * raw_sin * raw_sin < raw_cos * raw_cos; \
        /* The reciprocal length is evaluated unconditionally, even if the fallback is chosen. */ \
        nk_##type##_t const inv_length = compute_rsqrt(raw_cos * raw_cos + raw_sin * raw_sin); \
        *cos_half = keep_computed ? inv_length * raw_cos : cstar; \
        *sin_half = keep_computed ? inv_length * raw_sin : sstar; \
    }
62
+
63
/**
 *  @brief Emits `nk_jacobi_conjugation_<type>_`: one Jacobi step on a symmetric 3x3 matrix.
 *
 *  The matrix is held as six scalars (`s11, s21, s22, s31, s32, s33`). The step
 *  rotates the leading 2x2 block using the pair from `nk_approximate_givens_quaternion_<type>_`,
 *  folds the same rotation into `quaternion` (x, y, z at indices 0..2, w at index 3),
 *  and finally stores the entries cyclically permuted so that the next call works on
 *  the next off-diagonal pair.
 *
 *  @param idx_x,idx_y,idx_z  Indices in 0..2 selecting which quaternion lanes play the
 *                            x, y and z roles for this step.
 */
#define nk_define_jacobi_conjugation_(type) \
    NK_INTERNAL void nk_jacobi_conjugation_##type##_( \
        int idx_x, int idx_y, int idx_z, nk_##type##_t *s11, nk_##type##_t *s21, nk_##type##_t *s22, \
        nk_##type##_t *s31, nk_##type##_t *s32, nk_##type##_t *s33, nk_##type##_t *quaternion) { \
        /* Half-angle pair for the rotation of the leading 2x2 block. */ \
        nk_##type##_t half_c, half_s; \
        nk_approximate_givens_quaternion_##type##_(*s11, *s21, *s22, &half_c, &half_s); \
        /* Full-angle cosine and sine, divided by the squared length of the pair. */ \
        nk_##type##_t const length_sq = half_c * half_c + half_s * half_s; \
        nk_##type##_t const c = (half_c * half_c - half_s * half_s) / length_sq; \
        nk_##type##_t const s = (2 * half_s * half_c) / length_sq; \
        /* Snapshot all six entries before anything is written back. */ \
        nk_##type##_t const in11 = *s11, in21 = *s21, in22 = *s22; \
        nk_##type##_t const in31 = *s31, in32 = *s32, in33 = *s33; \
        /* One-sided products shared by the conjugated 2x2 entries. */ \
        nk_##type##_t const upper_first = c * in11 + s * in21; \
        nk_##type##_t const upper_second = c * in21 + s * in22; \
        nk_##type##_t const lower_first = -s * in11 + c * in21; \
        nk_##type##_t const lower_second = -s * in21 + c * in22; \
        /* Conjugated matrix entries, still in the unpermuted layout. */ \
        nk_##type##_t const out11 = c * upper_first + s * upper_second; \
        nk_##type##_t const out21 = c * lower_first + s * lower_second; \
        nk_##type##_t const out22 = -s * lower_first + c * lower_second; \
        nk_##type##_t const out31 = c * in31 + s * in32; \
        nk_##type##_t const out32 = -s * in31 + c * in32; \
        nk_##type##_t const out33 = in33; \
        /* Fold the step into the accumulated quaternion. */ \
        nk_##type##_t const vec_sin[3] = {quaternion[0] * half_s, quaternion[1] * half_s, quaternion[2] * half_s}; \
        nk_##type##_t const w_sin = half_s * quaternion[3]; \
        for (int lane = 0; lane < 4; ++lane) { quaternion[lane] *= half_c; } \
        quaternion[idx_z] += w_sin; \
        quaternion[3] -= vec_sin[idx_z]; \
        quaternion[idx_x] += vec_sin[idx_y]; \
        quaternion[idx_y] -= vec_sin[idx_x]; \
        /* Store with the cyclic permutation: (11,21,22,31,32,33) <- (22,32,33,21,31,11). */ \
        *s11 = out22; \
        *s21 = out32; \
        *s22 = out33; \
        *s31 = out21; \
        *s32 = out31; \
        *s33 = out11; \
    }
99
+
100
/**
 *  @brief Emits `nk_quaternion_to_mat3x3_<type>_`: expands a quaternion into a row-major 3x3 matrix.
 *
 *  @param quat    Four components laid out as (x, y, z, w).
 *  @param matrix  Receives nine values, row by row.
 */
#define nk_define_quaternion_to_mat3x3_(type) \
    NK_INTERNAL void nk_quaternion_to_mat3x3_##type##_(nk_##type##_t const *quat, nk_##type##_t *matrix) { \
        nk_##type##_t const qx = quat[0]; \
        nk_##type##_t const qy = quat[1]; \
        nk_##type##_t const qz = quat[2]; \
        nk_##type##_t const qw = quat[3]; \
        /* Pairwise products, each computed once. */ \
        nk_##type##_t const xx = qx * qx, yy = qy * qy, zz = qz * qz; \
        nk_##type##_t const xz = qx * qz, xy = qx * qy, yz = qy * qz; \
        nk_##type##_t const wx = qw * qx, wy = qw * qy, wz = qw * qz; \
        /* First row. */ \
        matrix[0] = 1 - 2 * (yy + zz); \
        matrix[1] = 2 * (xy - wz); \
        matrix[2] = 2 * (xz + wy); \
        /* Second row. */ \
        matrix[3] = 2 * (xy + wz); \
        matrix[4] = 1 - 2 * (xx + zz); \
        matrix[5] = 2 * (yz - wx); \
        /* Third row. */ \
        matrix[6] = 2 * (xz - wy); \
        matrix[7] = 2 * (yz + wx); \
        matrix[8] = 1 - 2 * (xx + yy); \
    }
113
+
114
/**
 *  @brief Emits `nk_jacobi_eigenanalysis_<type>_`: iterated Jacobi steps on a symmetric 3x3 matrix.
 *
 *  Starts from the identity quaternion (0, 0, 0, 1), performs 16 sweeps of three
 *  conjugations each, and rescales the accumulated quaternion by `compute_rsqrt`
 *  of its squared length. The six matrix scalars are updated in place.
 */
#define nk_define_jacobi_eigenanalysis_(type, compute_rsqrt) \
    NK_INTERNAL void nk_jacobi_eigenanalysis_##type##_(nk_##type##_t *s11, nk_##type##_t *s21, nk_##type##_t *s22, \
                                                       nk_##type##_t *s31, nk_##type##_t *s32, nk_##type##_t *s33, \
                                                       nk_##type##_t *quaternion) { \
        quaternion[0] = 0; \
        quaternion[1] = 0; \
        quaternion[2] = 0; \
        quaternion[3] = 1; \
        /* A fixed count of 16 sweeps helps convergence for repeated eigenvalues and near-identity inputs. */ \
        for (int sweep = 0; sweep != 16; ++sweep) { \
            nk_jacobi_conjugation_##type##_(0, 1, 2, s11, s21, s22, s31, s32, s33, quaternion); \
            nk_jacobi_conjugation_##type##_(1, 2, 0, s11, s21, s22, s31, s32, s33, quaternion); \
            nk_jacobi_conjugation_##type##_(2, 0, 1, s11, s21, s22, s31, s32, s33, quaternion); \
        } \
        nk_##type##_t const inv_length = compute_rsqrt(quaternion[0] * quaternion[0] + quaternion[1] * quaternion[1] + \
                                                       quaternion[2] * quaternion[2] + quaternion[3] * quaternion[3]); \
        for (int lane = 0; lane < 4; ++lane) { quaternion[lane] *= inv_length; } \
    }
130
+
131
/**
 *  @brief Emits `nk_qr_givens_quaternion_<type>_`: half-angle pair of the Givens rotation for `(a1, a2)`.
 *
 *  The length `rho` of `(a1, a2)` is obtained as `x * compute_rsqrt(x)` with `x = a1² + a2²`
 *  and forced to zero unless `x > epsilon` (note: the squared length is what is compared
 *  against `epsilon` here). The unnormalised pair is `(|a1| + max(rho, epsilon), a2 or 0)`,
 *  its components are exchanged when `a1 < 0`, and the result is scaled by `compute_rsqrt`
 *  of its squared length.
 *
 *  @param cos_half  Receives the first component of the scaled pair.
 *  @param sin_half  Receives the second component of the scaled pair.
 */
#define nk_define_qr_givens_quaternion_(type, epsilon, compute_rsqrt) \
    NK_INTERNAL void nk_qr_givens_quaternion_##type##_(nk_##type##_t a1, nk_##type##_t a2, nk_##type##_t *cos_half, \
                                                       nk_##type##_t *sin_half) { \
        nk_##type##_t const length_sq = a1 * a1 + a2 * a2; \
        /* Evaluated unconditionally; the select below discards it for tiny inputs. */ \
        nk_##type##_t const length_raw = length_sq * compute_rsqrt(length_sq); \
        nk_##type##_t const rho = length_sq > epsilon ? length_raw : 0; \
        int const is_significant = rho > epsilon; \
        int const is_negative = a1 < 0; \
        nk_##type##_t const magnitude = is_negative ? -a1 : a1; \
        nk_##type##_t const floored_rho = is_significant ? rho : epsilon; \
        nk_##type##_t sine = is_significant ? a2 : 0; \
        nk_##type##_t cosine = magnitude + floored_rho; \
        nk_cond_swap_##type##_(is_negative, &sine, &cosine); \
        nk_##type##_t const inv_length = compute_rsqrt(cosine * cosine + sine * sine); \
        *cos_half = cosine * inv_length; \
        *sin_half = sine * inv_length; \
    }
146
+
147
/**
 *  @brief Emits `nk_sort_singular_values_<type>_`: orders the columns of `b` by descending squared norm.
 *
 *  `b` and `v` are row-major 3x3 matrices. Three compare-and-exchange passes
 *  (columns 0/1, 0/2, 1/2) are applied; every exchange is a negating swap carried
 *  out on the same column pair of both `b` and `v`.
 */
#define nk_define_sort_singular_values_(type) \
    NK_INTERNAL void nk_sort_singular_values_##type##_(nk_##type##_t *b, nk_##type##_t *v) { \
        nk_##type##_t norm_sq_0 = b[0] * b[0] + b[3] * b[3] + b[6] * b[6]; \
        nk_##type##_t norm_sq_1 = b[1] * b[1] + b[4] * b[4] + b[7] * b[7]; \
        nk_##type##_t norm_sq_2 = b[2] * b[2] + b[5] * b[5] + b[8] * b[8]; \
        int exchange; \
        /* Pass 1: columns 0 and 1. */ \
        exchange = norm_sq_0 < norm_sq_1; \
        for (int row = 0; row < 9; row += 3) { \
            nk_conditional_negating_swap_##type##_(exchange, &b[row + 0], &b[row + 1]); \
            nk_conditional_negating_swap_##type##_(exchange, &v[row + 0], &v[row + 1]); \
        } \
        nk_cond_swap_##type##_(exchange, &norm_sq_0, &norm_sq_1); \
        /* Pass 2: columns 0 and 2. */ \
        exchange = norm_sq_0 < norm_sq_2; \
        for (int row = 0; row < 9; row += 3) { \
            nk_conditional_negating_swap_##type##_(exchange, &b[row + 0], &b[row + 2]); \
            nk_conditional_negating_swap_##type##_(exchange, &v[row + 0], &v[row + 2]); \
        } \
        nk_cond_swap_##type##_(exchange, &norm_sq_0, &norm_sq_2); \
        /* Pass 3: columns 1 and 2; the norms are not needed afterwards, so they are left alone. */ \
        exchange = norm_sq_1 < norm_sq_2; \
        for (int row = 0; row < 9; row += 3) { \
            nk_conditional_negating_swap_##type##_(exchange, &b[row + 1], &b[row + 2]); \
            nk_conditional_negating_swap_##type##_(exchange, &v[row + 1], &v[row + 2]); \
        } \
    }
178
+
179
/**
 *  @brief Emits `nk_qr_decomposition_<type>_`: QR factorisation of a row-major 3x3 matrix via three Givens rotations.
 *
 *  The rotations act on rows (0,1), (0,2) and (1,2) in turn, targeting entries
 *  [3], [6] and [7]. `r` receives the rotated matrix and `q` the product of the
 *  three rotations, written out in closed form from the half-angle pairs.
 *
 *  @param input  Nine values, row-major; only read.
 *  @param q      Receives nine values, row-major.
 *  @param r      Receives nine values, row-major.
 */
#define nk_define_qr_decomposition_(type) \
    NK_INTERNAL void nk_qr_decomposition_##type##_(nk_##type##_t const *input, nk_##type##_t *q, nk_##type##_t *r) { \
        nk_##type##_t half_cos_1, half_sin_1; \
        nk_##type##_t half_cos_2, half_sin_2; \
        nk_##type##_t half_cos_3, half_sin_3; \
        nk_##type##_t after_first[9], after_second[9]; \
        /* Rotation 1: rows 0 and 1, targeting input[3]. */ \
        nk_qr_givens_quaternion_##type##_(input[0], input[3], &half_cos_1, &half_sin_1); \
        nk_##type##_t const cos_1 = 1 - 2 * half_sin_1 * half_sin_1; \
        nk_##type##_t const sin_1 = 2 * half_cos_1 * half_sin_1; \
        for (int col = 0; col < 3; ++col) { \
            after_first[col] = cos_1 * input[col] + sin_1 * input[3 + col]; \
            after_first[3 + col] = -sin_1 * input[col] + cos_1 * input[3 + col]; \
            after_first[6 + col] = input[6 + col]; \
        } \
        /* Rotation 2: rows 0 and 2, targeting after_first[6]. */ \
        nk_qr_givens_quaternion_##type##_(after_first[0], after_first[6], &half_cos_2, &half_sin_2); \
        nk_##type##_t const cos_2 = 1 - 2 * half_sin_2 * half_sin_2; \
        nk_##type##_t const sin_2 = 2 * half_cos_2 * half_sin_2; \
        for (int col = 0; col < 3; ++col) { \
            after_second[col] = cos_2 * after_first[col] + sin_2 * after_first[6 + col]; \
            after_second[3 + col] = after_first[3 + col]; \
            after_second[6 + col] = -sin_2 * after_first[col] + cos_2 * after_first[6 + col]; \
        } \
        /* Rotation 3: rows 1 and 2, targeting after_second[7]. */ \
        nk_qr_givens_quaternion_##type##_(after_second[4], after_second[7], &half_cos_3, &half_sin_3); \
        nk_##type##_t const cos_3 = 1 - 2 * half_sin_3 * half_sin_3; \
        nk_##type##_t const sin_3 = 2 * half_cos_3 * half_sin_3; \
        for (int col = 0; col < 3; ++col) { \
            r[col] = after_second[col]; \
            r[3 + col] = cos_3 * after_second[3 + col] + sin_3 * after_second[6 + col]; \
            r[6 + col] = -sin_3 * after_second[3 + col] + cos_3 * after_second[6 + col]; \
        } \
        /* Q = Q1 * Q2 * Q3, expanded in closed form. */ \
        nk_##type##_t const sq_1 = half_sin_1 * half_sin_1; \
        nk_##type##_t const sq_2 = half_sin_2 * half_sin_2; \
        nk_##type##_t const sq_3 = half_sin_3 * half_sin_3; \
        q[0] = (-1 + 2 * sq_1) * (-1 + 2 * sq_2); \
        q[1] = 4 * half_cos_2 * half_cos_3 * (-1 + 2 * sq_1) * half_sin_2 * half_sin_3 + \
               2 * half_cos_1 * half_sin_1 * (-1 + 2 * sq_3); \
        q[2] = 4 * half_cos_1 * half_cos_3 * half_sin_1 * half_sin_3 - \
               2 * half_cos_2 * (-1 + 2 * sq_1) * half_sin_2 * (-1 + 2 * sq_3); \
        q[3] = 2 * half_cos_1 * half_sin_1 * (1 - 2 * sq_2); \
        q[4] = -8 * half_cos_1 * half_cos_2 * half_cos_3 * half_sin_1 * half_sin_2 * half_sin_3 + \
               (-1 + 2 * sq_1) * (-1 + 2 * sq_3); \
        q[5] = -2 * half_cos_3 * half_sin_3 + \
               4 * half_sin_1 * \
                   (half_cos_3 * half_sin_1 * half_sin_3 + \
                    half_cos_1 * half_cos_2 * half_sin_2 * (-1 + 2 * sq_3)); \
        q[6] = 2 * half_cos_2 * half_sin_2; \
        q[7] = 2 * half_cos_3 * (1 - 2 * sq_2) * half_sin_3; \
        q[8] = (-1 + 2 * sq_2) * (-1 + 2 * sq_3); \
    }
240
+
241
/**
 *  @brief Emits `nk_svd3x3_<type>_`: singular value decomposition of a row-major 3x3 matrix.
 *
 *  Steps: form the Gram matrix Aᵀ * A, run the Jacobi eigenanalysis to obtain V as a
 *  quaternion, compute B = A * V, sort the columns of B (and V) by descending norm,
 *  take the singular values as the column norms of the sorted B, and obtain U from
 *  the QR factorisation of B.
 *
 *  @param a      Nine input values, row-major; only read.
 *  @param svd_u  Receives U (nine values).
 *  @param svd_s  Receives a 3x3 matrix with the singular values on the diagonal and zeros elsewhere.
 *  @param svd_v  Receives V (nine values).
 */
#define nk_define_svd3x3_(type, compute_sqrt) \
    NK_INTERNAL void nk_svd3x3_##type##_(nk_##type##_t const *a, nk_##type##_t *svd_u, nk_##type##_t *svd_s, \
                                         nk_##type##_t *svd_v) { \
        /* Six independent entries of the symmetric Gram matrix Aᵀ * A. */ \
        nk_##type##_t gram_11 = nk_sum_three_squares_##type##_(a[0], a[3], a[6]); \
        nk_##type##_t gram_21 = nk_sum_three_products_##type##_(a[0], a[1], a[3], a[4], a[6], a[7]); \
        nk_##type##_t gram_31 = nk_sum_three_products_##type##_(a[0], a[2], a[3], a[5], a[6], a[8]); \
        nk_##type##_t gram_22 = nk_sum_three_squares_##type##_(a[1], a[4], a[7]); \
        nk_##type##_t gram_32 = nk_sum_three_products_##type##_(a[1], a[2], a[4], a[5], a[7], a[8]); \
        nk_##type##_t gram_33 = nk_sum_three_squares_##type##_(a[2], a[5], a[8]); \
        /* Eigenvectors of the Gram matrix, accumulated as a quaternion and expanded into V. */ \
        nk_##type##_t rotation[4]; \
        nk_jacobi_eigenanalysis_##type##_(&gram_11, &gram_21, &gram_22, &gram_31, &gram_32, &gram_33, rotation); \
        nk_quaternion_to_mat3x3_##type##_(rotation, svd_v); \
        /* B = A * V, filled row by row. */ \
        nk_##type##_t rotated[9]; \
        for (int row = 0; row < 3; ++row) { \
            nk_##type##_t const *a_row = a + 3 * row; \
            for (int col = 0; col < 3; ++col) { \
                rotated[3 * row + col] = nk_sum_three_products_##type##_( \
                    a_row[0], svd_v[col], a_row[1], svd_v[3 + col], a_row[2], svd_v[6 + col]); \
            } \
        } \
        /* Reorder the columns of B and V together. */ \
        nk_sort_singular_values_##type##_(rotated, svd_v); \
        /* Squared singular values are the squared column norms of the sorted B. */ \
        nk_##type##_t const sigma_sq_0 = nk_sum_three_squares_##type##_(rotated[0], rotated[3], rotated[6]); \
        nk_##type##_t const sigma_sq_1 = nk_sum_three_squares_##type##_(rotated[1], rotated[4], rotated[7]); \
        nk_##type##_t const sigma_sq_2 = nk_sum_three_squares_##type##_(rotated[2], rotated[5], rotated[8]); \
        /* B = U * R; only U is kept, R goes to a scratch buffer. */ \
        nk_##type##_t scratch_r[9]; \
        nk_qr_decomposition_##type##_(rotated, svd_u, scratch_r); \
        /* Diagonal matrix of singular values. */ \
        for (int cell = 0; cell < 9; ++cell) { svd_s[cell] = 0; } \
        svd_s[0] = compute_sqrt(sigma_sq_0); \
        svd_s[4] = compute_sqrt(sigma_sq_1); \
        svd_s[8] = compute_sqrt(sigma_sq_2); \
    }
285
+
286
/* Generates nk_det3x3_<type>_(m): determinant of the 9-element 3x3 matrix `m`,
 * expanded along m[0], m[1], m[2] with the three 2x2 minors named explicitly.
 */
#define nk_define_det3x3_(type) \
    NK_INTERNAL nk_##type##_t nk_det3x3_##type##_(nk_##type##_t const *m) { \
        nk_##type##_t const minor_0 = m[4] * m[8] - m[5] * m[7]; \
        nk_##type##_t const minor_1 = m[3] * m[8] - m[5] * m[6]; \
        nk_##type##_t const minor_2 = m[3] * m[7] - m[4] * m[6]; \
        return m[0] * minor_0 - m[1] * minor_1 + m[2] * minor_2; \
    }
291
+
292
+ NK_INTERNAL nk_f32_t nk_sum_three_products_f32_(nk_f32_t left_0, nk_f32_t right_0, nk_f32_t left_1, nk_f32_t right_1,
293
+ nk_f32_t left_2, nk_f32_t right_2) {
294
+ return left_0 * right_0 + left_1 * right_1 + left_2 * right_2;
295
+ }
296
+ NK_INTERNAL nk_f64_t nk_sum_three_products_f64_(nk_f64_t left_0, nk_f64_t right_0, nk_f64_t left_1, nk_f64_t right_1,
297
+ nk_f64_t left_2, nk_f64_t right_2) {
298
+ nk_f64_t sum = 0.0, compensation = 0.0;
299
+ nk_f64_dot2_(&sum, &compensation, left_0, right_0);
300
+ nk_f64_dot2_(&sum, &compensation, left_1, right_1);
301
+ nk_f64_dot2_(&sum, &compensation, left_2, right_2);
302
+ return sum + compensation;
303
+ }
304
+
305
+ NK_INTERNAL nk_f32_t nk_sum_three_squares_f32_(nk_f32_t value_0, nk_f32_t value_1, nk_f32_t value_2) {
306
+ return value_0 * value_0 + value_1 * value_1 + value_2 * value_2;
307
+ }
308
+ NK_INTERNAL nk_f64_t nk_sum_three_squares_f64_(nk_f64_t value_0, nk_f64_t value_1, nk_f64_t value_2) {
309
+ nk_f64_t sum = 0.0, compensation = 0.0;
310
+ nk_f64_dot2_(&sum, &compensation, value_0, value_0);
311
+ nk_f64_dot2_(&sum, &compensation, value_1, value_1);
312
+ nk_f64_dot2_(&sum, &compensation, value_2, value_2);
313
+ return sum + compensation;
314
+ }
315
+
316
+ NK_INTERNAL void nk_accumulate_sum_f32_(nk_f32_t *sum, nk_f32_t *compensation, nk_f32_t value) {
317
+ nk_unused_(compensation);
318
+ *sum += value;
319
+ }
320
+ NK_INTERNAL void nk_accumulate_sum_f64_(nk_f64_t *sum, nk_f64_t *compensation, nk_f64_t value) {
321
+ nk_f64_t running_sum = *sum + value;
322
+ *compensation += (nk_f64_abs_(*sum) >= nk_f64_abs_(value)) ? ((*sum - running_sum) + value)
323
+ : ((value - running_sum) + *sum);
324
+ *sum = running_sum;
325
+ }
326
+
327
+ NK_INTERNAL void nk_accumulate_product_f32_(nk_f32_t *sum, nk_f32_t *compensation, nk_f32_t left, nk_f32_t right) {
328
+ nk_unused_(compensation);
329
+ *sum += left * right;
330
+ }
331
+ NK_INTERNAL void nk_accumulate_product_f64_(nk_f64_t *sum, nk_f64_t *compensation, nk_f64_t left, nk_f64_t right) {
332
+ nk_f64_dot2_(sum, compensation, left, right);
333
+ }
334
+
335
+ NK_INTERNAL void nk_accumulate_square_f32_(nk_f32_t *sum, nk_f32_t *compensation, nk_f32_t value) {
336
+ nk_unused_(compensation);
337
+ *sum += value * value;
338
+ }
339
+ NK_INTERNAL void nk_accumulate_square_f64_(nk_f64_t *sum, nk_f64_t *compensation, nk_f64_t value) {
340
+ nk_f64_dot2_(sum, compensation, value, value);
341
+ }
342
+
343
+ NK_INTERNAL void nk_rotation_from_svd_f32_serial_(nk_f32_t const *svd_u, nk_f32_t const *svd_v, nk_f32_t *rotation) {
344
+ rotation[0] = nk_sum_three_products_f32_(svd_v[0], svd_u[0], svd_v[1], svd_u[1], svd_v[2], svd_u[2]);
345
+ rotation[1] = nk_sum_three_products_f32_(svd_v[0], svd_u[3], svd_v[1], svd_u[4], svd_v[2], svd_u[5]);
346
+ rotation[2] = nk_sum_three_products_f32_(svd_v[0], svd_u[6], svd_v[1], svd_u[7], svd_v[2], svd_u[8]);
347
+ rotation[3] = nk_sum_three_products_f32_(svd_v[3], svd_u[0], svd_v[4], svd_u[1], svd_v[5], svd_u[2]);
348
+ rotation[4] = nk_sum_three_products_f32_(svd_v[3], svd_u[3], svd_v[4], svd_u[4], svd_v[5], svd_u[5]);
349
+ rotation[5] = nk_sum_three_products_f32_(svd_v[3], svd_u[6], svd_v[4], svd_u[7], svd_v[5], svd_u[8]);
350
+ rotation[6] = nk_sum_three_products_f32_(svd_v[6], svd_u[0], svd_v[7], svd_u[1], svd_v[8], svd_u[2]);
351
+ rotation[7] = nk_sum_three_products_f32_(svd_v[6], svd_u[3], svd_v[7], svd_u[4], svd_v[8], svd_u[5]);
352
+ rotation[8] = nk_sum_three_products_f32_(svd_v[6], svd_u[6], svd_v[7], svd_u[7], svd_v[8], svd_u[8]);
353
+ }
354
+ NK_INTERNAL void nk_rotation_from_svd_f64_serial_(nk_f64_t const *svd_u, nk_f64_t const *svd_v, nk_f64_t *rotation) {
355
+ rotation[0] = nk_sum_three_products_f64_(svd_v[0], svd_u[0], svd_v[1], svd_u[1], svd_v[2], svd_u[2]);
356
+ rotation[1] = nk_sum_three_products_f64_(svd_v[0], svd_u[3], svd_v[1], svd_u[4], svd_v[2], svd_u[5]);
357
+ rotation[2] = nk_sum_three_products_f64_(svd_v[0], svd_u[6], svd_v[1], svd_u[7], svd_v[2], svd_u[8]);
358
+ rotation[3] = nk_sum_three_products_f64_(svd_v[3], svd_u[0], svd_v[4], svd_u[1], svd_v[5], svd_u[2]);
359
+ rotation[4] = nk_sum_three_products_f64_(svd_v[3], svd_u[3], svd_v[4], svd_u[4], svd_v[5], svd_u[5]);
360
+ rotation[5] = nk_sum_three_products_f64_(svd_v[3], svd_u[6], svd_v[4], svd_u[7], svd_v[5], svd_u[8]);
361
+ rotation[6] = nk_sum_three_products_f64_(svd_v[6], svd_u[0], svd_v[7], svd_u[1], svd_v[8], svd_u[2]);
362
+ rotation[7] = nk_sum_three_products_f64_(svd_v[6], svd_u[3], svd_v[7], svd_u[4], svd_v[8], svd_u[5]);
363
+ rotation[8] = nk_sum_three_products_f64_(svd_v[6], svd_u[6], svd_v[7], svd_u[7], svd_v[8], svd_u[8]);
364
+ }
365
+
366
+ nk_define_cond_swap_(f32) // f32 expansions of the SVD helper generators; most generator bodies lie outside this chunk
367
+ nk_define_conditional_negating_swap_(f32)
368
+ nk_define_approximate_givens_quaternion_(f32, NK_F32_SVD_GAMMA_, NK_F32_SVD_CSTAR_, NK_F32_SVD_SSTAR_,
369
+ nk_f32_rsqrt_serial)
370
+ nk_define_jacobi_conjugation_(f32)
371
+ nk_define_quaternion_to_mat3x3_(f32)
372
+ nk_define_jacobi_eigenanalysis_(f32, nk_f32_rsqrt_serial)
373
+ nk_define_qr_givens_quaternion_(f32, NK_F32_SVD_EPSILON_, nk_f32_rsqrt_serial)
374
+ nk_define_sort_singular_values_(f32)
375
+ nk_define_qr_decomposition_(f32)
376
+ nk_define_svd3x3_(f32, nk_f32_sqrt_serial) // presumably defines nk_svd3x3_f32_, with the sqrt applied to the singular values
377
+ nk_define_det3x3_(f32) // defines nk_det3x3_f32_
378
+
379
+ nk_define_cond_swap_(f64) // same sequence again for f64, with the f64 constants and rsqrt/sqrt helpers
380
+ nk_define_conditional_negating_swap_(f64)
381
+ nk_define_approximate_givens_quaternion_(f64, NK_F64_SVD_GAMMA_, NK_F64_SVD_CSTAR_, NK_F64_SVD_SSTAR_,
382
+ nk_f64_rsqrt_serial)
383
+ nk_define_jacobi_conjugation_(f64)
384
+ nk_define_quaternion_to_mat3x3_(f64)
385
+ nk_define_jacobi_eigenanalysis_(f64, nk_f64_rsqrt_serial)
386
+ nk_define_qr_givens_quaternion_(f64, NK_F64_SVD_EPSILON_, nk_f64_rsqrt_serial)
387
+ nk_define_sort_singular_values_(f64)
388
+ nk_define_qr_decomposition_(f64)
389
+ nk_define_svd3x3_(f64, nk_f64_sqrt_serial) // presumably defines nk_svd3x3_f64_
390
+ nk_define_det3x3_(f64) // defines nk_det3x3_f64_
391
+
392
/* RMSD (Root Mean Square Deviation) without rotational superposition.
 *
 * Generates nk_rmsd_<input_type>_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result).
 * `a` and `b` each hold `n` points stored as consecutive (x, y, z) triplets. Both point sets are
 * translated to their own centroids, and the RMS of the distances between corresponding centered
 * points is written to `*result`.
 *
 * Optional outputs (each may be NULL): `a_centroid` and `b_centroid` receive the 3-element centroids,
 * `rotation` receives the 3x3 identity matrix, and `scale` receives 1.0.
 * For `n == 0` the centroids are reported as zeros and the result as 0, instead of evaluating 1 / 0.
 *
 * Macro parameters:
 *   input_type        element type of `a` and `b`
 *   accumulator_type  type of all running sums (selects the nk_accumulate_*_<accumulator_type>_ helpers)
 *   output_type       element type of `a_centroid`, `b_centroid`, `rotation` and `scale`
 *   result_type       type of `*result`
 *   load_and_convert  load_and_convert(src, dst): reads one input element into an accumulator-typed value
 *   compute_sqrt      square root applied to the mean squared deviation
 */
#define nk_define_rmsd_(input_type, accumulator_type, output_type, result_type, load_and_convert, compute_sqrt) \
    NK_PUBLIC void nk_rmsd_##input_type##_serial(nk_##input_type##_t const *a, nk_##input_type##_t const *b, \
                                                 nk_size_t n, nk_##output_type##_t *a_centroid, \
                                                 nk_##output_type##_t *b_centroid, nk_##output_type##_t *rotation, \
                                                 nk_##output_type##_t *scale, nk_##result_type##_t *result) { \
        /* Empty input: emit neutral outputs rather than dividing by zero below */ \
        if (n == 0) { \
            if (a_centroid) { a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0; } \
            if (b_centroid) { b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0; } \
            if (rotation) { \
                rotation[0] = 1, rotation[1] = 0, rotation[2] = 0; \
                rotation[3] = 0, rotation[4] = 1, rotation[5] = 0; \
                rotation[6] = 0, rotation[7] = 0, rotation[8] = 1; \
            } \
            if (scale) *scale = 1.0; \
            *result = 0; \
            return; \
        } \
        /* Pass 1: per-axis coordinate sums, each with its own compensation term */ \
        nk_##accumulator_type##_t sum_a_x = 0, sum_a_y = 0, sum_a_z = 0; \
        nk_##accumulator_type##_t sum_b_x = 0, sum_b_y = 0, sum_b_z = 0; \
        nk_##accumulator_type##_t sum_a_x_compensation = 0, sum_a_y_compensation = 0, sum_a_z_compensation = 0; \
        nk_##accumulator_type##_t sum_b_x_compensation = 0, sum_b_y_compensation = 0, sum_b_z_compensation = 0; \
        nk_##accumulator_type##_t val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z; \
        for (nk_size_t i = 0; i < n; ++i) { \
            load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(a + i * 3 + 1, &val_a_y); \
            load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 0, &val_b_x); \
            load_and_convert(b + i * 3 + 1, &val_b_y), load_and_convert(b + i * 3 + 2, &val_b_z); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_x, &sum_a_x_compensation, val_a_x); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_y, &sum_a_y_compensation, val_a_y); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_z, &sum_a_z_compensation, val_a_z); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_x, &sum_b_x_compensation, val_b_x); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_y, &sum_b_y_compensation, val_b_y); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_z, &sum_b_z_compensation, val_b_z); \
        } \
        nk_##accumulator_type##_t inv_n = (nk_##accumulator_type##_t)1.0 / n; \
        nk_##accumulator_type##_t centroid_a_x = (sum_a_x + sum_a_x_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_a_y = (sum_a_y + sum_a_y_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_a_z = (sum_a_z + sum_a_z_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_x = (sum_b_x + sum_b_x_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_y = (sum_b_y + sum_b_y_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_z = (sum_b_z + sum_b_z_compensation) * inv_n; \
        if (a_centroid) { \
            a_centroid[0] = (nk_##output_type##_t)centroid_a_x; \
            a_centroid[1] = (nk_##output_type##_t)centroid_a_y; \
            a_centroid[2] = (nk_##output_type##_t)centroid_a_z; \
        } \
        if (b_centroid) { \
            b_centroid[0] = (nk_##output_type##_t)centroid_b_x; \
            b_centroid[1] = (nk_##output_type##_t)centroid_b_y; \
            b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
        } \
        /* RMSD uses identity rotation and scale=1.0 */ \
        if (rotation) { \
            rotation[0] = 1, rotation[1] = 0, rotation[2] = 0; \
            rotation[3] = 0, rotation[4] = 1, rotation[5] = 0; \
            rotation[6] = 0, rotation[7] = 0, rotation[8] = 1; \
        } \
        if (scale) *scale = 1.0; \
        /* Pass 2: squared distances between the centered points */ \
        nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
        for (nk_size_t i = 0; i < n; ++i) { \
            load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
            load_and_convert(a + i * 3 + 1, &val_a_y), load_and_convert(b + i * 3 + 1, &val_b_y); \
            load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 2, &val_b_z); \
            nk_##accumulator_type##_t dx = (val_a_x - centroid_a_x) - (val_b_x - centroid_b_x); \
            nk_##accumulator_type##_t dy = (val_a_y - centroid_a_y) - (val_b_y - centroid_b_y); \
            nk_##accumulator_type##_t dz = (val_a_z - centroid_a_z) - (val_b_z - centroid_b_z); \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, dx); \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, dy); \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, dz); \
        } \
        nk_##accumulator_type##_t msd = (sum_squared + sum_squared_compensation) * inv_n; \
        /* Anything that is not strictly positive (including NaN) is reported as 0 */ \
        *result = msd > 0 ? (nk_##result_type##_t)compute_sqrt(msd) : 0; \
    }
455
+
456
/* Kabsch algorithm for optimal rigid body superposition.
 * Finds the rotation matrix R that minimizes RMSD between the two centered point sets.
 *
 * Generates nk_kabsch_<input_type>_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result).
 * `a` and `b` each hold `n` points stored as consecutive (x, y, z) triplets. The kernel:
 *   1. computes both centroids;
 *   2. accumulates H with h[3 * r + c] = Σ (a - ā)[r] * (b - b̄)[c];
 *   3. decomposes H with nk_svd3x3_<svd_type>_;
 *   4. forms R = V * Uᵀ, negating the third column of V first when det(R) < 0;
 *   5. writes sqrt(mean ‖R (a - ā) - (b - b̄)‖²) to `*result`.
 *
 * Optional outputs (each may be NULL): `a_centroid`, `b_centroid` (3 elements each), `rotation`
 * (9 elements) and `scale` (always 1.0 for this kernel).
 * For `n == 0` the kernel reports zero centroids, an identity rotation, scale 1.0 and result 0
 * instead of evaluating 1 / 0 and decomposing an empty covariance matrix.
 *
 * Macro parameters:
 *   input_type        element type of `a` and `b`
 *   accumulator_type  type of all running sums (selects the nk_accumulate_*_<accumulator_type>_ helpers)
 *   output_type       element type of `a_centroid`, `b_centroid`, `rotation` and `scale`
 *   result_type       type of `*result`
 *   svd_type          type in which the SVD, the rotation and the per-point residuals are computed
 *   load_and_convert  load_and_convert(src, dst): reads one input element into an accumulator-typed value
 *   compute_sqrt      square root applied to the mean squared deviation
 */
#define nk_define_kabsch_(input_type, accumulator_type, output_type, result_type, svd_type, load_and_convert, \
                          compute_sqrt) \
    NK_PUBLIC void nk_kabsch_##input_type##_serial(nk_##input_type##_t const *a, nk_##input_type##_t const *b, \
                                                   nk_size_t n, nk_##output_type##_t *a_centroid, \
                                                   nk_##output_type##_t *b_centroid, nk_##output_type##_t *rotation, \
                                                   nk_##output_type##_t *scale, nk_##result_type##_t *result) { \
        /* Empty input: emit neutral outputs rather than dividing by zero below */ \
        if (n == 0) { \
            if (a_centroid) { a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0; } \
            if (b_centroid) { b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0; } \
            if (rotation) { \
                rotation[0] = 1, rotation[1] = 0, rotation[2] = 0; \
                rotation[3] = 0, rotation[4] = 1, rotation[5] = 0; \
                rotation[6] = 0, rotation[7] = 0, rotation[8] = 1; \
            } \
            if (scale) *scale = 1.0; \
            *result = 0; \
            return; \
        } \
        /* Step 1: Compute centroids */ \
        nk_##accumulator_type##_t sum_a_x = 0, sum_a_y = 0, sum_a_z = 0; \
        nk_##accumulator_type##_t sum_b_x = 0, sum_b_y = 0, sum_b_z = 0; \
        nk_##accumulator_type##_t sum_a_x_compensation = 0, sum_a_y_compensation = 0, sum_a_z_compensation = 0; \
        nk_##accumulator_type##_t sum_b_x_compensation = 0, sum_b_y_compensation = 0, sum_b_z_compensation = 0; \
        nk_##accumulator_type##_t val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z; \
        for (nk_size_t i = 0; i < n; ++i) { \
            load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(a + i * 3 + 1, &val_a_y); \
            load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 0, &val_b_x); \
            load_and_convert(b + i * 3 + 1, &val_b_y), load_and_convert(b + i * 3 + 2, &val_b_z); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_x, &sum_a_x_compensation, val_a_x); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_y, &sum_a_y_compensation, val_a_y); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_z, &sum_a_z_compensation, val_a_z); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_x, &sum_b_x_compensation, val_b_x); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_y, &sum_b_y_compensation, val_b_y); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_z, &sum_b_z_compensation, val_b_z); \
        } \
        nk_##accumulator_type##_t inv_n = (nk_##accumulator_type##_t)1.0 / n; \
        nk_##accumulator_type##_t centroid_a_x = (sum_a_x + sum_a_x_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_a_y = (sum_a_y + sum_a_y_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_a_z = (sum_a_z + sum_a_z_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_x = (sum_b_x + sum_b_x_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_y = (sum_b_y + sum_b_y_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_z = (sum_b_z + sum_b_z_compensation) * inv_n; \
        if (a_centroid) { \
            a_centroid[0] = (nk_##output_type##_t)centroid_a_x; \
            a_centroid[1] = (nk_##output_type##_t)centroid_a_y; \
            a_centroid[2] = (nk_##output_type##_t)centroid_a_z; \
        } \
        if (b_centroid) { \
            b_centroid[0] = (nk_##output_type##_t)centroid_b_x; \
            b_centroid[1] = (nk_##output_type##_t)centroid_b_y; \
            b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
        } \
        /* Step 2: Build 3×3 covariance matrix H = (A - Ā)ᵀ × (B - B̄) */ \
        nk_##accumulator_type##_t h[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
        nk_##accumulator_type##_t h_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
        for (nk_size_t i = 0; i < n; ++i) { \
            load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
            load_and_convert(a + i * 3 + 1, &val_a_y), load_and_convert(b + i * 3 + 1, &val_b_y); \
            load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 2, &val_b_z); \
            val_a_x -= centroid_a_x, val_a_y -= centroid_a_y, val_a_z -= centroid_a_z; \
            val_b_x -= centroid_b_x, val_b_y -= centroid_b_y, val_b_z -= centroid_b_z; \
            nk_accumulate_product_##accumulator_type##_(&h[0], &h_compensation[0], val_a_x, val_b_x); \
            nk_accumulate_product_##accumulator_type##_(&h[1], &h_compensation[1], val_a_x, val_b_y); \
            nk_accumulate_product_##accumulator_type##_(&h[2], &h_compensation[2], val_a_x, val_b_z); \
            nk_accumulate_product_##accumulator_type##_(&h[3], &h_compensation[3], val_a_y, val_b_x); \
            nk_accumulate_product_##accumulator_type##_(&h[4], &h_compensation[4], val_a_y, val_b_y); \
            nk_accumulate_product_##accumulator_type##_(&h[5], &h_compensation[5], val_a_y, val_b_z); \
            nk_accumulate_product_##accumulator_type##_(&h[6], &h_compensation[6], val_a_z, val_b_x); \
            nk_accumulate_product_##accumulator_type##_(&h[7], &h_compensation[7], val_a_z, val_b_y); \
            nk_accumulate_product_##accumulator_type##_(&h[8], &h_compensation[8], val_a_z, val_b_z); \
        } \
        /* Convert to svd_type for SVD */ \
        nk_##svd_type##_t cross_covariance[9]; \
        for (int j = 0; j < 9; ++j) cross_covariance[j] = (nk_##svd_type##_t)(h[j] + h_compensation[j]); \
        /* Step 3: SVD of H = U * S * Vᵀ */ \
        nk_##svd_type##_t svd_u[9], svd_s[9], svd_v[9]; \
        nk_svd3x3_##svd_type##_(cross_covariance, svd_u, svd_s, svd_v); \
        /* Step 4: R = V * Uᵀ */ \
        nk_##svd_type##_t rotation_matrix[9]; \
        nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
        /* Handle reflection: if det(R) < 0, negate third column of V and recompute R */ \
        nk_##svd_type##_t rotation_det = nk_det3x3_##svd_type##_(rotation_matrix); \
        if (rotation_det < 0) { \
            svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8]; \
            nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
        } \
        /* Output rotation matrix and scale=1.0 */ \
        if (rotation) { \
            for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)rotation_matrix[j]; \
        } \
        if (scale) *scale = 1.0; \
        /* Step 5: Compute RMSD after rotation */ \
        nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
        for (nk_size_t i = 0; i < n; ++i) { \
            nk_##svd_type##_t point_a[3], point_b[3], rotated_point_a[3]; \
            load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(a + i * 3 + 1, &val_a_y); \
            load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 0, &val_b_x); \
            load_and_convert(b + i * 3 + 1, &val_b_y), load_and_convert(b + i * 3 + 2, &val_b_z); \
            point_a[0] = (nk_##svd_type##_t)(val_a_x - centroid_a_x); \
            point_a[1] = (nk_##svd_type##_t)(val_a_y - centroid_a_y); \
            point_a[2] = (nk_##svd_type##_t)(val_a_z - centroid_a_z); \
            point_b[0] = (nk_##svd_type##_t)(val_b_x - centroid_b_x); \
            point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y); \
            point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
            rotated_point_a[0] = rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] + \
                                 rotation_matrix[2] * point_a[2]; \
            rotated_point_a[1] = rotation_matrix[3] * point_a[0] + rotation_matrix[4] * point_a[1] + \
                                 rotation_matrix[5] * point_a[2]; \
            rotated_point_a[2] = rotation_matrix[6] * point_a[0] + rotation_matrix[7] * point_a[1] + \
                                 rotation_matrix[8] * point_a[2]; \
            nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
            nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
            nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, \
                                                       (nk_##accumulator_type##_t)dx); \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, \
                                                       (nk_##accumulator_type##_t)dy); \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, \
                                                       (nk_##accumulator_type##_t)dz); \
        } \
        *result = (nk_##result_type##_t)compute_sqrt((sum_squared + sum_squared_compensation) * inv_n); \
    }
569
+
570
/* Umeyama algorithm for optimal similarity transformation (rotation + uniform scale).
 * Finds the rotation matrix R and scale factor c that minimizes ‖c × R × A - B‖ for the centered point sets.
 * Reference: S. Umeyama, "Least-squares estimation of transformation parameters
 * between two point patterns", IEEE TPAMI 1991.
 *
 * Generates nk_umeyama_<input_type>_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result).
 * `a` and `b` each hold `n` points stored as consecutive (x, y, z) triplets. The kernel:
 *   1. computes both centroids;
 *   2. accumulates H with h[3 * r + c] = Σ (a - ā)[r] * (b - b̄)[c], plus the mean squared norm of (a - ā);
 *   3. decomposes H with nk_svd3x3_<svd_type>_;
 *   4. forms R = V * Uᵀ, negating the third column of V first when det(R) < 0;
 *   5. sets c = (s₀ + s₁ + sign(det R) × s₂) / (n × variance_a);
 *   6. writes sqrt(mean ‖c R (a - ā) - (b - b̄)‖²) to `*result`.
 *
 * Optional outputs (each may be NULL): `a_centroid`, `b_centroid` (3 elements each), `rotation`
 * (9 elements) and `scale` (the factor c).
 * For `n == 0` the kernel reports zero centroids, an identity rotation, scale 1.0 and result 0
 * instead of evaluating 1 / 0.
 * NOTE: when every point of `a` coincides, variance_a is zero and the scale division has a zero
 * denominator; that degenerate case is not special-cased here.
 *
 * Macro parameters:
 *   input_type        element type of `a` and `b`
 *   accumulator_type  type of all running sums (selects the nk_accumulate_*_<accumulator_type>_ helpers)
 *   output_type       element type of `a_centroid`, `b_centroid`, `rotation` and `scale`
 *   result_type       type of `*result`
 *   svd_type          type in which the SVD, the rotation and the per-point residuals are computed
 *   load_and_convert  load_and_convert(src, dst): reads one input element into an accumulator-typed value
 *   compute_sqrt      square root applied to the mean squared deviation
 */
#define nk_define_umeyama_(input_type, accumulator_type, output_type, result_type, svd_type, load_and_convert, \
                           compute_sqrt) \
    NK_PUBLIC void nk_umeyama_##input_type##_serial(nk_##input_type##_t const *a, nk_##input_type##_t const *b, \
                                                    nk_size_t n, nk_##output_type##_t *a_centroid, \
                                                    nk_##output_type##_t *b_centroid, nk_##output_type##_t *rotation, \
                                                    nk_##output_type##_t *scale, nk_##result_type##_t *result) { \
        /* Empty input: emit neutral outputs rather than dividing by zero below */ \
        if (n == 0) { \
            if (a_centroid) { a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0; } \
            if (b_centroid) { b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0; } \
            if (rotation) { \
                rotation[0] = 1, rotation[1] = 0, rotation[2] = 0; \
                rotation[3] = 0, rotation[4] = 1, rotation[5] = 0; \
                rotation[6] = 0, rotation[7] = 0, rotation[8] = 1; \
            } \
            if (scale) *scale = 1.0; \
            *result = 0; \
            return; \
        } \
        /* Step 1: Compute centroids */ \
        nk_##accumulator_type##_t sum_a_x = 0, sum_a_y = 0, sum_a_z = 0; \
        nk_##accumulator_type##_t sum_b_x = 0, sum_b_y = 0, sum_b_z = 0; \
        nk_##accumulator_type##_t sum_a_x_compensation = 0, sum_a_y_compensation = 0, sum_a_z_compensation = 0; \
        nk_##accumulator_type##_t sum_b_x_compensation = 0, sum_b_y_compensation = 0, sum_b_z_compensation = 0; \
        nk_##accumulator_type##_t val_a_x, val_a_y, val_a_z, val_b_x, val_b_y, val_b_z; \
        for (nk_size_t i = 0; i < n; ++i) { \
            load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(a + i * 3 + 1, &val_a_y); \
            load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 0, &val_b_x); \
            load_and_convert(b + i * 3 + 1, &val_b_y), load_and_convert(b + i * 3 + 2, &val_b_z); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_x, &sum_a_x_compensation, val_a_x); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_y, &sum_a_y_compensation, val_a_y); \
            nk_accumulate_sum_##accumulator_type##_(&sum_a_z, &sum_a_z_compensation, val_a_z); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_x, &sum_b_x_compensation, val_b_x); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_y, &sum_b_y_compensation, val_b_y); \
            nk_accumulate_sum_##accumulator_type##_(&sum_b_z, &sum_b_z_compensation, val_b_z); \
        } \
        nk_##accumulator_type##_t inv_n = (nk_##accumulator_type##_t)1.0 / n; \
        nk_##accumulator_type##_t centroid_a_x = (sum_a_x + sum_a_x_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_a_y = (sum_a_y + sum_a_y_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_a_z = (sum_a_z + sum_a_z_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_x = (sum_b_x + sum_b_x_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_y = (sum_b_y + sum_b_y_compensation) * inv_n; \
        nk_##accumulator_type##_t centroid_b_z = (sum_b_z + sum_b_z_compensation) * inv_n; \
        if (a_centroid) { \
            a_centroid[0] = (nk_##output_type##_t)centroid_a_x; \
            a_centroid[1] = (nk_##output_type##_t)centroid_a_y; \
            a_centroid[2] = (nk_##output_type##_t)centroid_a_z; \
        } \
        if (b_centroid) { \
            b_centroid[0] = (nk_##output_type##_t)centroid_b_x; \
            b_centroid[1] = (nk_##output_type##_t)centroid_b_y; \
            b_centroid[2] = (nk_##output_type##_t)centroid_b_z; \
        } \
        /* Step 2: Build covariance matrix H and compute variance of A */ \
        nk_##accumulator_type##_t h[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
        nk_##accumulator_type##_t h_compensation[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; \
        nk_##accumulator_type##_t variance_a = 0, variance_a_compensation = 0; \
        for (nk_size_t i = 0; i < n; ++i) { \
            load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(b + i * 3 + 0, &val_b_x); \
            load_and_convert(a + i * 3 + 1, &val_a_y), load_and_convert(b + i * 3 + 1, &val_b_y); \
            load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 2, &val_b_z); \
            val_a_x -= centroid_a_x, val_a_y -= centroid_a_y, val_a_z -= centroid_a_z; \
            val_b_x -= centroid_b_x, val_b_y -= centroid_b_y, val_b_z -= centroid_b_z; \
            nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_x); \
            nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_y); \
            nk_accumulate_square_##accumulator_type##_(&variance_a, &variance_a_compensation, val_a_z); \
            nk_accumulate_product_##accumulator_type##_(&h[0], &h_compensation[0], val_a_x, val_b_x); \
            nk_accumulate_product_##accumulator_type##_(&h[1], &h_compensation[1], val_a_x, val_b_y); \
            nk_accumulate_product_##accumulator_type##_(&h[2], &h_compensation[2], val_a_x, val_b_z); \
            nk_accumulate_product_##accumulator_type##_(&h[3], &h_compensation[3], val_a_y, val_b_x); \
            nk_accumulate_product_##accumulator_type##_(&h[4], &h_compensation[4], val_a_y, val_b_y); \
            nk_accumulate_product_##accumulator_type##_(&h[5], &h_compensation[5], val_a_y, val_b_z); \
            nk_accumulate_product_##accumulator_type##_(&h[6], &h_compensation[6], val_a_z, val_b_x); \
            nk_accumulate_product_##accumulator_type##_(&h[7], &h_compensation[7], val_a_z, val_b_y); \
            nk_accumulate_product_##accumulator_type##_(&h[8], &h_compensation[8], val_a_z, val_b_z); \
        } \
        variance_a = (variance_a + variance_a_compensation) * inv_n; \
        /* Convert to svd_type for SVD */ \
        nk_##svd_type##_t cross_covariance[9]; \
        for (int j = 0; j < 9; ++j) cross_covariance[j] = (nk_##svd_type##_t)(h[j] + h_compensation[j]); \
        /* Step 3: SVD of H = U * S * Vᵀ */ \
        nk_##svd_type##_t svd_u[9], svd_s[9], svd_v[9]; \
        nk_svd3x3_##svd_type##_(cross_covariance, svd_u, svd_s, svd_v); \
        /* Step 4: R = V * Uᵀ */ \
        nk_##svd_type##_t rotation_matrix[9]; \
        nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
        /* Handle reflection and compute scale: c = trace(D × S) / (n × variance(a)) */ \
        /* D = diag(1, 1, sign(det(R))); svd_s holds the singular values on its diagonal */ \
        nk_##svd_type##_t rotation_det = nk_det3x3_##svd_type##_(rotation_matrix); \
        nk_##svd_type##_t sign_det = rotation_det < 0 ? (nk_##svd_type##_t)-1.0 : (nk_##svd_type##_t)1.0; \
        nk_##svd_type##_t trace_scaled_s = svd_s[0] + svd_s[4] + sign_det * svd_s[8]; \
        /* H is an unnormalized sum over n points, hence the extra factor n in the denominator */ \
        nk_##accumulator_type##_t scale_factor = (nk_##accumulator_type##_t)trace_scaled_s / \
                                                 ((nk_##accumulator_type##_t)n * variance_a); \
        if (scale) *scale = (nk_##output_type##_t)scale_factor; \
        if (rotation_det < 0) { \
            svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8]; \
            nk_rotation_from_svd_##svd_type##_serial_(svd_u, svd_v, rotation_matrix); \
        } \
        /* Output rotation matrix */ \
        if (rotation) { \
            for (int j = 0; j < 9; ++j) rotation[j] = (nk_##output_type##_t)rotation_matrix[j]; \
        } \
        /* Step 5: Compute RMSD after similarity transform: ‖c × R × a - b‖ */ \
        nk_##accumulator_type##_t sum_squared = 0, sum_squared_compensation = 0; \
        for (nk_size_t i = 0; i < n; ++i) { \
            nk_##svd_type##_t point_a[3], point_b[3], rotated_point_a[3]; \
            load_and_convert(a + i * 3 + 0, &val_a_x), load_and_convert(a + i * 3 + 1, &val_a_y); \
            load_and_convert(a + i * 3 + 2, &val_a_z), load_and_convert(b + i * 3 + 0, &val_b_x); \
            load_and_convert(b + i * 3 + 1, &val_b_y), load_and_convert(b + i * 3 + 2, &val_b_z); \
            point_a[0] = (nk_##svd_type##_t)(val_a_x - centroid_a_x); \
            point_a[1] = (nk_##svd_type##_t)(val_a_y - centroid_a_y); \
            point_a[2] = (nk_##svd_type##_t)(val_a_z - centroid_a_z); \
            point_b[0] = (nk_##svd_type##_t)(val_b_x - centroid_b_x); \
            point_b[1] = (nk_##svd_type##_t)(val_b_y - centroid_b_y); \
            point_b[2] = (nk_##svd_type##_t)(val_b_z - centroid_b_z); \
            rotated_point_a[0] = (nk_##svd_type##_t)scale_factor * \
                                 (rotation_matrix[0] * point_a[0] + rotation_matrix[1] * point_a[1] + \
                                  rotation_matrix[2] * point_a[2]); \
            rotated_point_a[1] = (nk_##svd_type##_t)scale_factor * \
                                 (rotation_matrix[3] * point_a[0] + rotation_matrix[4] * point_a[1] + \
                                  rotation_matrix[5] * point_a[2]); \
            rotated_point_a[2] = (nk_##svd_type##_t)scale_factor * \
                                 (rotation_matrix[6] * point_a[0] + rotation_matrix[7] * point_a[1] + \
                                  rotation_matrix[8] * point_a[2]); \
            nk_##svd_type##_t dx = rotated_point_a[0] - point_b[0]; \
            nk_##svd_type##_t dy = rotated_point_a[1] - point_b[1]; \
            nk_##svd_type##_t dz = rotated_point_a[2] - point_b[2]; \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, \
                                                       (nk_##accumulator_type##_t)dx); \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, \
                                                       (nk_##accumulator_type##_t)dy); \
            nk_accumulate_square_##accumulator_type##_(&sum_squared, &sum_squared_compensation, \
                                                       (nk_##accumulator_type##_t)dz); \
        } \
        *result = (nk_##result_type##_t)compute_sqrt((sum_squared + sum_squared_compensation) * inv_n); \
    }
698
+
699
+ nk_define_rmsd_(f64, f64, f64, f64, nk_assign_from_to_, nk_f64_sqrt_serial) // nk_rmsd_f64_serial: f64 inputs, f64 accumulators, f64 outputs, f64 result
700
+ nk_define_kabsch_(f64, f64, f64, f64, f64, nk_assign_from_to_, nk_f64_sqrt_serial) // nk_kabsch_f64_serial: as above, SVD in f64
701
+ nk_define_umeyama_(f64, f64, f64, f64, f64, nk_assign_from_to_, nk_f64_sqrt_serial) // nk_umeyama_f64_serial: as above, SVD in f64
702
+
703
+ nk_define_rmsd_(f32, f64, f32, f64, nk_assign_from_to_, nk_f64_sqrt_serial) // nk_rmsd_f32_serial: f32 inputs, f64 accumulators, f32 outputs, f64 result
704
+ nk_define_kabsch_(f32, f64, f32, f64, f64, nk_assign_from_to_, nk_f64_sqrt_serial) // nk_kabsch_f32_serial: as above, SVD in f64
705
+ nk_define_umeyama_(f32, f64, f32, f64, f64, nk_assign_from_to_, nk_f64_sqrt_serial) // nk_umeyama_f32_serial: as above, SVD in f64
706
+
707
+ nk_define_rmsd_(f16, f32, f32, f32, nk_f16_to_f32_serial, nk_f32_sqrt_serial) // nk_rmsd_f16_serial: f16 inputs widened to f32; f32 accumulators, outputs and result
708
+ nk_define_kabsch_(f16, f32, f32, f32, f32, nk_f16_to_f32_serial, nk_f32_sqrt_serial) // nk_kabsch_f16_serial: as above, SVD in f32
709
+ nk_define_umeyama_(f16, f32, f32, f32, f32, nk_f16_to_f32_serial, nk_f32_sqrt_serial) // nk_umeyama_f16_serial: as above, SVD in f32
710
+
711
+ nk_define_rmsd_(bf16, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_serial) // nk_rmsd_bf16_serial: bf16 inputs widened to f32; f32 accumulators, outputs and result
712
+ nk_define_kabsch_(bf16, f32, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_serial) // nk_kabsch_bf16_serial: as above, SVD in f32
713
+ nk_define_umeyama_(bf16, f32, f32, f32, f32, nk_bf16_to_f32_serial, nk_f32_sqrt_serial) // nk_umeyama_bf16_serial: as above, SVD in f32
714
+
715
+ #undef NK_F32_SVD_GAMMA_ // drop the SVD constants now that every instantiation above has been expanded
716
+ #undef NK_F32_SVD_CSTAR_
717
+ #undef NK_F32_SVD_SSTAR_
718
+ #undef NK_F32_SVD_EPSILON_
719
+ #undef NK_F64_SVD_GAMMA_
720
+ #undef NK_F64_SVD_CSTAR_
721
+ #undef NK_F64_SVD_SSTAR_
722
+ #undef NK_F64_SVD_EPSILON_
723
+ #undef nk_define_cond_swap_ // drop the nk_define_* generator macros as well, so they are not visible past this point
724
+ #undef nk_define_conditional_negating_swap_
725
+ #undef nk_define_approximate_givens_quaternion_
726
+ #undef nk_define_jacobi_conjugation_
727
+ #undef nk_define_quaternion_to_mat3x3_
728
+ #undef nk_define_jacobi_eigenanalysis_
729
+ #undef nk_define_qr_givens_quaternion_
730
+ #undef nk_define_sort_singular_values_
731
+ #undef nk_define_qr_decomposition_
732
+ #undef nk_define_svd3x3_
733
+ #undef nk_define_det3x3_
734
+ #undef nk_define_rmsd_
735
+ #undef nk_define_kabsch_
736
+ #undef nk_define_umeyama_
737
+
738
+ #if defined(__cplusplus)
739
+ } // extern "C"
740
+ #endif
741
+
742
+ #endif // NK_MESH_SERIAL_H