numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,1329 @@
1
+ /**
2
+ * @brief SIMD-accelerated Point Cloud Alignment for NEON.
3
+ * @file include/numkong/mesh/neon.h
4
+ * @author Ash Vardanian
5
+ * @date December 27, 2025
6
+ *
7
+ * @sa include/numkong/mesh.h
8
+ *
9
+ * @section neon_mesh_instructions Key NEON Mesh Instructions
10
+ *
11
+ * Point cloud operations use these ARM NEON instructions:
12
+ *
13
+ * Intrinsic Instruction Latency Throughput
14
+ * A76 M4+/V1+/Oryon
15
+ * vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
16
+ * vmulq_n_f32 FMUL (V.4S, V.4S, V.S[0]) 3cy 2/cy 4/cy
17
+ * vsubq_f32 FSUB (V.4S, V.4S, V.4S) 2cy 2/cy 4/cy
18
+ * vaddvq_f32 FADDP+FADDP (reduce) 5cy 1/cy 1/cy
19
+ * vld3q_f32 LD3 ({Vt.4S, Vt2.4S, Vt3.4S}) 6cy 1/cy 1/cy
20
+ *
21
+ * LD3 provides hardware stride-3 deinterleaving for XYZ point data. The 6cy latency and
22
+ * 1/cy throughput make it the memory bottleneck regardless of core microarchitecture.
23
+ *
24
+ * FMA throughput doubles on 4-pipe cores (Apple M4+, Graviton3+, Oryon). Using 2x loop
25
+ * unrolling with independent accumulators hides FMA latency and saturates 2 FP pipes on
26
+ * A76-class cores; 4x unrolling may further benefit 4-pipe cores.
27
+ */
28
+ #ifndef NK_MESH_NEON_H
29
+ #define NK_MESH_NEON_H
30
+
31
+ #if NK_TARGET_ARM_
32
+ #if NK_TARGET_NEON
33
+
34
+ #include "numkong/types.h"
35
+ #include "numkong/dot/neon.h"
36
+ #include "numkong/mesh/serial.h"
37
+ #include "numkong/spatial/neon.h" // `nk_f32_sqrt_neon`, `nk_f64_sqrt_neon`
38
+
39
+ #if defined(__cplusplus)
40
+ extern "C" {
41
+ #endif
42
+
43
+ #if defined(__clang__)
44
+ #pragma clang attribute push(__attribute__((target("arch=armv8-a+simd"))), apply_to = function)
45
+ #elif defined(__GNUC__)
46
+ #pragma GCC push_options
47
+ #pragma GCC target("arch=armv8-a+simd")
48
+ #endif
49
+
50
+ NK_INTERNAL void nk_deinterleave_f32x4_neon_(nk_f32_t const *ptr, float32x4_t *x_out, float32x4_t *y_out,
51
+ float32x4_t *z_out) {
52
+ // Deinterleave 12 floats (4 xyz triplets) into separate x, y, z vectors.
53
+ // Uses NEON vld3q for efficient stride-3 deinterleaving.
54
+ //
55
+ // Input: 12 contiguous floats [x0,y0,z0, x1,y1,z1, x2,y2,z2, x3,y3,z3]
56
+ // Output: x[4], y[4], z[4] vectors
57
+ float32x4x3_t xyz = vld3q_f32(ptr);
58
+ *x_out = xyz.val[0];
59
+ *y_out = xyz.val[1];
60
+ *z_out = xyz.val[2];
61
+ }
62
+
63
+ NK_INTERNAL void nk_deinterleave_f64x2_neon_(nk_f64_t const *ptr, float64x2_t *x_out, float64x2_t *y_out,
64
+ float64x2_t *z_out) {
65
+ // Deinterleave 6 f64 values (2 xyz triplets) into separate x, y, z vectors.
66
+ //
67
+ // Input: 6 contiguous f64 [x0,y0,z0, x1,y1,z1]
68
+ // Output: x[2], y[2], z[2] vectors
69
+ // NEON doesn't have vld3q_f64, so we use vcombine to avoid stack round-trips
70
+ // Load 2 xyz triplets: [x0,y0,z0, x1,y1,z1]
71
+ *x_out = vcombine_f64(vld1_f64(&ptr[0]), vld1_f64(&ptr[3]));
72
+ *y_out = vcombine_f64(vld1_f64(&ptr[1]), vld1_f64(&ptr[4]));
73
+ *z_out = vcombine_f64(vld1_f64(&ptr[2]), vld1_f64(&ptr[5]));
74
+ }
75
+
76
+ NK_INTERNAL float64x2_t nk_promote_upper_f32x4_to_f64x2_neon_(float32x4_t values_f32x4) {
77
+ return vcvt_f64_f32(vget_high_f32(values_f32x4));
78
+ }
79
+
80
+ NK_INTERNAL nk_f64_t nk_reduce_stable_f64x2_neon_(float64x2_t values_f64x2) {
81
+ nk_b128_vec_t values;
82
+ values.f64x2 = values_f64x2;
83
+ nk_f64_t sum = 0.0, compensation = 0.0;
84
+ nk_accumulate_sum_f64_(&sum, &compensation, values.f64s[0]);
85
+ nk_accumulate_sum_f64_(&sum, &compensation, values.f64s[1]);
86
+ return sum + compensation;
87
+ }
88
+
89
+ NK_INTERNAL void nk_rotation_from_svd_f64_neon_(nk_f64_t const *svd_u, nk_f64_t const *svd_v, nk_f64_t *rotation) {
90
+ nk_rotation_from_svd_f64_serial_(svd_u, svd_v, rotation);
91
+ }
92
+
93
NK_INTERNAL void nk_accumulate_square_f64x2_neon_(float64x2_t *sum_f64x2, float64x2_t *compensation_f64x2,
                                                  float64x2_t values_f64x2) {
    // Accumulates `values * values` into a compensated vector sum (Kahan/Neumaier style).
    // The FMA recovers the rounding error of the squaring exactly:
    //   product_error = values*values - round(values*values)
    float64x2_t product_f64x2 = vmulq_f64(values_f64x2, values_f64x2);
    float64x2_t product_error_f64x2 = vfmaq_f64(vnegq_f64(product_f64x2), values_f64x2, values_f64x2);
    // Branch-free 2Sum: recover the rounding error of `sum + product` exactly.
    // The statement order below is the algorithm - do not "simplify" algebraically,
    // as the intermediate cancellations are what capture the error term.
    float64x2_t tentative_sum_f64x2 = vaddq_f64(*sum_f64x2, product_f64x2);
    float64x2_t virtual_addend_f64x2 = vsubq_f64(tentative_sum_f64x2, *sum_f64x2);
    float64x2_t sum_error_f64x2 = vaddq_f64(vsubq_f64(*sum_f64x2, vsubq_f64(tentative_sum_f64x2, virtual_addend_f64x2)),
                                            vsubq_f64(product_f64x2, virtual_addend_f64x2));
    *sum_f64x2 = tentative_sum_f64x2;
    // Both error terms are deferred into the compensation register; the caller folds
    // them back in at final reduction time.
    *compensation_f64x2 = vaddq_f64(*compensation_f64x2, vaddq_f64(sum_error_f64x2, product_error_f64x2));
}
104
+
105
NK_INTERNAL void nk_bicentroid_f32_neon_(                 //
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,    //
    nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z,       //
    nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z) {     //
    // Computes the centroids of two packed-XYZ f32 point clouds `a` and `b` in a
    // single pass. Lanes are promoted to f64 before accumulation; each axis keeps
    // two independent f64x2 accumulators (lower/upper halves of every f32x4) to
    // shorten the dependency chain of the vector additions.
    // NOTE(review): `n == 0` divides by zero below and yields inf/NaN centroids -
    // callers presumably guarantee `n > 0`; confirm against `mesh/serial.h`.
    float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
    float64x2_t sum_a_x_lower_f64x2 = zero_f64x2, sum_a_x_upper_f64x2 = zero_f64x2;
    float64x2_t sum_a_y_lower_f64x2 = zero_f64x2, sum_a_y_upper_f64x2 = zero_f64x2;
    float64x2_t sum_a_z_lower_f64x2 = zero_f64x2, sum_a_z_upper_f64x2 = zero_f64x2;
    float64x2_t sum_b_x_lower_f64x2 = zero_f64x2, sum_b_x_upper_f64x2 = zero_f64x2;
    float64x2_t sum_b_y_lower_f64x2 = zero_f64x2, sum_b_y_upper_f64x2 = zero_f64x2;
    float64x2_t sum_b_z_lower_f64x2 = zero_f64x2, sum_b_z_upper_f64x2 = zero_f64x2;
    nk_size_t index = 0;

    // Main loop: 4 points (12 floats) per cloud per iteration via LD3.
    for (; index + 4 <= n; index += 4) {
        float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
        nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
            nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);

        // Widen each f32x4 into two f64x2 halves (exact - no rounding on promotion).
        float64x2_t a_x_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_x_f32x4));
        float64x2_t a_x_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4);
        float64x2_t a_y_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_y_f32x4));
        float64x2_t a_y_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4);
        float64x2_t a_z_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_z_f32x4));
        float64x2_t a_z_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4);
        float64x2_t b_x_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_x_f32x4));
        float64x2_t b_x_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4);
        float64x2_t b_y_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_y_f32x4));
        float64x2_t b_y_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4);
        float64x2_t b_z_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_z_f32x4));
        float64x2_t b_z_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4);

        // Independent lower/upper accumulators let the adds issue back-to-back.
        sum_a_x_lower_f64x2 = vaddq_f64(sum_a_x_lower_f64x2, a_x_lower_f64x2),
        sum_a_x_upper_f64x2 = vaddq_f64(sum_a_x_upper_f64x2, a_x_upper_f64x2);
        sum_a_y_lower_f64x2 = vaddq_f64(sum_a_y_lower_f64x2, a_y_lower_f64x2),
        sum_a_y_upper_f64x2 = vaddq_f64(sum_a_y_upper_f64x2, a_y_upper_f64x2);
        sum_a_z_lower_f64x2 = vaddq_f64(sum_a_z_lower_f64x2, a_z_lower_f64x2),
        sum_a_z_upper_f64x2 = vaddq_f64(sum_a_z_upper_f64x2, a_z_upper_f64x2);
        sum_b_x_lower_f64x2 = vaddq_f64(sum_b_x_lower_f64x2, b_x_lower_f64x2),
        sum_b_x_upper_f64x2 = vaddq_f64(sum_b_x_upper_f64x2, b_x_upper_f64x2);
        sum_b_y_lower_f64x2 = vaddq_f64(sum_b_y_lower_f64x2, b_y_lower_f64x2),
        sum_b_y_upper_f64x2 = vaddq_f64(sum_b_y_upper_f64x2, b_y_upper_f64x2);
        sum_b_z_lower_f64x2 = vaddq_f64(sum_b_z_lower_f64x2, b_z_lower_f64x2),
        sum_b_z_upper_f64x2 = vaddq_f64(sum_b_z_upper_f64x2, b_z_upper_f64x2);
    }

    // Fold vector accumulators down to scalars before handling the tail.
    nk_f64_t sum_a_x = vaddvq_f64(vaddq_f64(sum_a_x_lower_f64x2, sum_a_x_upper_f64x2));
    nk_f64_t sum_a_y = vaddvq_f64(vaddq_f64(sum_a_y_lower_f64x2, sum_a_y_upper_f64x2));
    nk_f64_t sum_a_z = vaddvq_f64(vaddq_f64(sum_a_z_lower_f64x2, sum_a_z_upper_f64x2));
    nk_f64_t sum_b_x = vaddvq_f64(vaddq_f64(sum_b_x_lower_f64x2, sum_b_x_upper_f64x2));
    nk_f64_t sum_b_y = vaddvq_f64(vaddq_f64(sum_b_y_lower_f64x2, sum_b_y_upper_f64x2));
    nk_f64_t sum_b_z = vaddvq_f64(vaddq_f64(sum_b_z_lower_f64x2, sum_b_z_upper_f64x2));

    // Scalar tail: remaining 0..3 points.
    for (; index < n; ++index) {
        sum_a_x += a[index * 3 + 0], sum_a_y += a[index * 3 + 1], sum_a_z += a[index * 3 + 2];
        sum_b_x += b[index * 3 + 0], sum_b_y += b[index * 3 + 1], sum_b_z += b[index * 3 + 2];
    }

    nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
    *ca_x = sum_a_x * inv_n, *ca_y = sum_a_y * inv_n, *ca_z = sum_a_z * inv_n;
    *cb_x = sum_b_x * inv_n, *cb_y = sum_b_y * inv_n, *cb_z = sum_b_z * inv_n;
}
166
+
167
NK_INTERNAL void nk_cross_covariance_f32_neon_( //
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t centroid_a_x, nk_f64_t centroid_a_y,
    nk_f64_t centroid_a_z, nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z, nk_f64_t h[9]) {
    // Accumulates the 3x3 cross-covariance matrix H = sum_i (a_i - ca) * (b_i - cb)^T
    // over two packed-XYZ f32 point clouds, centered at the given centroids.
    // H is written row-major into `h` (h[3*r + c] = row r of A-axis, column c of B-axis).
    // Each of the 9 entries keeps two independent f64x2 accumulators (lower/upper
    // halves of every f32x4) so FMAs to distinct registers can overlap.
    float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
    float64x2_t centroid_a_x_f64x2 = vdupq_n_f64(centroid_a_x), centroid_a_y_f64x2 = vdupq_n_f64(centroid_a_y);
    float64x2_t centroid_a_z_f64x2 = vdupq_n_f64(centroid_a_z), centroid_b_x_f64x2 = vdupq_n_f64(centroid_b_x);
    float64x2_t centroid_b_y_f64x2 = vdupq_n_f64(centroid_b_y), centroid_b_z_f64x2 = vdupq_n_f64(centroid_b_z);
    float64x2_t cross_00_lower_f64x2 = zero_f64x2, cross_00_upper_f64x2 = zero_f64x2;
    float64x2_t cross_01_lower_f64x2 = zero_f64x2, cross_01_upper_f64x2 = zero_f64x2;
    float64x2_t cross_02_lower_f64x2 = zero_f64x2, cross_02_upper_f64x2 = zero_f64x2;
    float64x2_t cross_10_lower_f64x2 = zero_f64x2, cross_10_upper_f64x2 = zero_f64x2;
    float64x2_t cross_11_lower_f64x2 = zero_f64x2, cross_11_upper_f64x2 = zero_f64x2;
    float64x2_t cross_12_lower_f64x2 = zero_f64x2, cross_12_upper_f64x2 = zero_f64x2;
    float64x2_t cross_20_lower_f64x2 = zero_f64x2, cross_20_upper_f64x2 = zero_f64x2;
    float64x2_t cross_21_lower_f64x2 = zero_f64x2, cross_21_upper_f64x2 = zero_f64x2;
    float64x2_t cross_22_lower_f64x2 = zero_f64x2, cross_22_upper_f64x2 = zero_f64x2;
    nk_size_t index = 0;

    // Main loop: 4 points per cloud per iteration via LD3.
    for (; index + 4 <= n; index += 4) {
        float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
        nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
            nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);

        // Promote each f32x4 half to f64x2 (exact), then subtract the centroid.
        float64x2_t centered_a_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_x_f32x4)), centroid_a_x_f64x2);
        float64x2_t centered_a_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4),
                                                         centroid_a_x_f64x2);
        float64x2_t centered_a_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_y_f32x4)), centroid_a_y_f64x2);
        float64x2_t centered_a_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4),
                                                         centroid_a_y_f64x2);
        float64x2_t centered_a_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_z_f32x4)), centroid_a_z_f64x2);
        float64x2_t centered_a_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4),
                                                         centroid_a_z_f64x2);
        float64x2_t centered_b_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_x_f32x4)), centroid_b_x_f64x2);
        float64x2_t centered_b_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4),
                                                         centroid_b_x_f64x2);
        float64x2_t centered_b_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_y_f32x4)), centroid_b_y_f64x2);
        float64x2_t centered_b_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4),
                                                         centroid_b_y_f64x2);
        float64x2_t centered_b_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_z_f32x4)), centroid_b_z_f64x2);
        float64x2_t centered_b_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4),
                                                         centroid_b_z_f64x2);

        // Outer product accumulation: 9 entries x 2 halves = 18 independent FMAs.
        cross_00_lower_f64x2 = vfmaq_f64(cross_00_lower_f64x2, centered_a_x_lower_f64x2, centered_b_x_lower_f64x2),
        cross_00_upper_f64x2 = vfmaq_f64(cross_00_upper_f64x2, centered_a_x_upper_f64x2, centered_b_x_upper_f64x2);
        cross_01_lower_f64x2 = vfmaq_f64(cross_01_lower_f64x2, centered_a_x_lower_f64x2, centered_b_y_lower_f64x2),
        cross_01_upper_f64x2 = vfmaq_f64(cross_01_upper_f64x2, centered_a_x_upper_f64x2, centered_b_y_upper_f64x2);
        cross_02_lower_f64x2 = vfmaq_f64(cross_02_lower_f64x2, centered_a_x_lower_f64x2, centered_b_z_lower_f64x2),
        cross_02_upper_f64x2 = vfmaq_f64(cross_02_upper_f64x2, centered_a_x_upper_f64x2, centered_b_z_upper_f64x2);
        cross_10_lower_f64x2 = vfmaq_f64(cross_10_lower_f64x2, centered_a_y_lower_f64x2, centered_b_x_lower_f64x2),
        cross_10_upper_f64x2 = vfmaq_f64(cross_10_upper_f64x2, centered_a_y_upper_f64x2, centered_b_x_upper_f64x2);
        cross_11_lower_f64x2 = vfmaq_f64(cross_11_lower_f64x2, centered_a_y_lower_f64x2, centered_b_y_lower_f64x2),
        cross_11_upper_f64x2 = vfmaq_f64(cross_11_upper_f64x2, centered_a_y_upper_f64x2, centered_b_y_upper_f64x2);
        cross_12_lower_f64x2 = vfmaq_f64(cross_12_lower_f64x2, centered_a_y_lower_f64x2, centered_b_z_lower_f64x2),
        cross_12_upper_f64x2 = vfmaq_f64(cross_12_upper_f64x2, centered_a_y_upper_f64x2, centered_b_z_upper_f64x2);
        cross_20_lower_f64x2 = vfmaq_f64(cross_20_lower_f64x2, centered_a_z_lower_f64x2, centered_b_x_lower_f64x2),
        cross_20_upper_f64x2 = vfmaq_f64(cross_20_upper_f64x2, centered_a_z_upper_f64x2, centered_b_x_upper_f64x2);
        cross_21_lower_f64x2 = vfmaq_f64(cross_21_lower_f64x2, centered_a_z_lower_f64x2, centered_b_y_lower_f64x2),
        cross_21_upper_f64x2 = vfmaq_f64(cross_21_upper_f64x2, centered_a_z_upper_f64x2, centered_b_y_upper_f64x2);
        cross_22_lower_f64x2 = vfmaq_f64(cross_22_lower_f64x2, centered_a_z_lower_f64x2, centered_b_z_lower_f64x2),
        cross_22_upper_f64x2 = vfmaq_f64(cross_22_upper_f64x2, centered_a_z_upper_f64x2, centered_b_z_upper_f64x2);
    }

    // Store vector partial sums first; the scalar tail below adds on top of h[].
    h[0] = vaddvq_f64(vaddq_f64(cross_00_lower_f64x2, cross_00_upper_f64x2));
    h[1] = vaddvq_f64(vaddq_f64(cross_01_lower_f64x2, cross_01_upper_f64x2));
    h[2] = vaddvq_f64(vaddq_f64(cross_02_lower_f64x2, cross_02_upper_f64x2));
    h[3] = vaddvq_f64(vaddq_f64(cross_10_lower_f64x2, cross_10_upper_f64x2));
    h[4] = vaddvq_f64(vaddq_f64(cross_11_lower_f64x2, cross_11_upper_f64x2));
    h[5] = vaddvq_f64(vaddq_f64(cross_12_lower_f64x2, cross_12_upper_f64x2));
    h[6] = vaddvq_f64(vaddq_f64(cross_20_lower_f64x2, cross_20_upper_f64x2));
    h[7] = vaddvq_f64(vaddq_f64(cross_21_lower_f64x2, cross_21_upper_f64x2));
    h[8] = vaddvq_f64(vaddq_f64(cross_22_lower_f64x2, cross_22_upper_f64x2));

    // Scalar tail: remaining 0..3 points.
    for (; index < n; ++index) {
        nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x;
        nk_f64_t centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y;
        nk_f64_t centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
        nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x;
        nk_f64_t centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y;
        nk_f64_t centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
        h[0] += centered_a_x * centered_b_x, h[1] += centered_a_x * centered_b_y, h[2] += centered_a_x * centered_b_z;
        h[3] += centered_a_y * centered_b_x, h[4] += centered_a_y * centered_b_y, h[5] += centered_a_y * centered_b_z;
        h[6] += centered_a_z * centered_b_x, h[7] += centered_a_z * centered_b_y, h[8] += centered_a_z * centered_b_z;
    }
}
251
+
252
/* Accumulate the 3x3 cross-covariance matrix `h` between the centered f32 point clouds
 * `a` and `b` (interleaved xyz triplets), and the variance of the centered `a` cloud
 * into `*variance_a`. All arithmetic is widened to f64; every accumulator is split into
 * independent lower/upper f64x2 lanes so the FMA dependency chains overlap.
 * NOTE(review): divides by `n` — assumes n > 0; confirm callers guarantee this. */
NK_INTERNAL void nk_cross_covariance_and_variance_f32_neon_( //
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t centroid_a_x, nk_f64_t centroid_a_y,
    nk_f64_t centroid_a_z, nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z, nk_f64_t h[9],
    nk_f64_t *variance_a) {
    float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
    // Broadcast the six centroid components once; they are subtracted from every lane.
    float64x2_t centroid_a_x_f64x2 = vdupq_n_f64(centroid_a_x), centroid_a_y_f64x2 = vdupq_n_f64(centroid_a_y);
    float64x2_t centroid_a_z_f64x2 = vdupq_n_f64(centroid_a_z), centroid_b_x_f64x2 = vdupq_n_f64(centroid_b_x);
    float64x2_t centroid_b_y_f64x2 = vdupq_n_f64(centroid_b_y), centroid_b_z_f64x2 = vdupq_n_f64(centroid_b_z);
    // One lower/upper accumulator pair per entry of the 3x3 matrix H (row-major).
    float64x2_t cross_00_lower_f64x2 = zero_f64x2, cross_00_upper_f64x2 = zero_f64x2;
    float64x2_t cross_01_lower_f64x2 = zero_f64x2, cross_01_upper_f64x2 = zero_f64x2;
    float64x2_t cross_02_lower_f64x2 = zero_f64x2, cross_02_upper_f64x2 = zero_f64x2;
    float64x2_t cross_10_lower_f64x2 = zero_f64x2, cross_10_upper_f64x2 = zero_f64x2;
    float64x2_t cross_11_lower_f64x2 = zero_f64x2, cross_11_upper_f64x2 = zero_f64x2;
    float64x2_t cross_12_lower_f64x2 = zero_f64x2, cross_12_upper_f64x2 = zero_f64x2;
    float64x2_t cross_20_lower_f64x2 = zero_f64x2, cross_20_upper_f64x2 = zero_f64x2;
    float64x2_t cross_21_lower_f64x2 = zero_f64x2, cross_21_upper_f64x2 = zero_f64x2;
    float64x2_t cross_22_lower_f64x2 = zero_f64x2, cross_22_upper_f64x2 = zero_f64x2;
    float64x2_t variance_lower_f64x2 = zero_f64x2, variance_upper_f64x2 = zero_f64x2;
    nk_size_t index = 0;

    // Main loop: 4 interleaved points per iteration, de-interleaved into x/y/z vectors.
    for (; index + 4 <= n; index += 4) {
        float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
        nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
            nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);

        // Widen each f32x4 half to f64x2 and center it on the corresponding centroid.
        float64x2_t centered_a_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_x_f32x4)), centroid_a_x_f64x2);
        float64x2_t centered_a_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4),
                                                         centroid_a_x_f64x2);
        float64x2_t centered_a_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_y_f32x4)), centroid_a_y_f64x2);
        float64x2_t centered_a_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4),
                                                         centroid_a_y_f64x2);
        float64x2_t centered_a_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_z_f32x4)), centroid_a_z_f64x2);
        float64x2_t centered_a_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4),
                                                         centroid_a_z_f64x2);
        float64x2_t centered_b_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_x_f32x4)), centroid_b_x_f64x2);
        float64x2_t centered_b_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4),
                                                         centroid_b_x_f64x2);
        float64x2_t centered_b_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_y_f32x4)), centroid_b_y_f64x2);
        float64x2_t centered_b_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4),
                                                         centroid_b_y_f64x2);
        float64x2_t centered_b_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_z_f32x4)), centroid_b_z_f64x2);
        float64x2_t centered_b_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4),
                                                         centroid_b_z_f64x2);

        // H[r][c] += centered_a[r] * centered_b[c], lower and upper lanes independently.
        cross_00_lower_f64x2 = vfmaq_f64(cross_00_lower_f64x2, centered_a_x_lower_f64x2, centered_b_x_lower_f64x2),
        cross_00_upper_f64x2 = vfmaq_f64(cross_00_upper_f64x2, centered_a_x_upper_f64x2, centered_b_x_upper_f64x2);
        cross_01_lower_f64x2 = vfmaq_f64(cross_01_lower_f64x2, centered_a_x_lower_f64x2, centered_b_y_lower_f64x2),
        cross_01_upper_f64x2 = vfmaq_f64(cross_01_upper_f64x2, centered_a_x_upper_f64x2, centered_b_y_upper_f64x2);
        cross_02_lower_f64x2 = vfmaq_f64(cross_02_lower_f64x2, centered_a_x_lower_f64x2, centered_b_z_lower_f64x2),
        cross_02_upper_f64x2 = vfmaq_f64(cross_02_upper_f64x2, centered_a_x_upper_f64x2, centered_b_z_upper_f64x2);
        cross_10_lower_f64x2 = vfmaq_f64(cross_10_lower_f64x2, centered_a_y_lower_f64x2, centered_b_x_lower_f64x2),
        cross_10_upper_f64x2 = vfmaq_f64(cross_10_upper_f64x2, centered_a_y_upper_f64x2, centered_b_x_upper_f64x2);
        cross_11_lower_f64x2 = vfmaq_f64(cross_11_lower_f64x2, centered_a_y_lower_f64x2, centered_b_y_lower_f64x2),
        cross_11_upper_f64x2 = vfmaq_f64(cross_11_upper_f64x2, centered_a_y_upper_f64x2, centered_b_y_upper_f64x2);
        cross_12_lower_f64x2 = vfmaq_f64(cross_12_lower_f64x2, centered_a_y_lower_f64x2, centered_b_z_lower_f64x2),
        cross_12_upper_f64x2 = vfmaq_f64(cross_12_upper_f64x2, centered_a_y_upper_f64x2, centered_b_z_upper_f64x2);
        cross_20_lower_f64x2 = vfmaq_f64(cross_20_lower_f64x2, centered_a_z_lower_f64x2, centered_b_x_lower_f64x2),
        cross_20_upper_f64x2 = vfmaq_f64(cross_20_upper_f64x2, centered_a_z_upper_f64x2, centered_b_x_upper_f64x2);
        cross_21_lower_f64x2 = vfmaq_f64(cross_21_lower_f64x2, centered_a_z_lower_f64x2, centered_b_y_lower_f64x2),
        cross_21_upper_f64x2 = vfmaq_f64(cross_21_upper_f64x2, centered_a_z_upper_f64x2, centered_b_y_upper_f64x2);
        cross_22_lower_f64x2 = vfmaq_f64(cross_22_lower_f64x2, centered_a_z_lower_f64x2, centered_b_z_lower_f64x2),
        cross_22_upper_f64x2 = vfmaq_f64(cross_22_upper_f64x2, centered_a_z_upper_f64x2, centered_b_z_upper_f64x2);

        // Variance of centered `a`: sum of squares over all three components.
        variance_lower_f64x2 = vfmaq_f64(variance_lower_f64x2, centered_a_x_lower_f64x2, centered_a_x_lower_f64x2),
        variance_upper_f64x2 = vfmaq_f64(variance_upper_f64x2, centered_a_x_upper_f64x2, centered_a_x_upper_f64x2);
        variance_lower_f64x2 = vfmaq_f64(variance_lower_f64x2, centered_a_y_lower_f64x2, centered_a_y_lower_f64x2),
        variance_upper_f64x2 = vfmaq_f64(variance_upper_f64x2, centered_a_y_upper_f64x2, centered_a_y_upper_f64x2);
        variance_lower_f64x2 = vfmaq_f64(variance_lower_f64x2, centered_a_z_lower_f64x2, centered_a_z_lower_f64x2),
        variance_upper_f64x2 = vfmaq_f64(variance_upper_f64x2, centered_a_z_upper_f64x2, centered_a_z_upper_f64x2);
    }

    // Horizontal reductions into the row-major output matrix.
    h[0] = vaddvq_f64(vaddq_f64(cross_00_lower_f64x2, cross_00_upper_f64x2));
    h[1] = vaddvq_f64(vaddq_f64(cross_01_lower_f64x2, cross_01_upper_f64x2));
    h[2] = vaddvq_f64(vaddq_f64(cross_02_lower_f64x2, cross_02_upper_f64x2));
    h[3] = vaddvq_f64(vaddq_f64(cross_10_lower_f64x2, cross_10_upper_f64x2));
    h[4] = vaddvq_f64(vaddq_f64(cross_11_lower_f64x2, cross_11_upper_f64x2));
    h[5] = vaddvq_f64(vaddq_f64(cross_12_lower_f64x2, cross_12_upper_f64x2));
    h[6] = vaddvq_f64(vaddq_f64(cross_20_lower_f64x2, cross_20_upper_f64x2));
    h[7] = vaddvq_f64(vaddq_f64(cross_21_lower_f64x2, cross_21_upper_f64x2));
    h[8] = vaddvq_f64(vaddq_f64(cross_22_lower_f64x2, cross_22_upper_f64x2));
    // Vector part of the variance, already divided by n; the tail below adds its
    // per-point contributions pre-divided so the final value stays a mean.
    *variance_a = vaddvq_f64(vaddq_f64(variance_lower_f64x2, variance_upper_f64x2)) / (nk_f64_t)n;

    // Scalar tail for the final n % 4 points.
    for (; index < n; ++index) {
        nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x;
        nk_f64_t centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y;
        nk_f64_t centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
        nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x;
        nk_f64_t centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y;
        nk_f64_t centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
        h[0] += centered_a_x * centered_b_x, h[1] += centered_a_x * centered_b_y, h[2] += centered_a_x * centered_b_z;
        h[3] += centered_a_y * centered_b_x, h[4] += centered_a_y * centered_b_y, h[5] += centered_a_y * centered_b_z;
        h[6] += centered_a_z * centered_b_x, h[7] += centered_a_z * centered_b_y, h[8] += centered_a_z * centered_b_z;
        *variance_a += (centered_a_x * centered_a_x + centered_a_y * centered_a_y + centered_a_z * centered_a_z) /
                       (nk_f64_t)n;
    }
}
348
+
349
/* Compute the sum of squared distances for f32 point clouds after applying the scaled
 * rotation `r` (row-major 3x3, pre-multiplied by `scale`) to the centered `a` points and
 * comparing against the centered `b` points. Arithmetic is widened to f64, with separate
 * lower/upper f64x2 accumulators to overlap the FMA chains. Mirrors
 * nk_transformed_ssd_f64_neon_ below, minus that version's compensated summation. */
NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_neon_( //
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t const *r, nk_f64_t scale, nk_f64_t centroid_a_x,
    nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
    // Fold `scale` into the rotation once, so the loop applies a single matrix multiply.
    float64x2_t scaled_rotation_x_x_f64x2 = vdupq_n_f64(scale * r[0]);
    float64x2_t scaled_rotation_x_y_f64x2 = vdupq_n_f64(scale * r[1]);
    float64x2_t scaled_rotation_x_z_f64x2 = vdupq_n_f64(scale * r[2]);
    float64x2_t scaled_rotation_y_x_f64x2 = vdupq_n_f64(scale * r[3]);
    float64x2_t scaled_rotation_y_y_f64x2 = vdupq_n_f64(scale * r[4]);
    float64x2_t scaled_rotation_y_z_f64x2 = vdupq_n_f64(scale * r[5]);
    float64x2_t scaled_rotation_z_x_f64x2 = vdupq_n_f64(scale * r[6]);
    float64x2_t scaled_rotation_z_y_f64x2 = vdupq_n_f64(scale * r[7]);
    float64x2_t scaled_rotation_z_z_f64x2 = vdupq_n_f64(scale * r[8]);
    float64x2_t centroid_a_x_f64x2 = vdupq_n_f64(centroid_a_x), centroid_a_y_f64x2 = vdupq_n_f64(centroid_a_y);
    float64x2_t centroid_a_z_f64x2 = vdupq_n_f64(centroid_a_z), centroid_b_x_f64x2 = vdupq_n_f64(centroid_b_x);
    float64x2_t centroid_b_y_f64x2 = vdupq_n_f64(centroid_b_y), centroid_b_z_f64x2 = vdupq_n_f64(centroid_b_z);
    float64x2_t sum_squared_lower_f64x2 = vdupq_n_f64(0.0), sum_squared_upper_f64x2 = vdupq_n_f64(0.0);
    nk_size_t index = 0;

    // Main loop: 4 interleaved points per iteration.
    for (; index + 4 <= n; index += 4) {
        float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
        nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
            nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);

        // Widen to f64 and center both clouds on their centroids.
        float64x2_t centered_a_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_x_f32x4)), centroid_a_x_f64x2);
        float64x2_t centered_a_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4),
                                                         centroid_a_x_f64x2);
        float64x2_t centered_a_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_y_f32x4)), centroid_a_y_f64x2);
        float64x2_t centered_a_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4),
                                                         centroid_a_y_f64x2);
        float64x2_t centered_a_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_z_f32x4)), centroid_a_z_f64x2);
        float64x2_t centered_a_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4),
                                                         centroid_a_z_f64x2);
        float64x2_t centered_b_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_x_f32x4)), centroid_b_x_f64x2);
        float64x2_t centered_b_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4),
                                                         centroid_b_x_f64x2);
        float64x2_t centered_b_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_y_f32x4)), centroid_b_y_f64x2);
        float64x2_t centered_b_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4),
                                                         centroid_b_y_f64x2);
        float64x2_t centered_b_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_z_f32x4)), centroid_b_z_f64x2);
        float64x2_t centered_b_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4),
                                                         centroid_b_z_f64x2);

        // rotated = (scale * R) * centered_a, each row built as mul + two chained FMAs.
        float64x2_t rotated_a_x_lower_f64x2 = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a_x_lower_f64x2), scaled_rotation_x_y_f64x2,
                      centered_a_y_lower_f64x2),
            scaled_rotation_x_z_f64x2, centered_a_z_lower_f64x2);
        float64x2_t rotated_a_x_upper_f64x2 = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a_x_upper_f64x2), scaled_rotation_x_y_f64x2,
                      centered_a_y_upper_f64x2),
            scaled_rotation_x_z_f64x2, centered_a_z_upper_f64x2);
        float64x2_t rotated_a_y_lower_f64x2 = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a_x_lower_f64x2), scaled_rotation_y_y_f64x2,
                      centered_a_y_lower_f64x2),
            scaled_rotation_y_z_f64x2, centered_a_z_lower_f64x2);
        float64x2_t rotated_a_y_upper_f64x2 = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a_x_upper_f64x2), scaled_rotation_y_y_f64x2,
                      centered_a_y_upper_f64x2),
            scaled_rotation_y_z_f64x2, centered_a_z_upper_f64x2);
        float64x2_t rotated_a_z_lower_f64x2 = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a_x_lower_f64x2), scaled_rotation_z_y_f64x2,
                      centered_a_y_lower_f64x2),
            scaled_rotation_z_z_f64x2, centered_a_z_lower_f64x2);
        float64x2_t rotated_a_z_upper_f64x2 = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a_x_upper_f64x2), scaled_rotation_z_y_f64x2,
                      centered_a_y_upper_f64x2),
            scaled_rotation_z_z_f64x2, centered_a_z_upper_f64x2);

        // Residual between transformed `a` and centered `b`.
        float64x2_t delta_x_lower_f64x2 = vsubq_f64(rotated_a_x_lower_f64x2, centered_b_x_lower_f64x2);
        float64x2_t delta_x_upper_f64x2 = vsubq_f64(rotated_a_x_upper_f64x2, centered_b_x_upper_f64x2);
        float64x2_t delta_y_lower_f64x2 = vsubq_f64(rotated_a_y_lower_f64x2, centered_b_y_lower_f64x2);
        float64x2_t delta_y_upper_f64x2 = vsubq_f64(rotated_a_y_upper_f64x2, centered_b_y_upper_f64x2);
        float64x2_t delta_z_lower_f64x2 = vsubq_f64(rotated_a_z_lower_f64x2, centered_b_z_lower_f64x2);
        float64x2_t delta_z_upper_f64x2 = vsubq_f64(rotated_a_z_upper_f64x2, centered_b_z_upper_f64x2);

        sum_squared_lower_f64x2 = vfmaq_f64(sum_squared_lower_f64x2, delta_x_lower_f64x2, delta_x_lower_f64x2),
        sum_squared_upper_f64x2 = vfmaq_f64(sum_squared_upper_f64x2, delta_x_upper_f64x2, delta_x_upper_f64x2);
        sum_squared_lower_f64x2 = vfmaq_f64(sum_squared_lower_f64x2, delta_y_lower_f64x2, delta_y_lower_f64x2),
        sum_squared_upper_f64x2 = vfmaq_f64(sum_squared_upper_f64x2, delta_y_upper_f64x2, delta_y_upper_f64x2);
        sum_squared_lower_f64x2 = vfmaq_f64(sum_squared_lower_f64x2, delta_z_lower_f64x2, delta_z_lower_f64x2),
        sum_squared_upper_f64x2 = vfmaq_f64(sum_squared_upper_f64x2, delta_z_upper_f64x2, delta_z_upper_f64x2);
    }

    // Horizontal reduction, then a scalar tail over the final n % 4 points.
    nk_f64_t sum_squared = vaddvq_f64(vaddq_f64(sum_squared_lower_f64x2, sum_squared_upper_f64x2));
    for (; index < n; ++index) {
        nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x;
        nk_f64_t centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y;
        nk_f64_t centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
        nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x;
        nk_f64_t centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y;
        nk_f64_t centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
        nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z);
        nk_f64_t rotated_a_y = scale * (r[3] * centered_a_x + r[4] * centered_a_y + r[5] * centered_a_z);
        nk_f64_t rotated_a_z = scale * (r[6] * centered_a_x + r[7] * centered_a_y + r[8] * centered_a_z);
        nk_f64_t delta_x = rotated_a_x - centered_b_x, delta_y = rotated_a_y - centered_b_y,
                 delta_z = rotated_a_z - centered_b_z;
        sum_squared += delta_x * delta_x + delta_y * delta_y + delta_z * delta_z;
    }

    return sum_squared;
}
449
+
450
/* Compute sum of squared distances for f64 after applying rotation (and optional scale).
 *
 * `r` is a row-major 3x3 rotation matrix, pre-multiplied by `scale` before the loop;
 * `a` and `b` are interleaved xyz triplets, centered on the given centroids.
 *
 * Optimization: 2x loop unrolling with multiple accumulators hides FMA latency (3-7 cycles).
 * Each accumulator carries a compensation term (Kahan-style, via
 * nk_accumulate_square_f64x2_neon_) for numerically stable summation.
 */
NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_neon_(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t const *r,
                                                  nk_f64_t scale, nk_f64_t centroid_a_x, nk_f64_t centroid_a_y,
                                                  nk_f64_t centroid_a_z, nk_f64_t centroid_b_x, nk_f64_t centroid_b_y,
                                                  nk_f64_t centroid_b_z) {
    // Broadcast scaled rotation matrix elements
    float64x2_t scaled_rotation_x_x_f64x2 = vdupq_n_f64(scale * r[0]);
    float64x2_t scaled_rotation_x_y_f64x2 = vdupq_n_f64(scale * r[1]);
    float64x2_t scaled_rotation_x_z_f64x2 = vdupq_n_f64(scale * r[2]);
    float64x2_t scaled_rotation_y_x_f64x2 = vdupq_n_f64(scale * r[3]);
    float64x2_t scaled_rotation_y_y_f64x2 = vdupq_n_f64(scale * r[4]);
    float64x2_t scaled_rotation_y_z_f64x2 = vdupq_n_f64(scale * r[5]);
    float64x2_t scaled_rotation_z_x_f64x2 = vdupq_n_f64(scale * r[6]);
    float64x2_t scaled_rotation_z_y_f64x2 = vdupq_n_f64(scale * r[7]);
    float64x2_t scaled_rotation_z_z_f64x2 = vdupq_n_f64(scale * r[8]);

    // Broadcast centroids
    float64x2_t centroid_a_x_f64x2 = vdupq_n_f64(centroid_a_x);
    float64x2_t centroid_a_y_f64x2 = vdupq_n_f64(centroid_a_y);
    float64x2_t centroid_a_z_f64x2 = vdupq_n_f64(centroid_a_z);
    float64x2_t centroid_b_x_f64x2 = vdupq_n_f64(centroid_b_x);
    float64x2_t centroid_b_y_f64x2 = vdupq_n_f64(centroid_b_y);
    float64x2_t centroid_b_z_f64x2 = vdupq_n_f64(centroid_b_z);

    // Two independent accumulators to hide FMA latency
    float64x2_t sum_squared_a_f64x2 = vdupq_n_f64(0), sum_squared_a_compensation_f64x2 = vdupq_n_f64(0);
    float64x2_t sum_squared_b_f64x2 = vdupq_n_f64(0), sum_squared_b_compensation_f64x2 = vdupq_n_f64(0);
    nk_size_t j = 0;

    // Main loop: process 4 points per iteration (2x unrolled, 2 points per batch)
    for (; j + 4 <= n; j += 4) {
        // First batch of 2 points
        float64x2_t a1_x, a1_y, a1_z, b1_x, b1_y, b1_z;
        nk_deinterleave_f64x2_neon_(a + j * 3, &a1_x, &a1_y, &a1_z);
        nk_deinterleave_f64x2_neon_(b + j * 3, &b1_x, &b1_y, &b1_z);

        // Second batch of 2 points
        float64x2_t a2_x, a2_y, a2_z, b2_x, b2_y, b2_z;
        nk_deinterleave_f64x2_neon_(a + (j + 2) * 3, &a2_x, &a2_y, &a2_z);
        nk_deinterleave_f64x2_neon_(b + (j + 2) * 3, &b2_x, &b2_y, &b2_z);

        // Center first batch
        float64x2_t pa1_x = vsubq_f64(a1_x, centroid_a_x_f64x2);
        float64x2_t pa1_y = vsubq_f64(a1_y, centroid_a_y_f64x2);
        float64x2_t pa1_z = vsubq_f64(a1_z, centroid_a_z_f64x2);
        float64x2_t pb1_x = vsubq_f64(b1_x, centroid_b_x_f64x2);
        float64x2_t pb1_y = vsubq_f64(b1_y, centroid_b_y_f64x2);
        float64x2_t pb1_z = vsubq_f64(b1_z, centroid_b_z_f64x2);

        // Center second batch
        float64x2_t pa2_x = vsubq_f64(a2_x, centroid_a_x_f64x2);
        float64x2_t pa2_y = vsubq_f64(a2_y, centroid_a_y_f64x2);
        float64x2_t pa2_z = vsubq_f64(a2_z, centroid_a_z_f64x2);
        float64x2_t pb2_x = vsubq_f64(b2_x, centroid_b_x_f64x2);
        float64x2_t pb2_y = vsubq_f64(b2_y, centroid_b_y_f64x2);
        float64x2_t pb2_z = vsubq_f64(b2_z, centroid_b_z_f64x2);

        // Rotate and scale first batch: each row is mul + two chained FMAs
        float64x2_t ra1_x = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, pa1_x), scaled_rotation_x_y_f64x2, pa1_y),
            scaled_rotation_x_z_f64x2, pa1_z);
        float64x2_t ra1_y = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, pa1_x), scaled_rotation_y_y_f64x2, pa1_y),
            scaled_rotation_y_z_f64x2, pa1_z);
        float64x2_t ra1_z = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, pa1_x), scaled_rotation_z_y_f64x2, pa1_y),
            scaled_rotation_z_z_f64x2, pa1_z);

        // Rotate and scale second batch
        float64x2_t ra2_x = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, pa2_x), scaled_rotation_x_y_f64x2, pa2_y),
            scaled_rotation_x_z_f64x2, pa2_z);
        float64x2_t ra2_y = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, pa2_x), scaled_rotation_y_y_f64x2, pa2_y),
            scaled_rotation_y_z_f64x2, pa2_z);
        float64x2_t ra2_z = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, pa2_x), scaled_rotation_z_y_f64x2, pa2_y),
            scaled_rotation_z_z_f64x2, pa2_z);

        // Deltas
        float64x2_t delta1_x = vsubq_f64(ra1_x, pb1_x);
        float64x2_t delta1_y = vsubq_f64(ra1_y, pb1_y);
        float64x2_t delta1_z = vsubq_f64(ra1_z, pb1_z);
        float64x2_t delta2_x = vsubq_f64(ra2_x, pb2_x);
        float64x2_t delta2_y = vsubq_f64(ra2_y, pb2_y);
        float64x2_t delta2_z = vsubq_f64(ra2_z, pb2_z);

        // Accumulate to independent accumulators (interleaved for latency hiding)
        nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_x);
        nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_x);
        nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_y);
        nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_y);
        nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_z);
        nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_z);
    }

    // Handle remaining 2 points
    if (j + 2 <= n) {
        float64x2_t a_x, a_y, a_z, b_x, b_y, b_z;
        nk_deinterleave_f64x2_neon_(a + j * 3, &a_x, &a_y, &a_z);
        nk_deinterleave_f64x2_neon_(b + j * 3, &b_x, &b_y, &b_z);

        float64x2_t pa_x = vsubq_f64(a_x, centroid_a_x_f64x2);
        float64x2_t pa_y = vsubq_f64(a_y, centroid_a_y_f64x2);
        float64x2_t pa_z = vsubq_f64(a_z, centroid_a_z_f64x2);
        float64x2_t pb_x = vsubq_f64(b_x, centroid_b_x_f64x2);
        float64x2_t pb_y = vsubq_f64(b_y, centroid_b_y_f64x2);
        float64x2_t pb_z = vsubq_f64(b_z, centroid_b_z_f64x2);

        float64x2_t ra_x = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, pa_x), scaled_rotation_x_y_f64x2, pa_y),
            scaled_rotation_x_z_f64x2, pa_z);
        float64x2_t ra_y = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, pa_x), scaled_rotation_y_y_f64x2, pa_y),
            scaled_rotation_y_z_f64x2, pa_z);
        float64x2_t ra_z = vfmaq_f64(
            vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, pa_x), scaled_rotation_z_y_f64x2, pa_y),
            scaled_rotation_z_z_f64x2, pa_z);

        float64x2_t delta_x = vsubq_f64(ra_x, pb_x);
        float64x2_t delta_y = vsubq_f64(ra_y, pb_y);
        float64x2_t delta_z = vsubq_f64(ra_z, pb_z);

        nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_x);
        nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_y);
        nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_z);
        j += 2;
    }

    // Combine accumulators and reduce
    float64x2_t sum_squared_f64x2 = vaddq_f64(sum_squared_a_f64x2, sum_squared_b_f64x2);
    float64x2_t sum_squared_compensation_f64x2 = vaddq_f64(sum_squared_a_compensation_f64x2,
                                                           sum_squared_b_compensation_f64x2);
    nk_f64_t sum_squared = nk_dot_stable_sum_f64x2_neon_(sum_squared_f64x2, sum_squared_compensation_f64x2);
    nk_f64_t sum_squared_compensation = 0.0;

    // Scalar tail (at most one point when n is odd)
    for (; j < n; ++j) {
        nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x;
        nk_f64_t pa_y = a[j * 3 + 1] - centroid_a_y;
        nk_f64_t pa_z = a[j * 3 + 2] - centroid_a_z;
        nk_f64_t pb_x = b[j * 3 + 0] - centroid_b_x;
        nk_f64_t pb_y = b[j * 3 + 1] - centroid_b_y;
        nk_f64_t pb_z = b[j * 3 + 2] - centroid_b_z;

        nk_f64_t ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z);
        nk_f64_t ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z);
        nk_f64_t ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);

        nk_f64_t delta_x = ra_x - pb_x;
        nk_f64_t delta_y = ra_y - pb_y;
        nk_f64_t delta_z = ra_z - pb_z;
        nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_x);
        nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_y);
        nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_z);
    }

    return sum_squared + sum_squared_compensation;
}
612
+
613
/* Plain RMSD between f32 point clouds `a` and `b` (interleaved xyz triplets) with the
 * net translation between centroids removed: result = sqrt(mean |da - db|^2 - |mean da -
 * mean db|^2). Fills the optional `rotation` with identity and `scale` with 1.0 (RMSD
 * applies no transform); optional `a_centroid`/`b_centroid` receive the cloud means.
 * Accumulation is widened to f64 with lower/upper f64x2 lanes per accumulator.
 * NOTE(review): divides by `n` — assumes n > 0; confirm callers guarantee this. */
NK_PUBLIC void nk_rmsd_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
                                nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
    // RMSD uses identity rotation and scale=1.0 (matching nk_rmsd_f64_neon below).
    if (rotation) {
        rotation[0] = 1, rotation[1] = 0, rotation[2] = 0;
        rotation[3] = 0, rotation[4] = 1, rotation[5] = 0;
        rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
    }
    if (scale) *scale = 1.0f;

    float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
    // Coordinate sums (for the centroids) and squared-difference sums, per axis,
    // each split into lower/upper lanes to overlap the FMA chains.
    float64x2_t sum_a_x_lower_f64x2 = zero_f64x2, sum_a_x_upper_f64x2 = zero_f64x2;
    float64x2_t sum_a_y_lower_f64x2 = zero_f64x2, sum_a_y_upper_f64x2 = zero_f64x2;
    float64x2_t sum_a_z_lower_f64x2 = zero_f64x2, sum_a_z_upper_f64x2 = zero_f64x2;
    float64x2_t sum_b_x_lower_f64x2 = zero_f64x2, sum_b_x_upper_f64x2 = zero_f64x2;
    float64x2_t sum_b_y_lower_f64x2 = zero_f64x2, sum_b_y_upper_f64x2 = zero_f64x2;
    float64x2_t sum_b_z_lower_f64x2 = zero_f64x2, sum_b_z_upper_f64x2 = zero_f64x2;
    float64x2_t sum_squared_x_lower_f64x2 = zero_f64x2, sum_squared_x_upper_f64x2 = zero_f64x2;
    float64x2_t sum_squared_y_lower_f64x2 = zero_f64x2, sum_squared_y_upper_f64x2 = zero_f64x2;
    float64x2_t sum_squared_z_lower_f64x2 = zero_f64x2, sum_squared_z_upper_f64x2 = zero_f64x2;
    nk_size_t index = 0;

    // Main loop: 4 interleaved points per iteration.
    for (; index + 4 <= n; index += 4) {
        float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
        nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
            nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);

        // Widen each f32x4 half to f64x2.
        float64x2_t a_x_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_x_f32x4));
        float64x2_t a_x_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4);
        float64x2_t a_y_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_y_f32x4));
        float64x2_t a_y_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4);
        float64x2_t a_z_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_z_f32x4));
        float64x2_t a_z_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4);
        float64x2_t b_x_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_x_f32x4));
        float64x2_t b_x_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4);
        float64x2_t b_y_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_y_f32x4));
        float64x2_t b_y_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4);
        float64x2_t b_z_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_z_f32x4));
        float64x2_t b_z_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4);

        // Coordinate sums for the centroids.
        sum_a_x_lower_f64x2 = vaddq_f64(sum_a_x_lower_f64x2, a_x_lower_f64x2),
        sum_a_x_upper_f64x2 = vaddq_f64(sum_a_x_upper_f64x2, a_x_upper_f64x2);
        sum_a_y_lower_f64x2 = vaddq_f64(sum_a_y_lower_f64x2, a_y_lower_f64x2),
        sum_a_y_upper_f64x2 = vaddq_f64(sum_a_y_upper_f64x2, a_y_upper_f64x2);
        sum_a_z_lower_f64x2 = vaddq_f64(sum_a_z_lower_f64x2, a_z_lower_f64x2),
        sum_a_z_upper_f64x2 = vaddq_f64(sum_a_z_upper_f64x2, a_z_upper_f64x2);
        sum_b_x_lower_f64x2 = vaddq_f64(sum_b_x_lower_f64x2, b_x_lower_f64x2),
        sum_b_x_upper_f64x2 = vaddq_f64(sum_b_x_upper_f64x2, b_x_upper_f64x2);
        sum_b_y_lower_f64x2 = vaddq_f64(sum_b_y_lower_f64x2, b_y_lower_f64x2),
        sum_b_y_upper_f64x2 = vaddq_f64(sum_b_y_upper_f64x2, b_y_upper_f64x2);
        sum_b_z_lower_f64x2 = vaddq_f64(sum_b_z_lower_f64x2, b_z_lower_f64x2),
        sum_b_z_upper_f64x2 = vaddq_f64(sum_b_z_upper_f64x2, b_z_upper_f64x2);

        // Per-axis squared differences between corresponding points.
        float64x2_t delta_x_lower_f64x2 = vsubq_f64(a_x_lower_f64x2, b_x_lower_f64x2);
        float64x2_t delta_x_upper_f64x2 = vsubq_f64(a_x_upper_f64x2, b_x_upper_f64x2);
        float64x2_t delta_y_lower_f64x2 = vsubq_f64(a_y_lower_f64x2, b_y_lower_f64x2);
        float64x2_t delta_y_upper_f64x2 = vsubq_f64(a_y_upper_f64x2, b_y_upper_f64x2);
        float64x2_t delta_z_lower_f64x2 = vsubq_f64(a_z_lower_f64x2, b_z_lower_f64x2);
        float64x2_t delta_z_upper_f64x2 = vsubq_f64(a_z_upper_f64x2, b_z_upper_f64x2);

        sum_squared_x_lower_f64x2 = vfmaq_f64(sum_squared_x_lower_f64x2, delta_x_lower_f64x2, delta_x_lower_f64x2),
        sum_squared_x_upper_f64x2 = vfmaq_f64(sum_squared_x_upper_f64x2, delta_x_upper_f64x2, delta_x_upper_f64x2);
        sum_squared_y_lower_f64x2 = vfmaq_f64(sum_squared_y_lower_f64x2, delta_y_lower_f64x2, delta_y_lower_f64x2),
        sum_squared_y_upper_f64x2 = vfmaq_f64(sum_squared_y_upper_f64x2, delta_y_upper_f64x2, delta_y_upper_f64x2);
        sum_squared_z_lower_f64x2 = vfmaq_f64(sum_squared_z_lower_f64x2, delta_z_lower_f64x2, delta_z_lower_f64x2),
        sum_squared_z_upper_f64x2 = vfmaq_f64(sum_squared_z_upper_f64x2, delta_z_upper_f64x2, delta_z_upper_f64x2);
    }

    // Horizontal reductions.
    nk_f64_t sum_a_x = vaddvq_f64(vaddq_f64(sum_a_x_lower_f64x2, sum_a_x_upper_f64x2));
    nk_f64_t sum_a_y = vaddvq_f64(vaddq_f64(sum_a_y_lower_f64x2, sum_a_y_upper_f64x2));
    nk_f64_t sum_a_z = vaddvq_f64(vaddq_f64(sum_a_z_lower_f64x2, sum_a_z_upper_f64x2));
    nk_f64_t sum_b_x = vaddvq_f64(vaddq_f64(sum_b_x_lower_f64x2, sum_b_x_upper_f64x2));
    nk_f64_t sum_b_y = vaddvq_f64(vaddq_f64(sum_b_y_lower_f64x2, sum_b_y_upper_f64x2));
    nk_f64_t sum_b_z = vaddvq_f64(vaddq_f64(sum_b_z_lower_f64x2, sum_b_z_upper_f64x2));
    nk_f64_t sum_squared_x = vaddvq_f64(vaddq_f64(sum_squared_x_lower_f64x2, sum_squared_x_upper_f64x2));
    nk_f64_t sum_squared_y = vaddvq_f64(vaddq_f64(sum_squared_y_lower_f64x2, sum_squared_y_upper_f64x2));
    nk_f64_t sum_squared_z = vaddvq_f64(vaddq_f64(sum_squared_z_lower_f64x2, sum_squared_z_upper_f64x2));

    // Scalar tail for the final n % 4 points.
    for (; index < n; ++index) {
        nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
        nk_f64_t b_x = b[index * 3 + 0], b_y = b[index * 3 + 1], b_z = b[index * 3 + 2];
        sum_a_x += a_x, sum_a_y += a_y, sum_a_z += a_z;
        sum_b_x += b_x, sum_b_y += b_y, sum_b_z += b_z;
        nk_f64_t delta_x = a_x - b_x, delta_y = a_y - b_y, delta_z = a_z - b_z;
        sum_squared_x += delta_x * delta_x, sum_squared_y += delta_y * delta_y, sum_squared_z += delta_z * delta_z;
    }

    nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
    nk_f64_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
    nk_f64_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
    if (a_centroid)
        a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
        a_centroid[2] = (nk_f32_t)centroid_a_z;
    if (b_centroid)
        b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
        b_centroid[2] = (nk_f32_t)centroid_b_z;

    // mean |da - db|^2 - |mean da - mean db|^2: the centroid-difference term removes
    // the net translation between the clouds before taking the root.
    nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x, mean_diff_y = centroid_a_y - centroid_b_y,
             mean_diff_z = centroid_a_z - centroid_b_z;
    nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
    nk_f64_t mean_squared = (sum_squared_x + sum_squared_y + sum_squared_z) * inv_n - mean_diff_sq;
    // Rounding can drive this difference slightly negative when `b` is (nearly) `a`
    // shifted by a constant offset; clamp to zero so the square root never yields NaN.
    if (mean_squared < 0.0) mean_squared = 0.0;
    *result = nk_f64_sqrt_neon(mean_squared);
}
714
+
715
/**
 *  Root-mean-square deviation between two interleaved xyz point sets `a` and `b`
 *  of `n` points each, after removing the translation between their centroids.
 *
 *  Outputs (each optional except `result`):
 *  - `a_centroid`, `b_centroid`: per-set centroids (3 doubles each).
 *  - `rotation`: always the 3x3 identity — RMSD applies no rotation.
 *  - `scale`: always 1.0 — RMSD applies no scaling.
 *  - `result`: the RMSD value.
 *
 *  Strategy: a NEON loop over 2 points at a time accumulates coordinate sums and
 *  squared deltas; the scalar tail uses compensated (Neumaier-style) summation via
 *  `nk_accumulate_sum_f64_` / `nk_accumulate_square_f64_` so the remainder does not
 *  lose precision relative to the vector body.
 */
NK_PUBLIC void nk_rmsd_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
                                nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
    // RMSD uses identity rotation and scale=1.0.
    if (rotation) {
        rotation[0] = 1, rotation[1] = 0, rotation[2] = 0;
        rotation[3] = 0, rotation[4] = 1, rotation[5] = 0;
        rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
    }
    if (scale) *scale = 1.0;

    float64x2_t const zeros_f64x2 = vdupq_n_f64(0);

    // Accumulators for centroids and squared differences
    float64x2_t sum_a_x_f64x2 = zeros_f64x2, sum_a_y_f64x2 = zeros_f64x2, sum_a_z_f64x2 = zeros_f64x2;
    float64x2_t sum_b_x_f64x2 = zeros_f64x2, sum_b_y_f64x2 = zeros_f64x2, sum_b_z_f64x2 = zeros_f64x2;
    float64x2_t sum_squared_x_f64x2 = zeros_f64x2, sum_squared_y_f64x2 = zeros_f64x2, sum_squared_z_f64x2 = zeros_f64x2;

    float64x2_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
    nk_size_t i = 0;

    // Main loop processing 2 points at a time; the deinterleave helper splits the
    // xyzxyz... layout into separate x/y/z lanes.
    for (; i + 2 <= n; i += 2) {
        nk_deinterleave_f64x2_neon_(a + i * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
        nk_deinterleave_f64x2_neon_(b + i * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);

        sum_a_x_f64x2 = vaddq_f64(sum_a_x_f64x2, a_x_f64x2);
        sum_a_y_f64x2 = vaddq_f64(sum_a_y_f64x2, a_y_f64x2);
        sum_a_z_f64x2 = vaddq_f64(sum_a_z_f64x2, a_z_f64x2);
        sum_b_x_f64x2 = vaddq_f64(sum_b_x_f64x2, b_x_f64x2);
        sum_b_y_f64x2 = vaddq_f64(sum_b_y_f64x2, b_y_f64x2);
        sum_b_z_f64x2 = vaddq_f64(sum_b_z_f64x2, b_z_f64x2);

        float64x2_t delta_x_f64x2 = vsubq_f64(a_x_f64x2, b_x_f64x2);
        float64x2_t delta_y_f64x2 = vsubq_f64(a_y_f64x2, b_y_f64x2);
        float64x2_t delta_z_f64x2 = vsubq_f64(a_z_f64x2, b_z_f64x2);

        sum_squared_x_f64x2 = vfmaq_f64(sum_squared_x_f64x2, delta_x_f64x2, delta_x_f64x2);
        sum_squared_y_f64x2 = vfmaq_f64(sum_squared_y_f64x2, delta_y_f64x2, delta_y_f64x2);
        sum_squared_z_f64x2 = vfmaq_f64(sum_squared_z_f64x2, delta_z_f64x2, delta_z_f64x2);
    }

    // Reduce vectors to scalars; each scalar gets a compensation term so the
    // scalar tail below can continue with Neumaier-compensated accumulation.
    nk_f64_t total_ax = nk_reduce_stable_f64x2_neon_(sum_a_x_f64x2), total_ax_compensation = 0.0;
    nk_f64_t total_ay = nk_reduce_stable_f64x2_neon_(sum_a_y_f64x2), total_ay_compensation = 0.0;
    nk_f64_t total_az = nk_reduce_stable_f64x2_neon_(sum_a_z_f64x2), total_az_compensation = 0.0;
    nk_f64_t total_bx = nk_reduce_stable_f64x2_neon_(sum_b_x_f64x2), total_bx_compensation = 0.0;
    nk_f64_t total_by = nk_reduce_stable_f64x2_neon_(sum_b_y_f64x2), total_by_compensation = 0.0;
    nk_f64_t total_bz = nk_reduce_stable_f64x2_neon_(sum_b_z_f64x2), total_bz_compensation = 0.0;
    nk_f64_t total_squared_x = nk_reduce_stable_f64x2_neon_(sum_squared_x_f64x2), total_squared_x_compensation = 0.0;
    nk_f64_t total_squared_y = nk_reduce_stable_f64x2_neon_(sum_squared_y_f64x2), total_squared_y_compensation = 0.0;
    nk_f64_t total_squared_z = nk_reduce_stable_f64x2_neon_(sum_squared_z_f64x2), total_squared_z_compensation = 0.0;

    // Scalar tail: at most one leftover point when n is odd.
    for (; i < n; ++i) {
        nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
        nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
        nk_accumulate_sum_f64_(&total_ax, &total_ax_compensation, ax);
        nk_accumulate_sum_f64_(&total_ay, &total_ay_compensation, ay);
        nk_accumulate_sum_f64_(&total_az, &total_az_compensation, az);
        nk_accumulate_sum_f64_(&total_bx, &total_bx_compensation, bx);
        nk_accumulate_sum_f64_(&total_by, &total_by_compensation, by);
        nk_accumulate_sum_f64_(&total_bz, &total_bz_compensation, bz);
        nk_f64_t delta_x = ax - bx, delta_y = ay - by, delta_z = az - bz;
        nk_accumulate_square_f64_(&total_squared_x, &total_squared_x_compensation, delta_x);
        nk_accumulate_square_f64_(&total_squared_y, &total_squared_y_compensation, delta_y);
        nk_accumulate_square_f64_(&total_squared_z, &total_squared_z_compensation, delta_z);
    }

    // Fold the compensation terms back into the running totals.
    total_ax += total_ax_compensation, total_ay += total_ay_compensation, total_az += total_az_compensation;
    total_bx += total_bx_compensation, total_by += total_by_compensation, total_bz += total_bz_compensation;
    total_squared_x += total_squared_x_compensation, total_squared_y += total_squared_y_compensation,
        total_squared_z += total_squared_z_compensation;

    // Compute centroids. NOTE(review): n == 0 yields inv_n = inf and NaN outputs;
    // presumably callers guarantee n > 0 — confirm against the dispatch layer.
    nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
    nk_f64_t centroid_a_x = total_ax * inv_n, centroid_a_y = total_ay * inv_n, centroid_a_z = total_az * inv_n;
    nk_f64_t centroid_b_x = total_bx * inv_n, centroid_b_y = total_by * inv_n, centroid_b_z = total_bz * inv_n;
    if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
    if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;

    // Compute RMSD with translation removed, using the identity
    // sum |d_i - mean(d)|^2 / n = sum |d_i|^2 / n - |mean(d)|^2,
    // where d_i = a_i - b_i and mean(d) = centroid_a - centroid_b.
    nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
    nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
    nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
    nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
    nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;

    // NOTE(review): rounding can make the sqrt argument slightly negative when the
    // sets differ only by translation; that would produce NaN — consider clamping.
    *result = nk_f64_sqrt_neon(sum_squared * inv_n - mean_diff_sq);
}
804
+
805
+ NK_PUBLIC void nk_kabsch_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
806
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
807
+ nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z, h[9];
808
+ nk_bicentroid_f32_neon_(a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y,
809
+ &centroid_b_z);
810
+ nk_cross_covariance_f32_neon_(a, b, n, centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y,
811
+ centroid_b_z, h);
812
+ if (a_centroid)
813
+ a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
814
+ a_centroid[2] = (nk_f32_t)centroid_a_z;
815
+ if (b_centroid)
816
+ b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
817
+ b_centroid[2] = (nk_f32_t)centroid_b_z;
818
+
819
+ nk_f64_t svd_u[9], svd_s[9], svd_v[9];
820
+ nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
821
+
822
+ nk_f64_t r[9];
823
+ r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
824
+ r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
825
+ r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
826
+ r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
827
+ r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
828
+ r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
829
+ r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
830
+ r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
831
+ r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
832
+
833
+ if (nk_det3x3_f64_(r) < 0) {
834
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
835
+ r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
836
+ r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
837
+ r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
838
+ r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
839
+ r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
840
+ r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
841
+ r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
842
+ r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
843
+ r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
844
+ }
845
+
846
+ if (rotation)
847
+ for (int index = 0; index != 9; ++index) rotation[index] = (nk_f32_t)r[index];
848
+ if (scale) *scale = 1.0f;
849
+ *result = nk_f64_sqrt_neon(nk_transformed_ssd_f32_neon_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
850
+ centroid_b_x, centroid_b_y, centroid_b_z) /
851
+ (nk_f64_t)n);
852
+ }
853
+
854
/**
 *  Kabsch alignment for f64 points: finds the rotation R (scale fixed at 1)
 *  minimizing the RMSD between interleaved xyz point sets `a` and `b` of `n`
 *  points, reporting centroids, the rotation, and the post-alignment RMSD.
 *
 *  Accumulation happens in a single fused pass: coordinate sums (for centroids)
 *  and the raw 3x3 product sums Σ aᵢ·bᵢᵀ are gathered together, then the
 *  centering correction H = Σ aᵢbᵢᵀ − n·ā·b̄ᵀ is applied afterwards. The vector
 *  body is 2x unrolled (4 points/iteration) with independent accumulator pairs
 *  to hide FMA latency; the scalar tail uses compensated summation helpers.
 */
NK_PUBLIC void nk_kabsch_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
                                  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
    float64x2_t const zeros_f64x2 = vdupq_n_f64(0);

    // 2x unrolling with dual accumulators to hide FMA latency.
    float64x2_t sum_a_x_a_f64x2 = zeros_f64x2, sum_a_y_a_f64x2 = zeros_f64x2, sum_a_z_a_f64x2 = zeros_f64x2;
    float64x2_t sum_b_x_a_f64x2 = zeros_f64x2, sum_b_y_a_f64x2 = zeros_f64x2, sum_b_z_a_f64x2 = zeros_f64x2;
    float64x2_t sum_a_x_b_f64x2 = zeros_f64x2, sum_a_y_b_f64x2 = zeros_f64x2, sum_a_z_b_f64x2 = zeros_f64x2;
    float64x2_t sum_b_x_b_f64x2 = zeros_f64x2, sum_b_y_b_f64x2 = zeros_f64x2, sum_b_z_b_f64x2 = zeros_f64x2;

    // Nine (row, column) accumulators for the raw cross-covariance, duplicated
    // for the two unrolled streams ("_a" and "_b" suffixes).
    float64x2_t cov_xx_a_f64x2 = zeros_f64x2, cov_xy_a_f64x2 = zeros_f64x2, cov_xz_a_f64x2 = zeros_f64x2;
    float64x2_t cov_yx_a_f64x2 = zeros_f64x2, cov_yy_a_f64x2 = zeros_f64x2, cov_yz_a_f64x2 = zeros_f64x2;
    float64x2_t cov_zx_a_f64x2 = zeros_f64x2, cov_zy_a_f64x2 = zeros_f64x2, cov_zz_a_f64x2 = zeros_f64x2;
    float64x2_t cov_xx_b_f64x2 = zeros_f64x2, cov_xy_b_f64x2 = zeros_f64x2, cov_xz_b_f64x2 = zeros_f64x2;
    float64x2_t cov_yx_b_f64x2 = zeros_f64x2, cov_yy_b_f64x2 = zeros_f64x2, cov_yz_b_f64x2 = zeros_f64x2;
    float64x2_t cov_zx_b_f64x2 = zeros_f64x2, cov_zy_b_f64x2 = zeros_f64x2, cov_zz_b_f64x2 = zeros_f64x2;

    nk_size_t i = 0;
    float64x2_t a1_x_f64x2, a1_y_f64x2, a1_z_f64x2, b1_x_f64x2, b1_y_f64x2, b1_z_f64x2;
    float64x2_t a2_x_f64x2, a2_y_f64x2, a2_z_f64x2, b2_x_f64x2, b2_y_f64x2, b2_z_f64x2;

    // Main loop: 4 points per iteration (2x unrolled)
    for (; i + 4 <= n; i += 4) {
        nk_deinterleave_f64x2_neon_(a + i * 3, &a1_x_f64x2, &a1_y_f64x2, &a1_z_f64x2);
        nk_deinterleave_f64x2_neon_(b + i * 3, &b1_x_f64x2, &b1_y_f64x2, &b1_z_f64x2);
        nk_deinterleave_f64x2_neon_(a + (i + 2) * 3, &a2_x_f64x2, &a2_y_f64x2, &a2_z_f64x2);
        nk_deinterleave_f64x2_neon_(b + (i + 2) * 3, &b2_x_f64x2, &b2_y_f64x2, &b2_z_f64x2);

        // Interleaved accumulation: alternating the two streams keeps independent
        // dependency chains in flight.
        sum_a_x_a_f64x2 = vaddq_f64(sum_a_x_a_f64x2, a1_x_f64x2);
        sum_a_x_b_f64x2 = vaddq_f64(sum_a_x_b_f64x2, a2_x_f64x2);
        sum_a_y_a_f64x2 = vaddq_f64(sum_a_y_a_f64x2, a1_y_f64x2);
        sum_a_y_b_f64x2 = vaddq_f64(sum_a_y_b_f64x2, a2_y_f64x2);
        sum_a_z_a_f64x2 = vaddq_f64(sum_a_z_a_f64x2, a1_z_f64x2);
        sum_a_z_b_f64x2 = vaddq_f64(sum_a_z_b_f64x2, a2_z_f64x2);
        sum_b_x_a_f64x2 = vaddq_f64(sum_b_x_a_f64x2, b1_x_f64x2);
        sum_b_x_b_f64x2 = vaddq_f64(sum_b_x_b_f64x2, b2_x_f64x2);
        sum_b_y_a_f64x2 = vaddq_f64(sum_b_y_a_f64x2, b1_y_f64x2);
        sum_b_y_b_f64x2 = vaddq_f64(sum_b_y_b_f64x2, b2_y_f64x2);
        sum_b_z_a_f64x2 = vaddq_f64(sum_b_z_a_f64x2, b1_z_f64x2);
        sum_b_z_b_f64x2 = vaddq_f64(sum_b_z_b_f64x2, b2_z_f64x2);

        cov_xx_a_f64x2 = vfmaq_f64(cov_xx_a_f64x2, a1_x_f64x2, b1_x_f64x2);
        cov_xx_b_f64x2 = vfmaq_f64(cov_xx_b_f64x2, a2_x_f64x2, b2_x_f64x2);
        cov_xy_a_f64x2 = vfmaq_f64(cov_xy_a_f64x2, a1_x_f64x2, b1_y_f64x2);
        cov_xy_b_f64x2 = vfmaq_f64(cov_xy_b_f64x2, a2_x_f64x2, b2_y_f64x2);
        cov_xz_a_f64x2 = vfmaq_f64(cov_xz_a_f64x2, a1_x_f64x2, b1_z_f64x2);
        cov_xz_b_f64x2 = vfmaq_f64(cov_xz_b_f64x2, a2_x_f64x2, b2_z_f64x2);
        cov_yx_a_f64x2 = vfmaq_f64(cov_yx_a_f64x2, a1_y_f64x2, b1_x_f64x2);
        cov_yx_b_f64x2 = vfmaq_f64(cov_yx_b_f64x2, a2_y_f64x2, b2_x_f64x2);
        cov_yy_a_f64x2 = vfmaq_f64(cov_yy_a_f64x2, a1_y_f64x2, b1_y_f64x2);
        cov_yy_b_f64x2 = vfmaq_f64(cov_yy_b_f64x2, a2_y_f64x2, b2_y_f64x2);
        cov_yz_a_f64x2 = vfmaq_f64(cov_yz_a_f64x2, a1_y_f64x2, b1_z_f64x2);
        cov_yz_b_f64x2 = vfmaq_f64(cov_yz_b_f64x2, a2_y_f64x2, b2_z_f64x2);
        cov_zx_a_f64x2 = vfmaq_f64(cov_zx_a_f64x2, a1_z_f64x2, b1_x_f64x2);
        cov_zx_b_f64x2 = vfmaq_f64(cov_zx_b_f64x2, a2_z_f64x2, b2_x_f64x2);
        cov_zy_a_f64x2 = vfmaq_f64(cov_zy_a_f64x2, a1_z_f64x2, b1_y_f64x2);
        cov_zy_b_f64x2 = vfmaq_f64(cov_zy_b_f64x2, a2_z_f64x2, b2_y_f64x2);
        cov_zz_a_f64x2 = vfmaq_f64(cov_zz_a_f64x2, a1_z_f64x2, b1_z_f64x2);
        cov_zz_b_f64x2 = vfmaq_f64(cov_zz_b_f64x2, a2_z_f64x2, b2_z_f64x2);
    }

    // 2-point tail: same work as the main loop but one stream only.
    for (; i + 2 <= n; i += 2) {
        nk_deinterleave_f64x2_neon_(a + i * 3, &a1_x_f64x2, &a1_y_f64x2, &a1_z_f64x2);
        nk_deinterleave_f64x2_neon_(b + i * 3, &b1_x_f64x2, &b1_y_f64x2, &b1_z_f64x2);
        sum_a_x_a_f64x2 = vaddq_f64(sum_a_x_a_f64x2, a1_x_f64x2);
        sum_a_y_a_f64x2 = vaddq_f64(sum_a_y_a_f64x2, a1_y_f64x2);
        sum_a_z_a_f64x2 = vaddq_f64(sum_a_z_a_f64x2, a1_z_f64x2);
        sum_b_x_a_f64x2 = vaddq_f64(sum_b_x_a_f64x2, b1_x_f64x2);
        sum_b_y_a_f64x2 = vaddq_f64(sum_b_y_a_f64x2, b1_y_f64x2);
        sum_b_z_a_f64x2 = vaddq_f64(sum_b_z_a_f64x2, b1_z_f64x2);
        cov_xx_a_f64x2 = vfmaq_f64(cov_xx_a_f64x2, a1_x_f64x2, b1_x_f64x2);
        cov_xy_a_f64x2 = vfmaq_f64(cov_xy_a_f64x2, a1_x_f64x2, b1_y_f64x2);
        cov_xz_a_f64x2 = vfmaq_f64(cov_xz_a_f64x2, a1_x_f64x2, b1_z_f64x2);
        cov_yx_a_f64x2 = vfmaq_f64(cov_yx_a_f64x2, a1_y_f64x2, b1_x_f64x2);
        cov_yy_a_f64x2 = vfmaq_f64(cov_yy_a_f64x2, a1_y_f64x2, b1_y_f64x2);
        cov_yz_a_f64x2 = vfmaq_f64(cov_yz_a_f64x2, a1_y_f64x2, b1_z_f64x2);
        cov_zx_a_f64x2 = vfmaq_f64(cov_zx_a_f64x2, a1_z_f64x2, b1_x_f64x2);
        cov_zy_a_f64x2 = vfmaq_f64(cov_zy_a_f64x2, a1_z_f64x2, b1_y_f64x2);
        cov_zz_a_f64x2 = vfmaq_f64(cov_zz_a_f64x2, a1_z_f64x2, b1_z_f64x2);
    }

    // Combine dual accumulators
    float64x2_t sum_a_x_f64x2 = vaddq_f64(sum_a_x_a_f64x2, sum_a_x_b_f64x2);
    float64x2_t sum_a_y_f64x2 = vaddq_f64(sum_a_y_a_f64x2, sum_a_y_b_f64x2);
    float64x2_t sum_a_z_f64x2 = vaddq_f64(sum_a_z_a_f64x2, sum_a_z_b_f64x2);
    float64x2_t sum_b_x_f64x2 = vaddq_f64(sum_b_x_a_f64x2, sum_b_x_b_f64x2);
    float64x2_t sum_b_y_f64x2 = vaddq_f64(sum_b_y_a_f64x2, sum_b_y_b_f64x2);
    float64x2_t sum_b_z_f64x2 = vaddq_f64(sum_b_z_a_f64x2, sum_b_z_b_f64x2);
    float64x2_t cov_xx_f64x2 = vaddq_f64(cov_xx_a_f64x2, cov_xx_b_f64x2);
    float64x2_t cov_xy_f64x2 = vaddq_f64(cov_xy_a_f64x2, cov_xy_b_f64x2);
    float64x2_t cov_xz_f64x2 = vaddq_f64(cov_xz_a_f64x2, cov_xz_b_f64x2);
    float64x2_t cov_yx_f64x2 = vaddq_f64(cov_yx_a_f64x2, cov_yx_b_f64x2);
    float64x2_t cov_yy_f64x2 = vaddq_f64(cov_yy_a_f64x2, cov_yy_b_f64x2);
    float64x2_t cov_yz_f64x2 = vaddq_f64(cov_yz_a_f64x2, cov_yz_b_f64x2);
    float64x2_t cov_zx_f64x2 = vaddq_f64(cov_zx_a_f64x2, cov_zx_b_f64x2);
    float64x2_t cov_zy_f64x2 = vaddq_f64(cov_zy_a_f64x2, cov_zy_b_f64x2);
    float64x2_t cov_zz_f64x2 = vaddq_f64(cov_zz_a_f64x2, cov_zz_b_f64x2);

    // Reduce vector accumulators; each scalar carries a compensation term so the
    // tail loop below can accumulate with Neumaier-style precision.
    nk_f64_t sum_a_x = nk_reduce_stable_f64x2_neon_(sum_a_x_f64x2), sum_a_x_compensation = 0.0;
    nk_f64_t sum_a_y = nk_reduce_stable_f64x2_neon_(sum_a_y_f64x2), sum_a_y_compensation = 0.0;
    nk_f64_t sum_a_z = nk_reduce_stable_f64x2_neon_(sum_a_z_f64x2), sum_a_z_compensation = 0.0;
    nk_f64_t sum_b_x = nk_reduce_stable_f64x2_neon_(sum_b_x_f64x2), sum_b_x_compensation = 0.0;
    nk_f64_t sum_b_y = nk_reduce_stable_f64x2_neon_(sum_b_y_f64x2), sum_b_y_compensation = 0.0;
    nk_f64_t sum_b_z = nk_reduce_stable_f64x2_neon_(sum_b_z_f64x2), sum_b_z_compensation = 0.0;

    nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_neon_(cov_xx_f64x2), covariance_x_x_compensation = 0.0;
    nk_f64_t covariance_x_y = nk_reduce_stable_f64x2_neon_(cov_xy_f64x2), covariance_x_y_compensation = 0.0;
    nk_f64_t covariance_x_z = nk_reduce_stable_f64x2_neon_(cov_xz_f64x2), covariance_x_z_compensation = 0.0;
    nk_f64_t covariance_y_x = nk_reduce_stable_f64x2_neon_(cov_yx_f64x2), covariance_y_x_compensation = 0.0;
    nk_f64_t covariance_y_y = nk_reduce_stable_f64x2_neon_(cov_yy_f64x2), covariance_y_y_compensation = 0.0;
    nk_f64_t covariance_y_z = nk_reduce_stable_f64x2_neon_(cov_yz_f64x2), covariance_y_z_compensation = 0.0;
    nk_f64_t covariance_z_x = nk_reduce_stable_f64x2_neon_(cov_zx_f64x2), covariance_z_x_compensation = 0.0;
    nk_f64_t covariance_z_y = nk_reduce_stable_f64x2_neon_(cov_zy_f64x2), covariance_z_y_compensation = 0.0;
    nk_f64_t covariance_z_z = nk_reduce_stable_f64x2_neon_(cov_zz_f64x2), covariance_z_z_compensation = 0.0;

    // Scalar tail: at most one leftover point.
    for (; i < n; ++i) {
        nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
        nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
        nk_accumulate_sum_f64_(&sum_a_x, &sum_a_x_compensation, ax),
            nk_accumulate_sum_f64_(&sum_a_y, &sum_a_y_compensation, ay),
            nk_accumulate_sum_f64_(&sum_a_z, &sum_a_z_compensation, az);
        nk_accumulate_sum_f64_(&sum_b_x, &sum_b_x_compensation, bx),
            nk_accumulate_sum_f64_(&sum_b_y, &sum_b_y_compensation, by),
            nk_accumulate_sum_f64_(&sum_b_z, &sum_b_z_compensation, bz);
        nk_accumulate_product_f64_(&covariance_x_x, &covariance_x_x_compensation, ax, bx),
            nk_accumulate_product_f64_(&covariance_x_y, &covariance_x_y_compensation, ax, by),
            nk_accumulate_product_f64_(&covariance_x_z, &covariance_x_z_compensation, ax, bz);
        nk_accumulate_product_f64_(&covariance_y_x, &covariance_y_x_compensation, ay, bx),
            nk_accumulate_product_f64_(&covariance_y_y, &covariance_y_y_compensation, ay, by),
            nk_accumulate_product_f64_(&covariance_y_z, &covariance_y_z_compensation, ay, bz);
        nk_accumulate_product_f64_(&covariance_z_x, &covariance_z_x_compensation, az, bx),
            nk_accumulate_product_f64_(&covariance_z_y, &covariance_z_y_compensation, az, by),
            nk_accumulate_product_f64_(&covariance_z_z, &covariance_z_z_compensation, az, bz);
    }

    // Fold the compensation terms back into the running totals.
    sum_a_x += sum_a_x_compensation, sum_a_y += sum_a_y_compensation, sum_a_z += sum_a_z_compensation;
    sum_b_x += sum_b_x_compensation, sum_b_y += sum_b_y_compensation, sum_b_z += sum_b_z_compensation;
    covariance_x_x += covariance_x_x_compensation, covariance_x_y += covariance_x_y_compensation,
        covariance_x_z += covariance_x_z_compensation;
    covariance_y_x += covariance_y_x_compensation, covariance_y_y += covariance_y_y_compensation,
        covariance_y_z += covariance_y_z_compensation;
    covariance_z_x += covariance_z_x_compensation, covariance_z_y += covariance_z_y_compensation,
        covariance_z_z += covariance_z_z_compensation;

    // Compute centroids. NOTE(review): n == 0 yields inv_n = inf and NaN outputs;
    // presumably callers guarantee n > 0 — confirm against the dispatch layer.
    nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
    nk_f64_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
    nk_f64_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
    if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
    if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;

    // Apply centering correction: H_centered = H - n * centroid_a * centroid_bᵀ
    covariance_x_x -= n * centroid_a_x * centroid_b_x;
    covariance_x_y -= n * centroid_a_x * centroid_b_y;
    covariance_x_z -= n * centroid_a_x * centroid_b_z;
    covariance_y_x -= n * centroid_a_y * centroid_b_x;
    covariance_y_y -= n * centroid_a_y * centroid_b_y;
    covariance_y_z -= n * centroid_a_y * centroid_b_z;
    covariance_z_x -= n * centroid_a_z * centroid_b_x;
    covariance_z_y -= n * centroid_a_z * centroid_b_y;
    covariance_z_z -= n * centroid_a_z * centroid_b_z;

    // Compute SVD and optimal rotation (row-major 3x3: H = U S Vᵀ).
    nk_f64_t cross_covariance[9] = {covariance_x_x, covariance_x_y, covariance_x_z, covariance_y_x, covariance_y_y,
                                    covariance_y_z, covariance_z_x, covariance_z_y, covariance_z_z};
    nk_f64_t svd_u[9], svd_s[9], svd_v[9];
    nk_svd3x3_f64_(cross_covariance, svd_u, svd_s, svd_v);

    nk_f64_t r[9];
    nk_rotation_from_svd_f64_neon_(svd_u, svd_v, r);

    // Handle reflection: if det(R) < 0, negate third column of V and recompute R
    if (nk_det3x3_f64_(r) < 0) {
        svd_v[2] = -svd_v[2];
        svd_v[5] = -svd_v[5];
        svd_v[8] = -svd_v[8];
        nk_rotation_from_svd_f64_neon_(svd_u, svd_v, r);
    }

    // Output rotation matrix and scale=1.0.
    if (rotation)
        for (int j = 0; j < 9; ++j) rotation[j] = r[j];

    if (scale) *scale = 1.0;

    // Compute RMSD after optimal rotation
    nk_f64_t sum_squared = nk_transformed_ssd_f64_neon_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
                                                        centroid_b_x, centroid_b_y, centroid_b_z);
    *result = nk_f64_sqrt_neon(sum_squared * inv_n);
}
1048
+
1049
+ NK_PUBLIC void nk_umeyama_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
1050
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
1051
+ nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z, h[9], variance_a;
1052
+ nk_bicentroid_f32_neon_(a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y,
1053
+ &centroid_b_z);
1054
+ nk_cross_covariance_and_variance_f32_neon_(a, b, n, centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x,
1055
+ centroid_b_y, centroid_b_z, h, &variance_a);
1056
+ if (a_centroid)
1057
+ a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
1058
+ a_centroid[2] = (nk_f32_t)centroid_a_z;
1059
+ if (b_centroid)
1060
+ b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
1061
+ b_centroid[2] = (nk_f32_t)centroid_b_z;
1062
+
1063
+ nk_f64_t svd_u[9], svd_s[9], svd_v[9];
1064
+ nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
1065
+
1066
+ nk_f64_t r[9];
1067
+ r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
1068
+ r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
1069
+ r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
1070
+ r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
1071
+ r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
1072
+ r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
1073
+ r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
1074
+ r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
1075
+ r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
1076
+
1077
+ nk_f64_t det = nk_det3x3_f64_(r), sign_correction = det < 0 ? -1.0 : 1.0;
1078
+ if (det < 0) {
1079
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
1080
+ r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
1081
+ r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
1082
+ r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
1083
+ r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
1084
+ r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
1085
+ r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
1086
+ r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
1087
+ r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
1088
+ r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
1089
+ }
1090
+
1091
+ nk_f64_t applied_scale = (svd_s[0] + svd_s[4] + sign_correction * svd_s[8]) / ((nk_f64_t)n * variance_a);
1092
+ if (rotation)
1093
+ for (int index = 0; index != 9; ++index) rotation[index] = (nk_f32_t)r[index];
1094
+ if (scale) *scale = (nk_f32_t)applied_scale;
1095
+ *result = nk_f64_sqrt_neon(nk_transformed_ssd_f32_neon_(a, b, n, r, applied_scale, centroid_a_x, centroid_a_y,
1096
+ centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z) /
1097
+ (nk_f64_t)n);
1098
+ }
1099
+
1100
+ NK_PUBLIC void nk_umeyama_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
1101
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
1102
+ float64x2_t const zeros_f64x2 = vdupq_n_f64(0);
1103
+
1104
+ // 2x unrolling with dual accumulators to hide FMA latency.
1105
+ float64x2_t sum_a_x_a_f64x2 = zeros_f64x2, sum_a_y_a_f64x2 = zeros_f64x2, sum_a_z_a_f64x2 = zeros_f64x2;
1106
+ float64x2_t sum_b_x_a_f64x2 = zeros_f64x2, sum_b_y_a_f64x2 = zeros_f64x2, sum_b_z_a_f64x2 = zeros_f64x2;
1107
+ float64x2_t sum_a_x_b_f64x2 = zeros_f64x2, sum_a_y_b_f64x2 = zeros_f64x2, sum_a_z_b_f64x2 = zeros_f64x2;
1108
+ float64x2_t sum_b_x_b_f64x2 = zeros_f64x2, sum_b_y_b_f64x2 = zeros_f64x2, sum_b_z_b_f64x2 = zeros_f64x2;
1109
+
1110
+ float64x2_t cov_xx_a_f64x2 = zeros_f64x2, cov_xy_a_f64x2 = zeros_f64x2, cov_xz_a_f64x2 = zeros_f64x2;
1111
+ float64x2_t cov_yx_a_f64x2 = zeros_f64x2, cov_yy_a_f64x2 = zeros_f64x2, cov_yz_a_f64x2 = zeros_f64x2;
1112
+ float64x2_t cov_zx_a_f64x2 = zeros_f64x2, cov_zy_a_f64x2 = zeros_f64x2, cov_zz_a_f64x2 = zeros_f64x2;
1113
+ float64x2_t cov_xx_b_f64x2 = zeros_f64x2, cov_xy_b_f64x2 = zeros_f64x2, cov_xz_b_f64x2 = zeros_f64x2;
1114
+ float64x2_t cov_yx_b_f64x2 = zeros_f64x2, cov_yy_b_f64x2 = zeros_f64x2, cov_yz_b_f64x2 = zeros_f64x2;
1115
+ float64x2_t cov_zx_b_f64x2 = zeros_f64x2, cov_zy_b_f64x2 = zeros_f64x2, cov_zz_b_f64x2 = zeros_f64x2;
1116
+ float64x2_t variance_a_a_f64x2 = zeros_f64x2, variance_a_b_f64x2 = zeros_f64x2;
1117
+
1118
+ nk_size_t i = 0;
1119
+ float64x2_t a1_x_f64x2, a1_y_f64x2, a1_z_f64x2, b1_x_f64x2, b1_y_f64x2, b1_z_f64x2;
1120
+ float64x2_t a2_x_f64x2, a2_y_f64x2, a2_z_f64x2, b2_x_f64x2, b2_y_f64x2, b2_z_f64x2;
1121
+
1122
+ // Main loop: 4 points per iteration (2x unrolled)
1123
+ for (; i + 4 <= n; i += 4) {
1124
+ nk_deinterleave_f64x2_neon_(a + i * 3, &a1_x_f64x2, &a1_y_f64x2, &a1_z_f64x2);
1125
+ nk_deinterleave_f64x2_neon_(b + i * 3, &b1_x_f64x2, &b1_y_f64x2, &b1_z_f64x2);
1126
+ nk_deinterleave_f64x2_neon_(a + (i + 2) * 3, &a2_x_f64x2, &a2_y_f64x2, &a2_z_f64x2);
1127
+ nk_deinterleave_f64x2_neon_(b + (i + 2) * 3, &b2_x_f64x2, &b2_y_f64x2, &b2_z_f64x2);
1128
+
1129
+ // Interleaved accumulation
1130
+ sum_a_x_a_f64x2 = vaddq_f64(sum_a_x_a_f64x2, a1_x_f64x2);
1131
+ sum_a_x_b_f64x2 = vaddq_f64(sum_a_x_b_f64x2, a2_x_f64x2);
1132
+ sum_a_y_a_f64x2 = vaddq_f64(sum_a_y_a_f64x2, a1_y_f64x2);
1133
+ sum_a_y_b_f64x2 = vaddq_f64(sum_a_y_b_f64x2, a2_y_f64x2);
1134
+ sum_a_z_a_f64x2 = vaddq_f64(sum_a_z_a_f64x2, a1_z_f64x2);
1135
+ sum_a_z_b_f64x2 = vaddq_f64(sum_a_z_b_f64x2, a2_z_f64x2);
1136
+ sum_b_x_a_f64x2 = vaddq_f64(sum_b_x_a_f64x2, b1_x_f64x2);
1137
+ sum_b_x_b_f64x2 = vaddq_f64(sum_b_x_b_f64x2, b2_x_f64x2);
1138
+ sum_b_y_a_f64x2 = vaddq_f64(sum_b_y_a_f64x2, b1_y_f64x2);
1139
+ sum_b_y_b_f64x2 = vaddq_f64(sum_b_y_b_f64x2, b2_y_f64x2);
1140
+ sum_b_z_a_f64x2 = vaddq_f64(sum_b_z_a_f64x2, b1_z_f64x2);
1141
+ sum_b_z_b_f64x2 = vaddq_f64(sum_b_z_b_f64x2, b2_z_f64x2);
1142
+
1143
+ cov_xx_a_f64x2 = vfmaq_f64(cov_xx_a_f64x2, a1_x_f64x2, b1_x_f64x2);
1144
+ cov_xx_b_f64x2 = vfmaq_f64(cov_xx_b_f64x2, a2_x_f64x2, b2_x_f64x2);
1145
+ cov_xy_a_f64x2 = vfmaq_f64(cov_xy_a_f64x2, a1_x_f64x2, b1_y_f64x2);
1146
+ cov_xy_b_f64x2 = vfmaq_f64(cov_xy_b_f64x2, a2_x_f64x2, b2_y_f64x2);
1147
+ cov_xz_a_f64x2 = vfmaq_f64(cov_xz_a_f64x2, a1_x_f64x2, b1_z_f64x2);
1148
+ cov_xz_b_f64x2 = vfmaq_f64(cov_xz_b_f64x2, a2_x_f64x2, b2_z_f64x2);
1149
+ cov_yx_a_f64x2 = vfmaq_f64(cov_yx_a_f64x2, a1_y_f64x2, b1_x_f64x2);
1150
+ cov_yx_b_f64x2 = vfmaq_f64(cov_yx_b_f64x2, a2_y_f64x2, b2_x_f64x2);
1151
+ cov_yy_a_f64x2 = vfmaq_f64(cov_yy_a_f64x2, a1_y_f64x2, b1_y_f64x2);
1152
+ cov_yy_b_f64x2 = vfmaq_f64(cov_yy_b_f64x2, a2_y_f64x2, b2_y_f64x2);
1153
+ cov_yz_a_f64x2 = vfmaq_f64(cov_yz_a_f64x2, a1_y_f64x2, b1_z_f64x2);
1154
+ cov_yz_b_f64x2 = vfmaq_f64(cov_yz_b_f64x2, a2_y_f64x2, b2_z_f64x2);
1155
+ cov_zx_a_f64x2 = vfmaq_f64(cov_zx_a_f64x2, a1_z_f64x2, b1_x_f64x2);
1156
+ cov_zx_b_f64x2 = vfmaq_f64(cov_zx_b_f64x2, a2_z_f64x2, b2_x_f64x2);
1157
+ cov_zy_a_f64x2 = vfmaq_f64(cov_zy_a_f64x2, a1_z_f64x2, b1_y_f64x2);
1158
+ cov_zy_b_f64x2 = vfmaq_f64(cov_zy_b_f64x2, a2_z_f64x2, b2_y_f64x2);
1159
+ cov_zz_a_f64x2 = vfmaq_f64(cov_zz_a_f64x2, a1_z_f64x2, b1_z_f64x2);
1160
+ cov_zz_b_f64x2 = vfmaq_f64(cov_zz_b_f64x2, a2_z_f64x2, b2_z_f64x2);
1161
+
1162
+ variance_a_a_f64x2 = vfmaq_f64(variance_a_a_f64x2, a1_x_f64x2, a1_x_f64x2);
1163
+ variance_a_b_f64x2 = vfmaq_f64(variance_a_b_f64x2, a2_x_f64x2, a2_x_f64x2);
1164
+ variance_a_a_f64x2 = vfmaq_f64(variance_a_a_f64x2, a1_y_f64x2, a1_y_f64x2);
1165
+ variance_a_b_f64x2 = vfmaq_f64(variance_a_b_f64x2, a2_y_f64x2, a2_y_f64x2);
1166
+ variance_a_a_f64x2 = vfmaq_f64(variance_a_a_f64x2, a1_z_f64x2, a1_z_f64x2);
1167
+ variance_a_b_f64x2 = vfmaq_f64(variance_a_b_f64x2, a2_z_f64x2, a2_z_f64x2);
1168
+ }
1169
+
1170
+ // 2-point tail
1171
+ for (; i + 2 <= n; i += 2) {
1172
+ nk_deinterleave_f64x2_neon_(a + i * 3, &a1_x_f64x2, &a1_y_f64x2, &a1_z_f64x2);
1173
+ nk_deinterleave_f64x2_neon_(b + i * 3, &b1_x_f64x2, &b1_y_f64x2, &b1_z_f64x2);
1174
+ sum_a_x_a_f64x2 = vaddq_f64(sum_a_x_a_f64x2, a1_x_f64x2);
1175
+ sum_a_y_a_f64x2 = vaddq_f64(sum_a_y_a_f64x2, a1_y_f64x2);
1176
+ sum_a_z_a_f64x2 = vaddq_f64(sum_a_z_a_f64x2, a1_z_f64x2);
1177
+ sum_b_x_a_f64x2 = vaddq_f64(sum_b_x_a_f64x2, b1_x_f64x2);
1178
+ sum_b_y_a_f64x2 = vaddq_f64(sum_b_y_a_f64x2, b1_y_f64x2);
1179
+ sum_b_z_a_f64x2 = vaddq_f64(sum_b_z_a_f64x2, b1_z_f64x2);
1180
+ cov_xx_a_f64x2 = vfmaq_f64(cov_xx_a_f64x2, a1_x_f64x2, b1_x_f64x2);
1181
+ cov_xy_a_f64x2 = vfmaq_f64(cov_xy_a_f64x2, a1_x_f64x2, b1_y_f64x2);
1182
+ cov_xz_a_f64x2 = vfmaq_f64(cov_xz_a_f64x2, a1_x_f64x2, b1_z_f64x2);
1183
+ cov_yx_a_f64x2 = vfmaq_f64(cov_yx_a_f64x2, a1_y_f64x2, b1_x_f64x2);
1184
+ cov_yy_a_f64x2 = vfmaq_f64(cov_yy_a_f64x2, a1_y_f64x2, b1_y_f64x2);
1185
+ cov_yz_a_f64x2 = vfmaq_f64(cov_yz_a_f64x2, a1_y_f64x2, b1_z_f64x2);
1186
+ cov_zx_a_f64x2 = vfmaq_f64(cov_zx_a_f64x2, a1_z_f64x2, b1_x_f64x2);
1187
+ cov_zy_a_f64x2 = vfmaq_f64(cov_zy_a_f64x2, a1_z_f64x2, b1_y_f64x2);
1188
+ cov_zz_a_f64x2 = vfmaq_f64(cov_zz_a_f64x2, a1_z_f64x2, b1_z_f64x2);
1189
+ variance_a_a_f64x2 = vfmaq_f64(variance_a_a_f64x2, a1_x_f64x2, a1_x_f64x2);
1190
+ variance_a_a_f64x2 = vfmaq_f64(variance_a_a_f64x2, a1_y_f64x2, a1_y_f64x2);
1191
+ variance_a_a_f64x2 = vfmaq_f64(variance_a_a_f64x2, a1_z_f64x2, a1_z_f64x2);
1192
+ }
1193
+
1194
+ // Combine dual accumulators
1195
+ float64x2_t sum_a_x_f64x2 = vaddq_f64(sum_a_x_a_f64x2, sum_a_x_b_f64x2);
1196
+ float64x2_t sum_a_y_f64x2 = vaddq_f64(sum_a_y_a_f64x2, sum_a_y_b_f64x2);
1197
+ float64x2_t sum_a_z_f64x2 = vaddq_f64(sum_a_z_a_f64x2, sum_a_z_b_f64x2);
1198
+ float64x2_t sum_b_x_f64x2 = vaddq_f64(sum_b_x_a_f64x2, sum_b_x_b_f64x2);
1199
+ float64x2_t sum_b_y_f64x2 = vaddq_f64(sum_b_y_a_f64x2, sum_b_y_b_f64x2);
1200
+ float64x2_t sum_b_z_f64x2 = vaddq_f64(sum_b_z_a_f64x2, sum_b_z_b_f64x2);
1201
+ float64x2_t cov_xx_f64x2 = vaddq_f64(cov_xx_a_f64x2, cov_xx_b_f64x2);
1202
+ float64x2_t cov_xy_f64x2 = vaddq_f64(cov_xy_a_f64x2, cov_xy_b_f64x2);
1203
+ float64x2_t cov_xz_f64x2 = vaddq_f64(cov_xz_a_f64x2, cov_xz_b_f64x2);
1204
+ float64x2_t cov_yx_f64x2 = vaddq_f64(cov_yx_a_f64x2, cov_yx_b_f64x2);
1205
+ float64x2_t cov_yy_f64x2 = vaddq_f64(cov_yy_a_f64x2, cov_yy_b_f64x2);
1206
+ float64x2_t cov_yz_f64x2 = vaddq_f64(cov_yz_a_f64x2, cov_yz_b_f64x2);
1207
+ float64x2_t cov_zx_f64x2 = vaddq_f64(cov_zx_a_f64x2, cov_zx_b_f64x2);
1208
+ float64x2_t cov_zy_f64x2 = vaddq_f64(cov_zy_a_f64x2, cov_zy_b_f64x2);
1209
+ float64x2_t cov_zz_f64x2 = vaddq_f64(cov_zz_a_f64x2, cov_zz_b_f64x2);
1210
+ float64x2_t variance_a_f64x2 = vaddq_f64(variance_a_a_f64x2, variance_a_b_f64x2);
1211
+
1212
+ // Reduce vector accumulators.
1213
+ nk_f64_t sum_a_x = nk_reduce_stable_f64x2_neon_(sum_a_x_f64x2), sum_a_x_compensation = 0.0;
1214
+ nk_f64_t sum_a_y = nk_reduce_stable_f64x2_neon_(sum_a_y_f64x2), sum_a_y_compensation = 0.0;
1215
+ nk_f64_t sum_a_z = nk_reduce_stable_f64x2_neon_(sum_a_z_f64x2), sum_a_z_compensation = 0.0;
1216
+ nk_f64_t sum_b_x = nk_reduce_stable_f64x2_neon_(sum_b_x_f64x2), sum_b_x_compensation = 0.0;
1217
+ nk_f64_t sum_b_y = nk_reduce_stable_f64x2_neon_(sum_b_y_f64x2), sum_b_y_compensation = 0.0;
1218
+ nk_f64_t sum_b_z = nk_reduce_stable_f64x2_neon_(sum_b_z_f64x2), sum_b_z_compensation = 0.0;
1219
+ nk_f64_t covariance_x_x = nk_reduce_stable_f64x2_neon_(cov_xx_f64x2), covariance_x_x_compensation = 0.0;
1220
+ nk_f64_t covariance_x_y = nk_reduce_stable_f64x2_neon_(cov_xy_f64x2), covariance_x_y_compensation = 0.0;
1221
+ nk_f64_t covariance_x_z = nk_reduce_stable_f64x2_neon_(cov_xz_f64x2), covariance_x_z_compensation = 0.0;
1222
+ nk_f64_t covariance_y_x = nk_reduce_stable_f64x2_neon_(cov_yx_f64x2), covariance_y_x_compensation = 0.0;
1223
+ nk_f64_t covariance_y_y = nk_reduce_stable_f64x2_neon_(cov_yy_f64x2), covariance_y_y_compensation = 0.0;
1224
+ nk_f64_t covariance_y_z = nk_reduce_stable_f64x2_neon_(cov_yz_f64x2), covariance_y_z_compensation = 0.0;
1225
+ nk_f64_t covariance_z_x = nk_reduce_stable_f64x2_neon_(cov_zx_f64x2), covariance_z_x_compensation = 0.0;
1226
+ nk_f64_t covariance_z_y = nk_reduce_stable_f64x2_neon_(cov_zy_f64x2), covariance_z_y_compensation = 0.0;
1227
+ nk_f64_t covariance_z_z = nk_reduce_stable_f64x2_neon_(cov_zz_f64x2), covariance_z_z_compensation = 0.0;
1228
+ nk_f64_t sum_sq_a = nk_reduce_stable_f64x2_neon_(variance_a_f64x2), sum_sq_a_compensation = 0.0;
1229
+
1230
+ // Scalar tail
1231
+ for (; i < n; ++i) {
1232
+ nk_f64_t ax = a[i * 3 + 0], ay = a[i * 3 + 1], az = a[i * 3 + 2];
1233
+ nk_f64_t bx = b[i * 3 + 0], by = b[i * 3 + 1], bz = b[i * 3 + 2];
1234
+ nk_accumulate_sum_f64_(&sum_a_x, &sum_a_x_compensation, ax),
1235
+ nk_accumulate_sum_f64_(&sum_a_y, &sum_a_y_compensation, ay),
1236
+ nk_accumulate_sum_f64_(&sum_a_z, &sum_a_z_compensation, az);
1237
+ nk_accumulate_sum_f64_(&sum_b_x, &sum_b_x_compensation, bx),
1238
+ nk_accumulate_sum_f64_(&sum_b_y, &sum_b_y_compensation, by),
1239
+ nk_accumulate_sum_f64_(&sum_b_z, &sum_b_z_compensation, bz);
1240
+ nk_accumulate_product_f64_(&covariance_x_x, &covariance_x_x_compensation, ax, bx),
1241
+ nk_accumulate_product_f64_(&covariance_x_y, &covariance_x_y_compensation, ax, by),
1242
+ nk_accumulate_product_f64_(&covariance_x_z, &covariance_x_z_compensation, ax, bz);
1243
+ nk_accumulate_product_f64_(&covariance_y_x, &covariance_y_x_compensation, ay, bx),
1244
+ nk_accumulate_product_f64_(&covariance_y_y, &covariance_y_y_compensation, ay, by),
1245
+ nk_accumulate_product_f64_(&covariance_y_z, &covariance_y_z_compensation, ay, bz);
1246
+ nk_accumulate_product_f64_(&covariance_z_x, &covariance_z_x_compensation, az, bx),
1247
+ nk_accumulate_product_f64_(&covariance_z_y, &covariance_z_y_compensation, az, by),
1248
+ nk_accumulate_product_f64_(&covariance_z_z, &covariance_z_z_compensation, az, bz);
1249
+ nk_accumulate_square_f64_(&sum_sq_a, &sum_sq_a_compensation, ax),
1250
+ nk_accumulate_square_f64_(&sum_sq_a, &sum_sq_a_compensation, ay),
1251
+ nk_accumulate_square_f64_(&sum_sq_a, &sum_sq_a_compensation, az);
1252
+ }
1253
+
1254
+ sum_a_x += sum_a_x_compensation, sum_a_y += sum_a_y_compensation, sum_a_z += sum_a_z_compensation;
1255
+ sum_b_x += sum_b_x_compensation, sum_b_y += sum_b_y_compensation, sum_b_z += sum_b_z_compensation;
1256
+ covariance_x_x += covariance_x_x_compensation, covariance_x_y += covariance_x_y_compensation,
1257
+ covariance_x_z += covariance_x_z_compensation;
1258
+ covariance_y_x += covariance_y_x_compensation, covariance_y_y += covariance_y_y_compensation,
1259
+ covariance_y_z += covariance_y_z_compensation;
1260
+ covariance_z_x += covariance_z_x_compensation, covariance_z_y += covariance_z_y_compensation,
1261
+ covariance_z_z += covariance_z_z_compensation;
1262
+ sum_sq_a += sum_sq_a_compensation;
1263
+
1264
+ // Compute centroids
1265
+ nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
1266
+ nk_f64_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
1267
+ nk_f64_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
1268
+ if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
1269
+ if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
1270
+
1271
+ // Compute variance of A (centered)
1272
+ nk_f64_t centroid_sq = centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y + centroid_a_z * centroid_a_z;
1273
+ nk_f64_t var_a = sum_sq_a * inv_n - centroid_sq;
1274
+
1275
+ // Apply centering correction: H_centered = H - n * centroid_a * centroid_bᵀ
1276
+ covariance_x_x -= n * centroid_a_x * centroid_b_x;
1277
+ covariance_x_y -= n * centroid_a_x * centroid_b_y;
1278
+ covariance_x_z -= n * centroid_a_x * centroid_b_z;
1279
+ covariance_y_x -= n * centroid_a_y * centroid_b_x;
1280
+ covariance_y_y -= n * centroid_a_y * centroid_b_y;
1281
+ covariance_y_z -= n * centroid_a_y * centroid_b_z;
1282
+ covariance_z_x -= n * centroid_a_z * centroid_b_x;
1283
+ covariance_z_y -= n * centroid_a_z * centroid_b_y;
1284
+ covariance_z_z -= n * centroid_a_z * centroid_b_z;
1285
+
1286
+ // Compute SVD
1287
+ nk_f64_t cross_covariance[9] = {covariance_x_x, covariance_x_y, covariance_x_z, covariance_y_x, covariance_y_y,
1288
+ covariance_y_z, covariance_z_x, covariance_z_y, covariance_z_z};
1289
+ nk_f64_t svd_u[9], svd_s[9], svd_v[9];
1290
+ nk_svd3x3_f64_(cross_covariance, svd_u, svd_s, svd_v);
1291
+
1292
+ nk_f64_t r[9];
1293
+ nk_rotation_from_svd_f64_neon_(svd_u, svd_v, r);
1294
+
1295
+ // Handle reflection and compute scale
1296
+ nk_f64_t det = nk_det3x3_f64_(r);
1297
+ nk_f64_t trace_d_s = svd_s[0] + svd_s[4] + (det < 0 ? -svd_s[8] : svd_s[8]);
1298
+ nk_f64_t computed_scale = trace_d_s / (n * var_a);
1299
+
1300
+ if (det < 0) {
1301
+ svd_v[2] = -svd_v[2];
1302
+ svd_v[5] = -svd_v[5];
1303
+ svd_v[8] = -svd_v[8];
1304
+ nk_rotation_from_svd_f64_neon_(svd_u, svd_v, r);
1305
+ }
1306
+
1307
+ if (rotation)
1308
+ for (int j = 0; j < 9; ++j) rotation[j] = r[j];
1309
+ if (scale) *scale = computed_scale;
1310
+
1311
+ // Compute RMSD after transformation
1312
+ nk_f64_t sum_squared = nk_transformed_ssd_f64_neon_(a, b, n, r, computed_scale, centroid_a_x, centroid_a_y,
1313
+ centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
1314
+ *result = nk_f64_sqrt_neon(sum_squared * inv_n);
1315
+ }
1316
+
1317
+ #if defined(__clang__)
1318
+ #pragma clang attribute pop
1319
+ #elif defined(__GNUC__)
1320
+ #pragma GCC pop_options
1321
+ #endif
1322
+
1323
+ #if defined(__cplusplus)
1324
+ } // extern "C"
1325
+ #endif
1326
+
1327
+ #endif // NK_TARGET_NEON
1328
+ #endif // NK_TARGET_ARM_
1329
+ #endif // NK_MESH_NEON_H