numkong 7.0.0 → 7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +197 -124
  2. package/binding.gyp +34 -484
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -10,13 +10,12 @@
  *
  * Point cloud operations use these ARM NEON instructions:
  *
- * Intrinsic Instruction Latency Throughput
- * A76 M4+/V1+/Oryon
- * vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy 2/cy 4/cy
- * vmulq_n_f32 FMUL (V.4S, V.4S, V.S[0]) 3cy 2/cy 4/cy
- * vsubq_f32 FSUB (V.4S, V.4S, V.4S) 2cy 2/cy 4/cy
- * vaddvq_f32 FADDP+FADDP (reduce) 5cy 1/cy 1/cy
- * vld3q_f32 LD3 ({Vt.4S, Vt2.4S, Vt3.4S}) 6cy 1/cy 1/cy
+ * Intrinsic Instruction A76 M5
+ * vfmaq_f32 FMLA (V.4S, V.4S, V.4S) 4cy @ 2p 3cy @ 4p
+ * vmulq_n_f32 FMUL (V.4S, V.4S, V.S[0]) 3cy @ 2p 3cy @ 4p
+ * vsubq_f32 FSUB (V.4S, V.4S, V.4S) 2cy @ 2p 2cy @ 4p
+ * vaddvq_f32 FADDP+FADDP (reduce) 5cy @ 1p 8cy @ 1p
+ * vld3q_f32 LD3 ({Vt.4S, Vt2.4S, Vt3.4S}) 4cy @ 1p 4cy @ 1p
  *
  * LD3 provides hardware stride-3 deinterleaving for XYZ point data. The 6cy latency and
  * 1/cy throughput make it the memory bottleneck regardless of core microarchitecture.
@@ -54,10 +53,10 @@ NK_INTERNAL void nk_deinterleave_f32x4_neon_(nk_f32_t const *ptr, float32x4_t *x
  //
  // Input: 12 contiguous floats [x0,y0,z0, x1,y1,z1, x2,y2,z2, x3,y3,z3]
  // Output: x[4], y[4], z[4] vectors
- float32x4x3_t xyz = vld3q_f32(ptr);
- *x_out = xyz.val[0];
- *y_out = xyz.val[1];
- *z_out = xyz.val[2];
+ float32x4x3_t xyz_f32x4x3 = vld3q_f32(ptr);
+ *x_out = xyz_f32x4x3.val[0];
+ *y_out = xyz_f32x4x3.val[1];
+ *z_out = xyz_f32x4x3.val[2];
  }

  NK_INTERNAL void nk_deinterleave_f64x2_neon_(nk_f64_t const *ptr, float64x2_t *x_out, float64x2_t *y_out,
@@ -73,10 +72,6 @@ NK_INTERNAL void nk_deinterleave_f64x2_neon_(nk_f64_t const *ptr, float64x2_t *x
  *z_out = vcombine_f64(vld1_f64(&ptr[2]), vld1_f64(&ptr[5]));
  }

- NK_INTERNAL float64x2_t nk_promote_upper_f32x4_to_f64x2_neon_(float32x4_t values_f32x4) {
- return vcvt_f64_f32(vget_high_f32(values_f32x4));
- }
-
  NK_INTERNAL nk_f64_t nk_reduce_stable_f64x2_neon_(float64x2_t values_f64x2) {
  nk_b128_vec_t values;
  values.f64x2 = values_f64x2;
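Editor's note (illustrative, not part of the diff): the removed helper above promoted the upper two f32 lanes to f64 in two steps; later hunks call the single ACLE intrinsic that does the same thing. A minimal sketch of the equivalence, assuming an AArch64 target where both forms compile:

#include <arm_neon.h>

// Two-step form used by the removed helper: extract the high half, then widen.
static inline float64x2_t promote_upper_two_steps(float32x4_t v) {
    return vcvt_f64_f32(vget_high_f32(v));
}

// One-step form used in 7.4.2: FCVTL2 widens lanes 2..3 of the f32x4 directly.
static inline float64x2_t promote_upper_one_step(float32x4_t v) {
    return vcvt_high_f64_f32(v);
}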
@@ -102,250 +97,6 @@ NK_INTERNAL void nk_accumulate_square_f64x2_neon_(float64x2_t *sum_f64x2, float6
  *compensation_f64x2 = vaddq_f64(*compensation_f64x2, vaddq_f64(sum_error_f64x2, product_error_f64x2));
  }

- NK_INTERNAL void nk_bicentroid_f32_neon_( //
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z) { //
- float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
- float64x2_t sum_a_x_lower_f64x2 = zero_f64x2, sum_a_x_upper_f64x2 = zero_f64x2;
- float64x2_t sum_a_y_lower_f64x2 = zero_f64x2, sum_a_y_upper_f64x2 = zero_f64x2;
- float64x2_t sum_a_z_lower_f64x2 = zero_f64x2, sum_a_z_upper_f64x2 = zero_f64x2;
- float64x2_t sum_b_x_lower_f64x2 = zero_f64x2, sum_b_x_upper_f64x2 = zero_f64x2;
- float64x2_t sum_b_y_lower_f64x2 = zero_f64x2, sum_b_y_upper_f64x2 = zero_f64x2;
- float64x2_t sum_b_z_lower_f64x2 = zero_f64x2, sum_b_z_upper_f64x2 = zero_f64x2;
- nk_size_t index = 0;
-
- for (; index + 4 <= n; index += 4) {
- float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
- nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
- nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
-
- float64x2_t a_x_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_x_f32x4));
- float64x2_t a_x_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4);
- float64x2_t a_y_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_y_f32x4));
- float64x2_t a_y_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4);
- float64x2_t a_z_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_z_f32x4));
- float64x2_t a_z_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4);
- float64x2_t b_x_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_x_f32x4));
- float64x2_t b_x_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4);
- float64x2_t b_y_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_y_f32x4));
- float64x2_t b_y_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4);
- float64x2_t b_z_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_z_f32x4));
- float64x2_t b_z_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4);
-
- sum_a_x_lower_f64x2 = vaddq_f64(sum_a_x_lower_f64x2, a_x_lower_f64x2),
- sum_a_x_upper_f64x2 = vaddq_f64(sum_a_x_upper_f64x2, a_x_upper_f64x2);
- sum_a_y_lower_f64x2 = vaddq_f64(sum_a_y_lower_f64x2, a_y_lower_f64x2),
- sum_a_y_upper_f64x2 = vaddq_f64(sum_a_y_upper_f64x2, a_y_upper_f64x2);
- sum_a_z_lower_f64x2 = vaddq_f64(sum_a_z_lower_f64x2, a_z_lower_f64x2),
- sum_a_z_upper_f64x2 = vaddq_f64(sum_a_z_upper_f64x2, a_z_upper_f64x2);
- sum_b_x_lower_f64x2 = vaddq_f64(sum_b_x_lower_f64x2, b_x_lower_f64x2),
- sum_b_x_upper_f64x2 = vaddq_f64(sum_b_x_upper_f64x2, b_x_upper_f64x2);
- sum_b_y_lower_f64x2 = vaddq_f64(sum_b_y_lower_f64x2, b_y_lower_f64x2),
- sum_b_y_upper_f64x2 = vaddq_f64(sum_b_y_upper_f64x2, b_y_upper_f64x2);
- sum_b_z_lower_f64x2 = vaddq_f64(sum_b_z_lower_f64x2, b_z_lower_f64x2),
- sum_b_z_upper_f64x2 = vaddq_f64(sum_b_z_upper_f64x2, b_z_upper_f64x2);
- }
-
- nk_f64_t sum_a_x = vaddvq_f64(vaddq_f64(sum_a_x_lower_f64x2, sum_a_x_upper_f64x2));
- nk_f64_t sum_a_y = vaddvq_f64(vaddq_f64(sum_a_y_lower_f64x2, sum_a_y_upper_f64x2));
- nk_f64_t sum_a_z = vaddvq_f64(vaddq_f64(sum_a_z_lower_f64x2, sum_a_z_upper_f64x2));
- nk_f64_t sum_b_x = vaddvq_f64(vaddq_f64(sum_b_x_lower_f64x2, sum_b_x_upper_f64x2));
- nk_f64_t sum_b_y = vaddvq_f64(vaddq_f64(sum_b_y_lower_f64x2, sum_b_y_upper_f64x2));
- nk_f64_t sum_b_z = vaddvq_f64(vaddq_f64(sum_b_z_lower_f64x2, sum_b_z_upper_f64x2));
-
- for (; index < n; ++index) {
- sum_a_x += a[index * 3 + 0], sum_a_y += a[index * 3 + 1], sum_a_z += a[index * 3 + 2];
- sum_b_x += b[index * 3 + 0], sum_b_y += b[index * 3 + 1], sum_b_z += b[index * 3 + 2];
- }
-
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
- *ca_x = sum_a_x * inv_n, *ca_y = sum_a_y * inv_n, *ca_z = sum_a_z * inv_n;
- *cb_x = sum_b_x * inv_n, *cb_y = sum_b_y * inv_n, *cb_z = sum_b_z * inv_n;
- }
-
- NK_INTERNAL void nk_cross_covariance_f32_neon_( //
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t centroid_a_x, nk_f64_t centroid_a_y,
- nk_f64_t centroid_a_z, nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z, nk_f64_t h[9]) {
- float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
- float64x2_t centroid_a_x_f64x2 = vdupq_n_f64(centroid_a_x), centroid_a_y_f64x2 = vdupq_n_f64(centroid_a_y);
- float64x2_t centroid_a_z_f64x2 = vdupq_n_f64(centroid_a_z), centroid_b_x_f64x2 = vdupq_n_f64(centroid_b_x);
- float64x2_t centroid_b_y_f64x2 = vdupq_n_f64(centroid_b_y), centroid_b_z_f64x2 = vdupq_n_f64(centroid_b_z);
- float64x2_t cross_00_lower_f64x2 = zero_f64x2, cross_00_upper_f64x2 = zero_f64x2;
- float64x2_t cross_01_lower_f64x2 = zero_f64x2, cross_01_upper_f64x2 = zero_f64x2;
- float64x2_t cross_02_lower_f64x2 = zero_f64x2, cross_02_upper_f64x2 = zero_f64x2;
- float64x2_t cross_10_lower_f64x2 = zero_f64x2, cross_10_upper_f64x2 = zero_f64x2;
- float64x2_t cross_11_lower_f64x2 = zero_f64x2, cross_11_upper_f64x2 = zero_f64x2;
- float64x2_t cross_12_lower_f64x2 = zero_f64x2, cross_12_upper_f64x2 = zero_f64x2;
- float64x2_t cross_20_lower_f64x2 = zero_f64x2, cross_20_upper_f64x2 = zero_f64x2;
- float64x2_t cross_21_lower_f64x2 = zero_f64x2, cross_21_upper_f64x2 = zero_f64x2;
- float64x2_t cross_22_lower_f64x2 = zero_f64x2, cross_22_upper_f64x2 = zero_f64x2;
- nk_size_t index = 0;
-
- for (; index + 4 <= n; index += 4) {
- float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
- nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
- nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
-
- float64x2_t centered_a_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_x_f32x4)), centroid_a_x_f64x2);
- float64x2_t centered_a_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4),
- centroid_a_x_f64x2);
- float64x2_t centered_a_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_y_f32x4)), centroid_a_y_f64x2);
- float64x2_t centered_a_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4),
- centroid_a_y_f64x2);
- float64x2_t centered_a_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_z_f32x4)), centroid_a_z_f64x2);
- float64x2_t centered_a_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4),
- centroid_a_z_f64x2);
- float64x2_t centered_b_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_x_f32x4)), centroid_b_x_f64x2);
- float64x2_t centered_b_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4),
- centroid_b_x_f64x2);
- float64x2_t centered_b_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_y_f32x4)), centroid_b_y_f64x2);
- float64x2_t centered_b_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4),
- centroid_b_y_f64x2);
- float64x2_t centered_b_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_z_f32x4)), centroid_b_z_f64x2);
- float64x2_t centered_b_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4),
- centroid_b_z_f64x2);
-
- cross_00_lower_f64x2 = vfmaq_f64(cross_00_lower_f64x2, centered_a_x_lower_f64x2, centered_b_x_lower_f64x2),
- cross_00_upper_f64x2 = vfmaq_f64(cross_00_upper_f64x2, centered_a_x_upper_f64x2, centered_b_x_upper_f64x2);
- cross_01_lower_f64x2 = vfmaq_f64(cross_01_lower_f64x2, centered_a_x_lower_f64x2, centered_b_y_lower_f64x2),
- cross_01_upper_f64x2 = vfmaq_f64(cross_01_upper_f64x2, centered_a_x_upper_f64x2, centered_b_y_upper_f64x2);
- cross_02_lower_f64x2 = vfmaq_f64(cross_02_lower_f64x2, centered_a_x_lower_f64x2, centered_b_z_lower_f64x2),
- cross_02_upper_f64x2 = vfmaq_f64(cross_02_upper_f64x2, centered_a_x_upper_f64x2, centered_b_z_upper_f64x2);
- cross_10_lower_f64x2 = vfmaq_f64(cross_10_lower_f64x2, centered_a_y_lower_f64x2, centered_b_x_lower_f64x2),
- cross_10_upper_f64x2 = vfmaq_f64(cross_10_upper_f64x2, centered_a_y_upper_f64x2, centered_b_x_upper_f64x2);
- cross_11_lower_f64x2 = vfmaq_f64(cross_11_lower_f64x2, centered_a_y_lower_f64x2, centered_b_y_lower_f64x2),
- cross_11_upper_f64x2 = vfmaq_f64(cross_11_upper_f64x2, centered_a_y_upper_f64x2, centered_b_y_upper_f64x2);
- cross_12_lower_f64x2 = vfmaq_f64(cross_12_lower_f64x2, centered_a_y_lower_f64x2, centered_b_z_lower_f64x2),
- cross_12_upper_f64x2 = vfmaq_f64(cross_12_upper_f64x2, centered_a_y_upper_f64x2, centered_b_z_upper_f64x2);
- cross_20_lower_f64x2 = vfmaq_f64(cross_20_lower_f64x2, centered_a_z_lower_f64x2, centered_b_x_lower_f64x2),
- cross_20_upper_f64x2 = vfmaq_f64(cross_20_upper_f64x2, centered_a_z_upper_f64x2, centered_b_x_upper_f64x2);
- cross_21_lower_f64x2 = vfmaq_f64(cross_21_lower_f64x2, centered_a_z_lower_f64x2, centered_b_y_lower_f64x2),
- cross_21_upper_f64x2 = vfmaq_f64(cross_21_upper_f64x2, centered_a_z_upper_f64x2, centered_b_y_upper_f64x2);
- cross_22_lower_f64x2 = vfmaq_f64(cross_22_lower_f64x2, centered_a_z_lower_f64x2, centered_b_z_lower_f64x2),
- cross_22_upper_f64x2 = vfmaq_f64(cross_22_upper_f64x2, centered_a_z_upper_f64x2, centered_b_z_upper_f64x2);
- }
-
- h[0] = vaddvq_f64(vaddq_f64(cross_00_lower_f64x2, cross_00_upper_f64x2));
- h[1] = vaddvq_f64(vaddq_f64(cross_01_lower_f64x2, cross_01_upper_f64x2));
- h[2] = vaddvq_f64(vaddq_f64(cross_02_lower_f64x2, cross_02_upper_f64x2));
- h[3] = vaddvq_f64(vaddq_f64(cross_10_lower_f64x2, cross_10_upper_f64x2));
- h[4] = vaddvq_f64(vaddq_f64(cross_11_lower_f64x2, cross_11_upper_f64x2));
- h[5] = vaddvq_f64(vaddq_f64(cross_12_lower_f64x2, cross_12_upper_f64x2));
- h[6] = vaddvq_f64(vaddq_f64(cross_20_lower_f64x2, cross_20_upper_f64x2));
- h[7] = vaddvq_f64(vaddq_f64(cross_21_lower_f64x2, cross_21_upper_f64x2));
- h[8] = vaddvq_f64(vaddq_f64(cross_22_lower_f64x2, cross_22_upper_f64x2));
-
- for (; index < n; ++index) {
- nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x;
- nk_f64_t centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y;
- nk_f64_t centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
- nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x;
- nk_f64_t centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y;
- nk_f64_t centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
- h[0] += centered_a_x * centered_b_x, h[1] += centered_a_x * centered_b_y, h[2] += centered_a_x * centered_b_z;
- h[3] += centered_a_y * centered_b_x, h[4] += centered_a_y * centered_b_y, h[5] += centered_a_y * centered_b_z;
- h[6] += centered_a_z * centered_b_x, h[7] += centered_a_z * centered_b_y, h[8] += centered_a_z * centered_b_z;
- }
- }
-
- NK_INTERNAL void nk_cross_covariance_and_variance_f32_neon_( //
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t centroid_a_x, nk_f64_t centroid_a_y,
- nk_f64_t centroid_a_z, nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z, nk_f64_t h[9],
- nk_f64_t *variance_a) {
- float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
- float64x2_t centroid_a_x_f64x2 = vdupq_n_f64(centroid_a_x), centroid_a_y_f64x2 = vdupq_n_f64(centroid_a_y);
- float64x2_t centroid_a_z_f64x2 = vdupq_n_f64(centroid_a_z), centroid_b_x_f64x2 = vdupq_n_f64(centroid_b_x);
- float64x2_t centroid_b_y_f64x2 = vdupq_n_f64(centroid_b_y), centroid_b_z_f64x2 = vdupq_n_f64(centroid_b_z);
- float64x2_t cross_00_lower_f64x2 = zero_f64x2, cross_00_upper_f64x2 = zero_f64x2;
- float64x2_t cross_01_lower_f64x2 = zero_f64x2, cross_01_upper_f64x2 = zero_f64x2;
- float64x2_t cross_02_lower_f64x2 = zero_f64x2, cross_02_upper_f64x2 = zero_f64x2;
- float64x2_t cross_10_lower_f64x2 = zero_f64x2, cross_10_upper_f64x2 = zero_f64x2;
- float64x2_t cross_11_lower_f64x2 = zero_f64x2, cross_11_upper_f64x2 = zero_f64x2;
- float64x2_t cross_12_lower_f64x2 = zero_f64x2, cross_12_upper_f64x2 = zero_f64x2;
- float64x2_t cross_20_lower_f64x2 = zero_f64x2, cross_20_upper_f64x2 = zero_f64x2;
- float64x2_t cross_21_lower_f64x2 = zero_f64x2, cross_21_upper_f64x2 = zero_f64x2;
- float64x2_t cross_22_lower_f64x2 = zero_f64x2, cross_22_upper_f64x2 = zero_f64x2;
- float64x2_t variance_lower_f64x2 = zero_f64x2, variance_upper_f64x2 = zero_f64x2;
- nk_size_t index = 0;
-
- for (; index + 4 <= n; index += 4) {
- float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
- nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
- nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
-
- float64x2_t centered_a_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_x_f32x4)), centroid_a_x_f64x2);
- float64x2_t centered_a_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4),
- centroid_a_x_f64x2);
- float64x2_t centered_a_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_y_f32x4)), centroid_a_y_f64x2);
- float64x2_t centered_a_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4),
- centroid_a_y_f64x2);
- float64x2_t centered_a_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_z_f32x4)), centroid_a_z_f64x2);
- float64x2_t centered_a_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4),
- centroid_a_z_f64x2);
- float64x2_t centered_b_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_x_f32x4)), centroid_b_x_f64x2);
- float64x2_t centered_b_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4),
- centroid_b_x_f64x2);
- float64x2_t centered_b_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_y_f32x4)), centroid_b_y_f64x2);
- float64x2_t centered_b_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4),
- centroid_b_y_f64x2);
- float64x2_t centered_b_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_z_f32x4)), centroid_b_z_f64x2);
- float64x2_t centered_b_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4),
- centroid_b_z_f64x2);
-
- cross_00_lower_f64x2 = vfmaq_f64(cross_00_lower_f64x2, centered_a_x_lower_f64x2, centered_b_x_lower_f64x2),
- cross_00_upper_f64x2 = vfmaq_f64(cross_00_upper_f64x2, centered_a_x_upper_f64x2, centered_b_x_upper_f64x2);
- cross_01_lower_f64x2 = vfmaq_f64(cross_01_lower_f64x2, centered_a_x_lower_f64x2, centered_b_y_lower_f64x2),
- cross_01_upper_f64x2 = vfmaq_f64(cross_01_upper_f64x2, centered_a_x_upper_f64x2, centered_b_y_upper_f64x2);
- cross_02_lower_f64x2 = vfmaq_f64(cross_02_lower_f64x2, centered_a_x_lower_f64x2, centered_b_z_lower_f64x2),
- cross_02_upper_f64x2 = vfmaq_f64(cross_02_upper_f64x2, centered_a_x_upper_f64x2, centered_b_z_upper_f64x2);
- cross_10_lower_f64x2 = vfmaq_f64(cross_10_lower_f64x2, centered_a_y_lower_f64x2, centered_b_x_lower_f64x2),
- cross_10_upper_f64x2 = vfmaq_f64(cross_10_upper_f64x2, centered_a_y_upper_f64x2, centered_b_x_upper_f64x2);
- cross_11_lower_f64x2 = vfmaq_f64(cross_11_lower_f64x2, centered_a_y_lower_f64x2, centered_b_y_lower_f64x2),
- cross_11_upper_f64x2 = vfmaq_f64(cross_11_upper_f64x2, centered_a_y_upper_f64x2, centered_b_y_upper_f64x2);
- cross_12_lower_f64x2 = vfmaq_f64(cross_12_lower_f64x2, centered_a_y_lower_f64x2, centered_b_z_lower_f64x2),
- cross_12_upper_f64x2 = vfmaq_f64(cross_12_upper_f64x2, centered_a_y_upper_f64x2, centered_b_z_upper_f64x2);
- cross_20_lower_f64x2 = vfmaq_f64(cross_20_lower_f64x2, centered_a_z_lower_f64x2, centered_b_x_lower_f64x2),
- cross_20_upper_f64x2 = vfmaq_f64(cross_20_upper_f64x2, centered_a_z_upper_f64x2, centered_b_x_upper_f64x2);
- cross_21_lower_f64x2 = vfmaq_f64(cross_21_lower_f64x2, centered_a_z_lower_f64x2, centered_b_y_lower_f64x2),
- cross_21_upper_f64x2 = vfmaq_f64(cross_21_upper_f64x2, centered_a_z_upper_f64x2, centered_b_y_upper_f64x2);
- cross_22_lower_f64x2 = vfmaq_f64(cross_22_lower_f64x2, centered_a_z_lower_f64x2, centered_b_z_lower_f64x2),
- cross_22_upper_f64x2 = vfmaq_f64(cross_22_upper_f64x2, centered_a_z_upper_f64x2, centered_b_z_upper_f64x2);
-
- variance_lower_f64x2 = vfmaq_f64(variance_lower_f64x2, centered_a_x_lower_f64x2, centered_a_x_lower_f64x2),
- variance_upper_f64x2 = vfmaq_f64(variance_upper_f64x2, centered_a_x_upper_f64x2, centered_a_x_upper_f64x2);
- variance_lower_f64x2 = vfmaq_f64(variance_lower_f64x2, centered_a_y_lower_f64x2, centered_a_y_lower_f64x2),
- variance_upper_f64x2 = vfmaq_f64(variance_upper_f64x2, centered_a_y_upper_f64x2, centered_a_y_upper_f64x2);
- variance_lower_f64x2 = vfmaq_f64(variance_lower_f64x2, centered_a_z_lower_f64x2, centered_a_z_lower_f64x2),
- variance_upper_f64x2 = vfmaq_f64(variance_upper_f64x2, centered_a_z_upper_f64x2, centered_a_z_upper_f64x2);
- }
-
- h[0] = vaddvq_f64(vaddq_f64(cross_00_lower_f64x2, cross_00_upper_f64x2));
- h[1] = vaddvq_f64(vaddq_f64(cross_01_lower_f64x2, cross_01_upper_f64x2));
- h[2] = vaddvq_f64(vaddq_f64(cross_02_lower_f64x2, cross_02_upper_f64x2));
- h[3] = vaddvq_f64(vaddq_f64(cross_10_lower_f64x2, cross_10_upper_f64x2));
- h[4] = vaddvq_f64(vaddq_f64(cross_11_lower_f64x2, cross_11_upper_f64x2));
- h[5] = vaddvq_f64(vaddq_f64(cross_12_lower_f64x2, cross_12_upper_f64x2));
- h[6] = vaddvq_f64(vaddq_f64(cross_20_lower_f64x2, cross_20_upper_f64x2));
- h[7] = vaddvq_f64(vaddq_f64(cross_21_lower_f64x2, cross_21_upper_f64x2));
- h[8] = vaddvq_f64(vaddq_f64(cross_22_lower_f64x2, cross_22_upper_f64x2));
- *variance_a = vaddvq_f64(vaddq_f64(variance_lower_f64x2, variance_upper_f64x2)) / (nk_f64_t)n;
-
- for (; index < n; ++index) {
- nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x;
- nk_f64_t centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y;
- nk_f64_t centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
- nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x;
- nk_f64_t centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y;
- nk_f64_t centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
- h[0] += centered_a_x * centered_b_x, h[1] += centered_a_x * centered_b_y, h[2] += centered_a_x * centered_b_z;
- h[3] += centered_a_y * centered_b_x, h[4] += centered_a_y * centered_b_y, h[5] += centered_a_y * centered_b_z;
- h[6] += centered_a_z * centered_b_x, h[7] += centered_a_z * centered_b_y, h[8] += centered_a_z * centered_b_z;
- *variance_a += (centered_a_x * centered_a_x + centered_a_y * centered_a_y + centered_a_z * centered_a_z) /
- (nk_f64_t)n;
- }
- }
-
  NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_neon_( //
  nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t const *r, nk_f64_t scale, nk_f64_t centroid_a_x,
  nk_f64_t centroid_a_y, nk_f64_t centroid_a_z, nk_f64_t centroid_b_x, nk_f64_t centroid_b_y, nk_f64_t centroid_b_z) {
@@ -361,7 +112,7 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_neon_( //
  float64x2_t centroid_a_x_f64x2 = vdupq_n_f64(centroid_a_x), centroid_a_y_f64x2 = vdupq_n_f64(centroid_a_y);
  float64x2_t centroid_a_z_f64x2 = vdupq_n_f64(centroid_a_z), centroid_b_x_f64x2 = vdupq_n_f64(centroid_b_x);
  float64x2_t centroid_b_y_f64x2 = vdupq_n_f64(centroid_b_y), centroid_b_z_f64x2 = vdupq_n_f64(centroid_b_z);
- float64x2_t sum_squared_lower_f64x2 = vdupq_n_f64(0.0), sum_squared_upper_f64x2 = vdupq_n_f64(0.0);
+ float64x2_t sum_squared_low_f64x2 = vdupq_n_f64(0.0), sum_squared_high_f64x2 = vdupq_n_f64(0.0);
  nk_size_t index = 0;

  for (; index + 4 <= n; index += 4) {
@@ -369,76 +120,70 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_neon_( //
  nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
  nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);

- float64x2_t centered_a_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_x_f32x4)), centroid_a_x_f64x2);
- float64x2_t centered_a_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4),
- centroid_a_x_f64x2);
- float64x2_t centered_a_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_y_f32x4)), centroid_a_y_f64x2);
- float64x2_t centered_a_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4),
- centroid_a_y_f64x2);
- float64x2_t centered_a_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_z_f32x4)), centroid_a_z_f64x2);
- float64x2_t centered_a_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4),
- centroid_a_z_f64x2);
- float64x2_t centered_b_x_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_x_f32x4)), centroid_b_x_f64x2);
- float64x2_t centered_b_x_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4),
- centroid_b_x_f64x2);
- float64x2_t centered_b_y_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_y_f32x4)), centroid_b_y_f64x2);
- float64x2_t centered_b_y_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4),
- centroid_b_y_f64x2);
- float64x2_t centered_b_z_lower_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_z_f32x4)), centroid_b_z_f64x2);
- float64x2_t centered_b_z_upper_f64x2 = vsubq_f64(nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4),
- centroid_b_z_f64x2);
-
- float64x2_t rotated_a_x_lower_f64x2 = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a_x_lower_f64x2), scaled_rotation_x_y_f64x2,
- centered_a_y_lower_f64x2),
- scaled_rotation_x_z_f64x2, centered_a_z_lower_f64x2);
- float64x2_t rotated_a_x_upper_f64x2 = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a_x_upper_f64x2), scaled_rotation_x_y_f64x2,
- centered_a_y_upper_f64x2),
- scaled_rotation_x_z_f64x2, centered_a_z_upper_f64x2);
- float64x2_t rotated_a_y_lower_f64x2 = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a_x_lower_f64x2), scaled_rotation_y_y_f64x2,
- centered_a_y_lower_f64x2),
- scaled_rotation_y_z_f64x2, centered_a_z_lower_f64x2);
- float64x2_t rotated_a_y_upper_f64x2 = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a_x_upper_f64x2), scaled_rotation_y_y_f64x2,
- centered_a_y_upper_f64x2),
- scaled_rotation_y_z_f64x2, centered_a_z_upper_f64x2);
- float64x2_t rotated_a_z_lower_f64x2 = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a_x_lower_f64x2), scaled_rotation_z_y_f64x2,
- centered_a_y_lower_f64x2),
- scaled_rotation_z_z_f64x2, centered_a_z_lower_f64x2);
- float64x2_t rotated_a_z_upper_f64x2 = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a_x_upper_f64x2), scaled_rotation_z_y_f64x2,
- centered_a_y_upper_f64x2),
- scaled_rotation_z_z_f64x2, centered_a_z_upper_f64x2);
-
- float64x2_t delta_x_lower_f64x2 = vsubq_f64(rotated_a_x_lower_f64x2, centered_b_x_lower_f64x2);
- float64x2_t delta_x_upper_f64x2 = vsubq_f64(rotated_a_x_upper_f64x2, centered_b_x_upper_f64x2);
- float64x2_t delta_y_lower_f64x2 = vsubq_f64(rotated_a_y_lower_f64x2, centered_b_y_lower_f64x2);
- float64x2_t delta_y_upper_f64x2 = vsubq_f64(rotated_a_y_upper_f64x2, centered_b_y_upper_f64x2);
- float64x2_t delta_z_lower_f64x2 = vsubq_f64(rotated_a_z_lower_f64x2, centered_b_z_lower_f64x2);
- float64x2_t delta_z_upper_f64x2 = vsubq_f64(rotated_a_z_upper_f64x2, centered_b_z_upper_f64x2);
-
- sum_squared_lower_f64x2 = vfmaq_f64(sum_squared_lower_f64x2, delta_x_lower_f64x2, delta_x_lower_f64x2),
- sum_squared_upper_f64x2 = vfmaq_f64(sum_squared_upper_f64x2, delta_x_upper_f64x2, delta_x_upper_f64x2);
- sum_squared_lower_f64x2 = vfmaq_f64(sum_squared_lower_f64x2, delta_y_lower_f64x2, delta_y_lower_f64x2),
- sum_squared_upper_f64x2 = vfmaq_f64(sum_squared_upper_f64x2, delta_y_upper_f64x2, delta_y_upper_f64x2);
- sum_squared_lower_f64x2 = vfmaq_f64(sum_squared_lower_f64x2, delta_z_lower_f64x2, delta_z_lower_f64x2),
- sum_squared_upper_f64x2 = vfmaq_f64(sum_squared_upper_f64x2, delta_z_upper_f64x2, delta_z_upper_f64x2);
+ float64x2_t centered_a_x_low_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_x_f32x4)), centroid_a_x_f64x2);
+ float64x2_t centered_a_x_high_f64x2 = vsubq_f64(vcvt_high_f64_f32(a_x_f32x4), centroid_a_x_f64x2);
+ float64x2_t centered_a_y_low_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_y_f32x4)), centroid_a_y_f64x2);
+ float64x2_t centered_a_y_high_f64x2 = vsubq_f64(vcvt_high_f64_f32(a_y_f32x4), centroid_a_y_f64x2);
+ float64x2_t centered_a_z_low_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(a_z_f32x4)), centroid_a_z_f64x2);
+ float64x2_t centered_a_z_high_f64x2 = vsubq_f64(vcvt_high_f64_f32(a_z_f32x4), centroid_a_z_f64x2);
+ float64x2_t centered_b_x_low_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_x_f32x4)), centroid_b_x_f64x2);
+ float64x2_t centered_b_x_high_f64x2 = vsubq_f64(vcvt_high_f64_f32(b_x_f32x4), centroid_b_x_f64x2);
+ float64x2_t centered_b_y_low_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_y_f32x4)), centroid_b_y_f64x2);
+ float64x2_t centered_b_y_high_f64x2 = vsubq_f64(vcvt_high_f64_f32(b_y_f32x4), centroid_b_y_f64x2);
+ float64x2_t centered_b_z_low_f64x2 = vsubq_f64(vcvt_f64_f32(vget_low_f32(b_z_f32x4)), centroid_b_z_f64x2);
+ float64x2_t centered_b_z_high_f64x2 = vsubq_f64(vcvt_high_f64_f32(b_z_f32x4), centroid_b_z_f64x2);
+
+ float64x2_t rotated_a_x_low_f64x2 = vfmaq_f64(
+ vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a_x_low_f64x2), scaled_rotation_x_y_f64x2,
+ centered_a_y_low_f64x2),
+ scaled_rotation_x_z_f64x2, centered_a_z_low_f64x2);
+ float64x2_t rotated_a_x_high_f64x2 = vfmaq_f64(
+ vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a_x_high_f64x2), scaled_rotation_x_y_f64x2,
+ centered_a_y_high_f64x2),
+ scaled_rotation_x_z_f64x2, centered_a_z_high_f64x2);
+ float64x2_t rotated_a_y_low_f64x2 = vfmaq_f64(
+ vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a_x_low_f64x2), scaled_rotation_y_y_f64x2,
+ centered_a_y_low_f64x2),
+ scaled_rotation_y_z_f64x2, centered_a_z_low_f64x2);
+ float64x2_t rotated_a_y_high_f64x2 = vfmaq_f64(
+ vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a_x_high_f64x2), scaled_rotation_y_y_f64x2,
+ centered_a_y_high_f64x2),
+ scaled_rotation_y_z_f64x2, centered_a_z_high_f64x2);
+ float64x2_t rotated_a_z_low_f64x2 = vfmaq_f64(
+ vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a_x_low_f64x2), scaled_rotation_z_y_f64x2,
+ centered_a_y_low_f64x2),
+ scaled_rotation_z_z_f64x2, centered_a_z_low_f64x2);
+ float64x2_t rotated_a_z_high_f64x2 = vfmaq_f64(
+ vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a_x_high_f64x2), scaled_rotation_z_y_f64x2,
+ centered_a_y_high_f64x2),
+ scaled_rotation_z_z_f64x2, centered_a_z_high_f64x2);
+
+ float64x2_t delta_x_low_f64x2 = vsubq_f64(rotated_a_x_low_f64x2, centered_b_x_low_f64x2);
+ float64x2_t delta_x_high_f64x2 = vsubq_f64(rotated_a_x_high_f64x2, centered_b_x_high_f64x2);
+ float64x2_t delta_y_low_f64x2 = vsubq_f64(rotated_a_y_low_f64x2, centered_b_y_low_f64x2);
+ float64x2_t delta_y_high_f64x2 = vsubq_f64(rotated_a_y_high_f64x2, centered_b_y_high_f64x2);
+ float64x2_t delta_z_low_f64x2 = vsubq_f64(rotated_a_z_low_f64x2, centered_b_z_low_f64x2);
+ float64x2_t delta_z_high_f64x2 = vsubq_f64(rotated_a_z_high_f64x2, centered_b_z_high_f64x2);
+
+ sum_squared_low_f64x2 = vfmaq_f64(sum_squared_low_f64x2, delta_x_low_f64x2, delta_x_low_f64x2),
+ sum_squared_high_f64x2 = vfmaq_f64(sum_squared_high_f64x2, delta_x_high_f64x2, delta_x_high_f64x2);
+ sum_squared_low_f64x2 = vfmaq_f64(sum_squared_low_f64x2, delta_y_low_f64x2, delta_y_low_f64x2),
+ sum_squared_high_f64x2 = vfmaq_f64(sum_squared_high_f64x2, delta_y_high_f64x2, delta_y_high_f64x2);
+ sum_squared_low_f64x2 = vfmaq_f64(sum_squared_low_f64x2, delta_z_low_f64x2, delta_z_low_f64x2),
+ sum_squared_high_f64x2 = vfmaq_f64(sum_squared_high_f64x2, delta_z_high_f64x2, delta_z_high_f64x2);
  }

- nk_f64_t sum_squared = vaddvq_f64(vaddq_f64(sum_squared_lower_f64x2, sum_squared_upper_f64x2));
+ nk_f64_t sum_squared = vaddvq_f64(vaddq_f64(sum_squared_low_f64x2, sum_squared_high_f64x2));
  for (; index < n; ++index) {
- nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x;
- nk_f64_t centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y;
- nk_f64_t centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
- nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x;
- nk_f64_t centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y;
- nk_f64_t centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
- nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z);
- nk_f64_t rotated_a_y = scale * (r[3] * centered_a_x + r[4] * centered_a_y + r[5] * centered_a_z);
- nk_f64_t rotated_a_z = scale * (r[6] * centered_a_x + r[7] * centered_a_y + r[8] * centered_a_z);
+ nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x,
+ centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y,
+ centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
+ nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x,
+ centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y,
+ centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
+ nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z),
+ rotated_a_y = scale * (r[3] * centered_a_x + r[4] * centered_a_y + r[5] * centered_a_z),
+ rotated_a_z = scale * (r[6] * centered_a_x + r[7] * centered_a_y + r[8] * centered_a_z);
  nk_f64_t delta_x = rotated_a_x - centered_b_x, delta_y = rotated_a_y - centered_b_y,
  delta_z = rotated_a_z - centered_b_z;
  sum_squared += delta_x * delta_x + delta_y * delta_y + delta_z * delta_z;
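Editor's note (illustrative, not part of the diff): both the f32 and f64 transformed-SSD kernels accumulate the same per-point residual, the squared distance between a scaled-and-rotated centered point of cloud a and the corresponding centered point of cloud b. A minimal scalar sketch of that per-point term, with hypothetical names, matching the scalar tail loop above:

// r is a 3x3 row-major rotation, s a uniform scale; returns |s*R*a' - b'|^2
static double transformed_ssd_point(double const r[9], double s,
                                    double const a_centered[3], double const b_centered[3]) {
    double rx = s * (r[0] * a_centered[0] + r[1] * a_centered[1] + r[2] * a_centered[2]);
    double ry = s * (r[3] * a_centered[0] + r[4] * a_centered[1] + r[5] * a_centered[2]);
    double rz = s * (r[6] * a_centered[0] + r[7] * a_centered[1] + r[8] * a_centered[2]);
    double dx = rx - b_centered[0], dy = ry - b_centered[1], dz = rz - b_centered[2];
    return dx * dx + dy * dy + dz * dz;
}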
@@ -482,100 +227,100 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_neon_(nk_f64_t const *a, nk_f64_t co
  // Main loop: process 4 points per iteration (2x unrolled, 2 points per batch)
  for (; j + 4 <= n; j += 4) {
  // First batch of 2 points
- float64x2_t a1_x, a1_y, a1_z, b1_x, b1_y, b1_z;
- nk_deinterleave_f64x2_neon_(a + j * 3, &a1_x, &a1_y, &a1_z);
- nk_deinterleave_f64x2_neon_(b + j * 3, &b1_x, &b1_y, &b1_z);
+ float64x2_t a1_x_f64x2, a1_y_f64x2, a1_z_f64x2, b1_x_f64x2, b1_y_f64x2, b1_z_f64x2;
+ nk_deinterleave_f64x2_neon_(a + j * 3, &a1_x_f64x2, &a1_y_f64x2, &a1_z_f64x2);
+ nk_deinterleave_f64x2_neon_(b + j * 3, &b1_x_f64x2, &b1_y_f64x2, &b1_z_f64x2);

  // Second batch of 2 points
- float64x2_t a2_x, a2_y, a2_z, b2_x, b2_y, b2_z;
- nk_deinterleave_f64x2_neon_(a + (j + 2) * 3, &a2_x, &a2_y, &a2_z);
- nk_deinterleave_f64x2_neon_(b + (j + 2) * 3, &b2_x, &b2_y, &b2_z);
+ float64x2_t a2_x_f64x2, a2_y_f64x2, a2_z_f64x2, b2_x_f64x2, b2_y_f64x2, b2_z_f64x2;
+ nk_deinterleave_f64x2_neon_(a + (j + 2) * 3, &a2_x_f64x2, &a2_y_f64x2, &a2_z_f64x2);
+ nk_deinterleave_f64x2_neon_(b + (j + 2) * 3, &b2_x_f64x2, &b2_y_f64x2, &b2_z_f64x2);

  // Center first batch
- float64x2_t pa1_x = vsubq_f64(a1_x, centroid_a_x_f64x2);
- float64x2_t pa1_y = vsubq_f64(a1_y, centroid_a_y_f64x2);
- float64x2_t pa1_z = vsubq_f64(a1_z, centroid_a_z_f64x2);
- float64x2_t pb1_x = vsubq_f64(b1_x, centroid_b_x_f64x2);
- float64x2_t pb1_y = vsubq_f64(b1_y, centroid_b_y_f64x2);
- float64x2_t pb1_z = vsubq_f64(b1_z, centroid_b_z_f64x2);
+ float64x2_t centered_a1_x_f64x2 = vsubq_f64(a1_x_f64x2, centroid_a_x_f64x2);
+ float64x2_t centered_a1_y_f64x2 = vsubq_f64(a1_y_f64x2, centroid_a_y_f64x2);
+ float64x2_t centered_a1_z_f64x2 = vsubq_f64(a1_z_f64x2, centroid_a_z_f64x2);
+ float64x2_t centered_b1_x_f64x2 = vsubq_f64(b1_x_f64x2, centroid_b_x_f64x2);
+ float64x2_t centered_b1_y_f64x2 = vsubq_f64(b1_y_f64x2, centroid_b_y_f64x2);
+ float64x2_t centered_b1_z_f64x2 = vsubq_f64(b1_z_f64x2, centroid_b_z_f64x2);

  // Center second batch
- float64x2_t pa2_x = vsubq_f64(a2_x, centroid_a_x_f64x2);
- float64x2_t pa2_y = vsubq_f64(a2_y, centroid_a_y_f64x2);
- float64x2_t pa2_z = vsubq_f64(a2_z, centroid_a_z_f64x2);
- float64x2_t pb2_x = vsubq_f64(b2_x, centroid_b_x_f64x2);
- float64x2_t pb2_y = vsubq_f64(b2_y, centroid_b_y_f64x2);
- float64x2_t pb2_z = vsubq_f64(b2_z, centroid_b_z_f64x2);
+ float64x2_t centered_a2_x_f64x2 = vsubq_f64(a2_x_f64x2, centroid_a_x_f64x2);
+ float64x2_t centered_a2_y_f64x2 = vsubq_f64(a2_y_f64x2, centroid_a_y_f64x2);
+ float64x2_t centered_a2_z_f64x2 = vsubq_f64(a2_z_f64x2, centroid_a_z_f64x2);
+ float64x2_t centered_b2_x_f64x2 = vsubq_f64(b2_x_f64x2, centroid_b_x_f64x2);
+ float64x2_t centered_b2_y_f64x2 = vsubq_f64(b2_y_f64x2, centroid_b_y_f64x2);
+ float64x2_t centered_b2_z_f64x2 = vsubq_f64(b2_z_f64x2, centroid_b_z_f64x2);

  // Rotate and scale first batch
- float64x2_t ra1_x = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, pa1_x), scaled_rotation_x_y_f64x2, pa1_y),
- scaled_rotation_x_z_f64x2, pa1_z);
- float64x2_t ra1_y = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, pa1_x), scaled_rotation_y_y_f64x2, pa1_y),
- scaled_rotation_y_z_f64x2, pa1_z);
- float64x2_t ra1_z = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, pa1_x), scaled_rotation_z_y_f64x2, pa1_y),
- scaled_rotation_z_z_f64x2, pa1_z);
+ float64x2_t rotated_a1_x_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a1_x_f64x2),
+ scaled_rotation_x_y_f64x2, centered_a1_y_f64x2),
+ scaled_rotation_x_z_f64x2, centered_a1_z_f64x2);
+ float64x2_t rotated_a1_y_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a1_x_f64x2),
+ scaled_rotation_y_y_f64x2, centered_a1_y_f64x2),
+ scaled_rotation_y_z_f64x2, centered_a1_z_f64x2);
+ float64x2_t rotated_a1_z_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a1_x_f64x2),
+ scaled_rotation_z_y_f64x2, centered_a1_y_f64x2),
+ scaled_rotation_z_z_f64x2, centered_a1_z_f64x2);

  // Rotate and scale second batch
- float64x2_t ra2_x = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, pa2_x), scaled_rotation_x_y_f64x2, pa2_y),
- scaled_rotation_x_z_f64x2, pa2_z);
- float64x2_t ra2_y = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, pa2_x), scaled_rotation_y_y_f64x2, pa2_y),
- scaled_rotation_y_z_f64x2, pa2_z);
- float64x2_t ra2_z = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, pa2_x), scaled_rotation_z_y_f64x2, pa2_y),
- scaled_rotation_z_z_f64x2, pa2_z);
+ float64x2_t rotated_a2_x_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a2_x_f64x2),
+ scaled_rotation_x_y_f64x2, centered_a2_y_f64x2),
+ scaled_rotation_x_z_f64x2, centered_a2_z_f64x2);
+ float64x2_t rotated_a2_y_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a2_x_f64x2),
+ scaled_rotation_y_y_f64x2, centered_a2_y_f64x2),
+ scaled_rotation_y_z_f64x2, centered_a2_z_f64x2);
+ float64x2_t rotated_a2_z_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a2_x_f64x2),
+ scaled_rotation_z_y_f64x2, centered_a2_y_f64x2),
+ scaled_rotation_z_z_f64x2, centered_a2_z_f64x2);

  // Deltas
- float64x2_t delta1_x = vsubq_f64(ra1_x, pb1_x);
- float64x2_t delta1_y = vsubq_f64(ra1_y, pb1_y);
- float64x2_t delta1_z = vsubq_f64(ra1_z, pb1_z);
- float64x2_t delta2_x = vsubq_f64(ra2_x, pb2_x);
- float64x2_t delta2_y = vsubq_f64(ra2_y, pb2_y);
- float64x2_t delta2_z = vsubq_f64(ra2_z, pb2_z);
+ float64x2_t delta1_x_f64x2 = vsubq_f64(rotated_a1_x_f64x2, centered_b1_x_f64x2);
+ float64x2_t delta1_y_f64x2 = vsubq_f64(rotated_a1_y_f64x2, centered_b1_y_f64x2);
+ float64x2_t delta1_z_f64x2 = vsubq_f64(rotated_a1_z_f64x2, centered_b1_z_f64x2);
+ float64x2_t delta2_x_f64x2 = vsubq_f64(rotated_a2_x_f64x2, centered_b2_x_f64x2);
+ float64x2_t delta2_y_f64x2 = vsubq_f64(rotated_a2_y_f64x2, centered_b2_y_f64x2);
+ float64x2_t delta2_z_f64x2 = vsubq_f64(rotated_a2_z_f64x2, centered_b2_z_f64x2);

  // Accumulate to independent accumulators (interleaved for latency hiding)
- nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_x);
- nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_x);
- nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_y);
- nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_y);
- nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_z);
- nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_z);
+ nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_x_f64x2);
+ nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_x_f64x2);
+ nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_y_f64x2);
+ nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_y_f64x2);
+ nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta1_z_f64x2);
+ nk_accumulate_square_f64x2_neon_(&sum_squared_b_f64x2, &sum_squared_b_compensation_f64x2, delta2_z_f64x2);
  }

  // Handle remaining 2 points
  if (j + 2 <= n) {
- float64x2_t a_x, a_y, a_z, b_x, b_y, b_z;
- nk_deinterleave_f64x2_neon_(a + j * 3, &a_x, &a_y, &a_z);
- nk_deinterleave_f64x2_neon_(b + j * 3, &b_x, &b_y, &b_z);
-
- float64x2_t pa_x = vsubq_f64(a_x, centroid_a_x_f64x2);
- float64x2_t pa_y = vsubq_f64(a_y, centroid_a_y_f64x2);
- float64x2_t pa_z = vsubq_f64(a_z, centroid_a_z_f64x2);
- float64x2_t pb_x = vsubq_f64(b_x, centroid_b_x_f64x2);
- float64x2_t pb_y = vsubq_f64(b_y, centroid_b_y_f64x2);
- float64x2_t pb_z = vsubq_f64(b_z, centroid_b_z_f64x2);
-
- float64x2_t ra_x = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, pa_x), scaled_rotation_x_y_f64x2, pa_y),
- scaled_rotation_x_z_f64x2, pa_z);
- float64x2_t ra_y = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, pa_x), scaled_rotation_y_y_f64x2, pa_y),
- scaled_rotation_y_z_f64x2, pa_z);
- float64x2_t ra_z = vfmaq_f64(
- vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, pa_x), scaled_rotation_z_y_f64x2, pa_y),
- scaled_rotation_z_z_f64x2, pa_z);
-
- float64x2_t delta_x = vsubq_f64(ra_x, pb_x);
- float64x2_t delta_y = vsubq_f64(ra_y, pb_y);
- float64x2_t delta_z = vsubq_f64(ra_z, pb_z);
-
- nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_x);
- nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_y);
- nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_z);
+ float64x2_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
+ nk_deinterleave_f64x2_neon_(a + j * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
+ nk_deinterleave_f64x2_neon_(b + j * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
+
+ float64x2_t centered_a_x_f64x2 = vsubq_f64(a_x_f64x2, centroid_a_x_f64x2);
+ float64x2_t centered_a_y_f64x2 = vsubq_f64(a_y_f64x2, centroid_a_y_f64x2);
+ float64x2_t centered_a_z_f64x2 = vsubq_f64(a_z_f64x2, centroid_a_z_f64x2);
+ float64x2_t centered_b_x_f64x2 = vsubq_f64(b_x_f64x2, centroid_b_x_f64x2);
+ float64x2_t centered_b_y_f64x2 = vsubq_f64(b_y_f64x2, centroid_b_y_f64x2);
+ float64x2_t centered_b_z_f64x2 = vsubq_f64(b_z_f64x2, centroid_b_z_f64x2);
+
+ float64x2_t rotated_a_x_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_x_x_f64x2, centered_a_x_f64x2),
+ scaled_rotation_x_y_f64x2, centered_a_y_f64x2),
+ scaled_rotation_x_z_f64x2, centered_a_z_f64x2);
+ float64x2_t rotated_a_y_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_y_x_f64x2, centered_a_x_f64x2),
+ scaled_rotation_y_y_f64x2, centered_a_y_f64x2),
+ scaled_rotation_y_z_f64x2, centered_a_z_f64x2);
+ float64x2_t rotated_a_z_f64x2 = vfmaq_f64(vfmaq_f64(vmulq_f64(scaled_rotation_z_x_f64x2, centered_a_x_f64x2),
+ scaled_rotation_z_y_f64x2, centered_a_y_f64x2),
+ scaled_rotation_z_z_f64x2, centered_a_z_f64x2);
+
+ float64x2_t delta_x_f64x2 = vsubq_f64(rotated_a_x_f64x2, centered_b_x_f64x2);
+ float64x2_t delta_y_f64x2 = vsubq_f64(rotated_a_y_f64x2, centered_b_y_f64x2);
319
+ float64x2_t delta_z_f64x2 = vsubq_f64(rotated_a_z_f64x2, centered_b_z_f64x2);
320
+
321
+ nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_x_f64x2);
322
+ nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_y_f64x2);
323
+ nk_accumulate_square_f64x2_neon_(&sum_squared_a_f64x2, &sum_squared_a_compensation_f64x2, delta_z_f64x2);
579
324
  j += 2;
580
325
  }
581
326
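The deltas above are not squared and summed directly; each one goes through `nk_accumulate_square_f64x2_neon_`, which carries a separate compensation vector next to each running sum. A minimal scalar sketch of that pattern, assuming the helper performs classic Kahan (compensated) accumulation of `delta * delta` per lane:

```c
#include <stdio.h>

/* Scalar sketch of compensated accumulation of delta^2; assumption:
 * nk_accumulate_square_f64x2_neon_ applies the same idea per f64 lane. */
static void accumulate_square(double *sum, double *compensation, double delta) {
    double term = delta * delta - *compensation; /* re-inject previously lost low-order bits */
    double updated = *sum + term;                /* rounded addition */
    *compensation = (updated - *sum) - term;     /* what this rounding discarded */
    *sum = updated;
}

int main(void) {
    double sum = 0.0, compensation = 0.0;
    for (int i = 0; i < 1000000; ++i) accumulate_square(&sum, &compensation, 1e-4);
    printf("%.12f\n", sum); /* stays very close to 0.01 despite a million tiny terms */
    return 0;
}
```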
 
@@ -588,20 +333,16 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_neon_(nk_f64_t const *a, nk_f64_t co
588
333
 
589
334
  // Scalar tail
590
335
  for (; j < n; ++j) {
591
- nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x;
592
- nk_f64_t pa_y = a[j * 3 + 1] - centroid_a_y;
593
- nk_f64_t pa_z = a[j * 3 + 2] - centroid_a_z;
594
- nk_f64_t pb_x = b[j * 3 + 0] - centroid_b_x;
595
- nk_f64_t pb_y = b[j * 3 + 1] - centroid_b_y;
596
- nk_f64_t pb_z = b[j * 3 + 2] - centroid_b_z;
597
-
598
- nk_f64_t ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z);
599
- nk_f64_t ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z);
600
- nk_f64_t ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);
601
-
602
- nk_f64_t delta_x = ra_x - pb_x;
603
- nk_f64_t delta_y = ra_y - pb_y;
604
- nk_f64_t delta_z = ra_z - pb_z;
336
+ nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x, pa_y = a[j * 3 + 1] - centroid_a_y,
337
+ pa_z = a[j * 3 + 2] - centroid_a_z;
338
+ nk_f64_t pb_x = b[j * 3 + 0] - centroid_b_x, pb_y = b[j * 3 + 1] - centroid_b_y,
339
+ pb_z = b[j * 3 + 2] - centroid_b_z;
340
+
341
+ nk_f64_t ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z),
342
+ ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z),
343
+ ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);
344
+
345
+ nk_f64_t delta_x = ra_x - pb_x, delta_y = ra_y - pb_y, delta_z = ra_z - pb_z;
605
346
  nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_x);
606
347
  nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_y);
607
348
  nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_z);
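For reference, the quantity this kernel accumulates is the sum of squared distances between the scaled-and-rotated centered `a` points and the centered `b` points, with the same row-major 3x3 layout of `r` as in the scalar tail above. A plain scalar version (without the compensated accumulation), illustrative only:

```c
#include <stddef.h>

/* Plain-C reference for the transformed SSD:
 * sum over i of || scale * R * (a_i - centroid_a) - (b_i - centroid_b) ||^2 */
static double transformed_ssd_reference(double const *a, double const *b, size_t n, double const *r, double scale,
                                        double const centroid_a[3], double const centroid_b[3]) {
    double sum = 0.0;
    for (size_t i = 0; i < n; ++i) {
        double pa_x = a[i * 3 + 0] - centroid_a[0], pa_y = a[i * 3 + 1] - centroid_a[1], pa_z = a[i * 3 + 2] - centroid_a[2];
        double pb_x = b[i * 3 + 0] - centroid_b[0], pb_y = b[i * 3 + 1] - centroid_b[1], pb_z = b[i * 3 + 2] - centroid_b[2];
        double ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z);
        double ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z);
        double ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);
        double dx = ra_x - pb_x, dy = ra_y - pb_y, dz = ra_z - pb_z;
        sum += dx * dx + dy * dy + dz * dz;
    }
    return sum;
}
```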
@@ -612,23 +353,21 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_neon_(nk_f64_t const *a, nk_f64_t co
612
353
 
613
354
  NK_PUBLIC void nk_rmsd_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
614
355
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
615
- if (rotation) {
616
- rotation[0] = 1, rotation[1] = 0, rotation[2] = 0;
617
- rotation[3] = 0, rotation[4] = 1, rotation[5] = 0;
356
+ if (rotation)
357
+ rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
618
358
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
619
- }
620
359
  if (scale) *scale = 1.0f;
621
360
 
622
361
  float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
623
- float64x2_t sum_a_x_lower_f64x2 = zero_f64x2, sum_a_x_upper_f64x2 = zero_f64x2;
624
- float64x2_t sum_a_y_lower_f64x2 = zero_f64x2, sum_a_y_upper_f64x2 = zero_f64x2;
625
- float64x2_t sum_a_z_lower_f64x2 = zero_f64x2, sum_a_z_upper_f64x2 = zero_f64x2;
626
- float64x2_t sum_b_x_lower_f64x2 = zero_f64x2, sum_b_x_upper_f64x2 = zero_f64x2;
627
- float64x2_t sum_b_y_lower_f64x2 = zero_f64x2, sum_b_y_upper_f64x2 = zero_f64x2;
628
- float64x2_t sum_b_z_lower_f64x2 = zero_f64x2, sum_b_z_upper_f64x2 = zero_f64x2;
629
- float64x2_t sum_squared_x_lower_f64x2 = zero_f64x2, sum_squared_x_upper_f64x2 = zero_f64x2;
630
- float64x2_t sum_squared_y_lower_f64x2 = zero_f64x2, sum_squared_y_upper_f64x2 = zero_f64x2;
631
- float64x2_t sum_squared_z_lower_f64x2 = zero_f64x2, sum_squared_z_upper_f64x2 = zero_f64x2;
362
+ float64x2_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
363
+ float64x2_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
364
+ float64x2_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
365
+ float64x2_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
366
+ float64x2_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
367
+ float64x2_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
368
+ float64x2_t sum_squared_x_low_f64x2 = zero_f64x2, sum_squared_x_high_f64x2 = zero_f64x2;
369
+ float64x2_t sum_squared_y_low_f64x2 = zero_f64x2, sum_squared_y_high_f64x2 = zero_f64x2;
370
+ float64x2_t sum_squared_z_low_f64x2 = zero_f64x2, sum_squared_z_high_f64x2 = zero_f64x2;
632
371
  nk_size_t index = 0;
633
372
 
634
373
  for (; index + 4 <= n; index += 4) {
@@ -636,56 +375,56 @@ NK_PUBLIC void nk_rmsd_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
636
375
  nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
637
376
  nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
638
377
 
639
- float64x2_t a_x_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_x_f32x4));
640
- float64x2_t a_x_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_x_f32x4);
641
- float64x2_t a_y_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_y_f32x4));
642
- float64x2_t a_y_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_y_f32x4);
643
- float64x2_t a_z_lower_f64x2 = vcvt_f64_f32(vget_low_f32(a_z_f32x4));
644
- float64x2_t a_z_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(a_z_f32x4);
645
- float64x2_t b_x_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_x_f32x4));
646
- float64x2_t b_x_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_x_f32x4);
647
- float64x2_t b_y_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_y_f32x4));
648
- float64x2_t b_y_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_y_f32x4);
649
- float64x2_t b_z_lower_f64x2 = vcvt_f64_f32(vget_low_f32(b_z_f32x4));
650
- float64x2_t b_z_upper_f64x2 = nk_promote_upper_f32x4_to_f64x2_neon_(b_z_f32x4);
651
-
652
- sum_a_x_lower_f64x2 = vaddq_f64(sum_a_x_lower_f64x2, a_x_lower_f64x2),
653
- sum_a_x_upper_f64x2 = vaddq_f64(sum_a_x_upper_f64x2, a_x_upper_f64x2);
654
- sum_a_y_lower_f64x2 = vaddq_f64(sum_a_y_lower_f64x2, a_y_lower_f64x2),
655
- sum_a_y_upper_f64x2 = vaddq_f64(sum_a_y_upper_f64x2, a_y_upper_f64x2);
656
- sum_a_z_lower_f64x2 = vaddq_f64(sum_a_z_lower_f64x2, a_z_lower_f64x2),
657
- sum_a_z_upper_f64x2 = vaddq_f64(sum_a_z_upper_f64x2, a_z_upper_f64x2);
658
- sum_b_x_lower_f64x2 = vaddq_f64(sum_b_x_lower_f64x2, b_x_lower_f64x2),
659
- sum_b_x_upper_f64x2 = vaddq_f64(sum_b_x_upper_f64x2, b_x_upper_f64x2);
660
- sum_b_y_lower_f64x2 = vaddq_f64(sum_b_y_lower_f64x2, b_y_lower_f64x2),
661
- sum_b_y_upper_f64x2 = vaddq_f64(sum_b_y_upper_f64x2, b_y_upper_f64x2);
662
- sum_b_z_lower_f64x2 = vaddq_f64(sum_b_z_lower_f64x2, b_z_lower_f64x2),
663
- sum_b_z_upper_f64x2 = vaddq_f64(sum_b_z_upper_f64x2, b_z_upper_f64x2);
664
-
665
- float64x2_t delta_x_lower_f64x2 = vsubq_f64(a_x_lower_f64x2, b_x_lower_f64x2);
666
- float64x2_t delta_x_upper_f64x2 = vsubq_f64(a_x_upper_f64x2, b_x_upper_f64x2);
667
- float64x2_t delta_y_lower_f64x2 = vsubq_f64(a_y_lower_f64x2, b_y_lower_f64x2);
668
- float64x2_t delta_y_upper_f64x2 = vsubq_f64(a_y_upper_f64x2, b_y_upper_f64x2);
669
- float64x2_t delta_z_lower_f64x2 = vsubq_f64(a_z_lower_f64x2, b_z_lower_f64x2);
670
- float64x2_t delta_z_upper_f64x2 = vsubq_f64(a_z_upper_f64x2, b_z_upper_f64x2);
671
-
672
- sum_squared_x_lower_f64x2 = vfmaq_f64(sum_squared_x_lower_f64x2, delta_x_lower_f64x2, delta_x_lower_f64x2),
673
- sum_squared_x_upper_f64x2 = vfmaq_f64(sum_squared_x_upper_f64x2, delta_x_upper_f64x2, delta_x_upper_f64x2);
674
- sum_squared_y_lower_f64x2 = vfmaq_f64(sum_squared_y_lower_f64x2, delta_y_lower_f64x2, delta_y_lower_f64x2),
675
- sum_squared_y_upper_f64x2 = vfmaq_f64(sum_squared_y_upper_f64x2, delta_y_upper_f64x2, delta_y_upper_f64x2);
676
- sum_squared_z_lower_f64x2 = vfmaq_f64(sum_squared_z_lower_f64x2, delta_z_lower_f64x2, delta_z_lower_f64x2),
677
- sum_squared_z_upper_f64x2 = vfmaq_f64(sum_squared_z_upper_f64x2, delta_z_upper_f64x2, delta_z_upper_f64x2);
378
+ float64x2_t a_x_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_x_f32x4));
379
+ float64x2_t a_x_high_f64x2 = vcvt_high_f64_f32(a_x_f32x4);
380
+ float64x2_t a_y_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_y_f32x4));
381
+ float64x2_t a_y_high_f64x2 = vcvt_high_f64_f32(a_y_f32x4);
382
+ float64x2_t a_z_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_z_f32x4));
383
+ float64x2_t a_z_high_f64x2 = vcvt_high_f64_f32(a_z_f32x4);
384
+ float64x2_t b_x_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_x_f32x4));
385
+ float64x2_t b_x_high_f64x2 = vcvt_high_f64_f32(b_x_f32x4);
386
+ float64x2_t b_y_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_y_f32x4));
387
+ float64x2_t b_y_high_f64x2 = vcvt_high_f64_f32(b_y_f32x4);
388
+ float64x2_t b_z_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_z_f32x4));
389
+ float64x2_t b_z_high_f64x2 = vcvt_high_f64_f32(b_z_f32x4);
390
+
391
+ sum_a_x_low_f64x2 = vaddq_f64(sum_a_x_low_f64x2, a_x_low_f64x2),
392
+ sum_a_x_high_f64x2 = vaddq_f64(sum_a_x_high_f64x2, a_x_high_f64x2);
393
+ sum_a_y_low_f64x2 = vaddq_f64(sum_a_y_low_f64x2, a_y_low_f64x2),
394
+ sum_a_y_high_f64x2 = vaddq_f64(sum_a_y_high_f64x2, a_y_high_f64x2);
395
+ sum_a_z_low_f64x2 = vaddq_f64(sum_a_z_low_f64x2, a_z_low_f64x2),
396
+ sum_a_z_high_f64x2 = vaddq_f64(sum_a_z_high_f64x2, a_z_high_f64x2);
397
+ sum_b_x_low_f64x2 = vaddq_f64(sum_b_x_low_f64x2, b_x_low_f64x2),
398
+ sum_b_x_high_f64x2 = vaddq_f64(sum_b_x_high_f64x2, b_x_high_f64x2);
399
+ sum_b_y_low_f64x2 = vaddq_f64(sum_b_y_low_f64x2, b_y_low_f64x2),
400
+ sum_b_y_high_f64x2 = vaddq_f64(sum_b_y_high_f64x2, b_y_high_f64x2);
401
+ sum_b_z_low_f64x2 = vaddq_f64(sum_b_z_low_f64x2, b_z_low_f64x2),
402
+ sum_b_z_high_f64x2 = vaddq_f64(sum_b_z_high_f64x2, b_z_high_f64x2);
403
+
404
+ float64x2_t delta_x_low_f64x2 = vsubq_f64(a_x_low_f64x2, b_x_low_f64x2);
405
+ float64x2_t delta_x_high_f64x2 = vsubq_f64(a_x_high_f64x2, b_x_high_f64x2);
406
+ float64x2_t delta_y_low_f64x2 = vsubq_f64(a_y_low_f64x2, b_y_low_f64x2);
407
+ float64x2_t delta_y_high_f64x2 = vsubq_f64(a_y_high_f64x2, b_y_high_f64x2);
408
+ float64x2_t delta_z_low_f64x2 = vsubq_f64(a_z_low_f64x2, b_z_low_f64x2);
409
+ float64x2_t delta_z_high_f64x2 = vsubq_f64(a_z_high_f64x2, b_z_high_f64x2);
410
+
411
+ sum_squared_x_low_f64x2 = vfmaq_f64(sum_squared_x_low_f64x2, delta_x_low_f64x2, delta_x_low_f64x2),
412
+ sum_squared_x_high_f64x2 = vfmaq_f64(sum_squared_x_high_f64x2, delta_x_high_f64x2, delta_x_high_f64x2);
413
+ sum_squared_y_low_f64x2 = vfmaq_f64(sum_squared_y_low_f64x2, delta_y_low_f64x2, delta_y_low_f64x2),
414
+ sum_squared_y_high_f64x2 = vfmaq_f64(sum_squared_y_high_f64x2, delta_y_high_f64x2, delta_y_high_f64x2);
415
+ sum_squared_z_low_f64x2 = vfmaq_f64(sum_squared_z_low_f64x2, delta_z_low_f64x2, delta_z_low_f64x2),
416
+ sum_squared_z_high_f64x2 = vfmaq_f64(sum_squared_z_high_f64x2, delta_z_high_f64x2, delta_z_high_f64x2);
678
417
  }
679
418
 
680
- nk_f64_t sum_a_x = vaddvq_f64(vaddq_f64(sum_a_x_lower_f64x2, sum_a_x_upper_f64x2));
681
- nk_f64_t sum_a_y = vaddvq_f64(vaddq_f64(sum_a_y_lower_f64x2, sum_a_y_upper_f64x2));
682
- nk_f64_t sum_a_z = vaddvq_f64(vaddq_f64(sum_a_z_lower_f64x2, sum_a_z_upper_f64x2));
683
- nk_f64_t sum_b_x = vaddvq_f64(vaddq_f64(sum_b_x_lower_f64x2, sum_b_x_upper_f64x2));
684
- nk_f64_t sum_b_y = vaddvq_f64(vaddq_f64(sum_b_y_lower_f64x2, sum_b_y_upper_f64x2));
685
- nk_f64_t sum_b_z = vaddvq_f64(vaddq_f64(sum_b_z_lower_f64x2, sum_b_z_upper_f64x2));
686
- nk_f64_t sum_squared_x = vaddvq_f64(vaddq_f64(sum_squared_x_lower_f64x2, sum_squared_x_upper_f64x2));
687
- nk_f64_t sum_squared_y = vaddvq_f64(vaddq_f64(sum_squared_y_lower_f64x2, sum_squared_y_upper_f64x2));
688
- nk_f64_t sum_squared_z = vaddvq_f64(vaddq_f64(sum_squared_z_lower_f64x2, sum_squared_z_upper_f64x2));
419
+ nk_f64_t sum_a_x = vaddvq_f64(vaddq_f64(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
420
+ nk_f64_t sum_a_y = vaddvq_f64(vaddq_f64(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
421
+ nk_f64_t sum_a_z = vaddvq_f64(vaddq_f64(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
422
+ nk_f64_t sum_b_x = vaddvq_f64(vaddq_f64(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
423
+ nk_f64_t sum_b_y = vaddvq_f64(vaddq_f64(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
424
+ nk_f64_t sum_b_z = vaddvq_f64(vaddq_f64(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
425
+ nk_f64_t sum_squared_x = vaddvq_f64(vaddq_f64(sum_squared_x_low_f64x2, sum_squared_x_high_f64x2));
426
+ nk_f64_t sum_squared_y = vaddvq_f64(vaddq_f64(sum_squared_y_low_f64x2, sum_squared_y_high_f64x2));
427
+ nk_f64_t sum_squared_z = vaddvq_f64(vaddq_f64(sum_squared_z_low_f64x2, sum_squared_z_high_f64x2));
689
428
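The switch from the `nk_promote_upper_f32x4_to_f64x2_neon_` helper to `vcvt_high_f64_f32`, together with the `lower`/`upper` to `low`/`high` rename, reflects the pattern used throughout this file: each `float32x4_t` is widened into two `float64x2_t` halves, accumulated separately, and only collapsed to a scalar at the end with `vaddvq_f64`. A standalone AArch64 sketch of that pattern:

```c
#include <arm_neon.h>

/* Sum the four f32 lanes of a vector in f64 precision: widen to two f64x2
 * halves, add them, then reduce horizontally -- mirroring the paired
 * *_low_f64x2 / *_high_f64x2 accumulators used above. */
static double sum4_f32_in_f64(float32x4_t v_f32x4) {
    float64x2_t low_f64x2 = vcvt_f64_f32(vget_low_f32(v_f32x4)); /* lanes 0..1 */
    float64x2_t high_f64x2 = vcvt_high_f64_f32(v_f32x4);         /* lanes 2..3 */
    return vaddvq_f64(vaddq_f64(low_f64x2, high_f64x2));
}
```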
 
690
429
  for (; index < n; ++index) {
691
430
  nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
@@ -715,11 +454,9 @@ NK_PUBLIC void nk_rmsd_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t
715
454
  NK_PUBLIC void nk_rmsd_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
716
455
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
717
456
  // RMSD uses identity rotation and scale=1.0.
718
- if (rotation) {
719
- rotation[0] = 1, rotation[1] = 0, rotation[2] = 0;
720
- rotation[3] = 0, rotation[4] = 1, rotation[5] = 0;
457
+ if (rotation)
458
+ rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
721
459
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
722
- }
723
460
  if (scale) *scale = 1.0;
724
461
 
725
462
  float64x2_t const zeros_f64x2 = vdupq_n_f64(0);
@@ -804,11 +541,115 @@ NK_PUBLIC void nk_rmsd_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_t
804
541
 
805
542
  NK_PUBLIC void nk_kabsch_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
806
543
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
807
- nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z, h[9];
808
- nk_bicentroid_f32_neon_(a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y,
809
- &centroid_b_z);
810
- nk_cross_covariance_f32_neon_(a, b, n, centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y,
811
- centroid_b_z, h);
544
+ float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
545
+
546
+ // Centroid accumulators (f64, low/high halves of each f32x4)
547
+ float64x2_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
548
+ float64x2_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
549
+ float64x2_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
550
+ float64x2_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
551
+ float64x2_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
552
+ float64x2_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
553
+
554
+ // Covariance accumulators (f64, low/high halves)
555
+ float64x2_t cov_xx_low_f64x2 = zero_f64x2, cov_xx_high_f64x2 = zero_f64x2;
556
+ float64x2_t cov_xy_low_f64x2 = zero_f64x2, cov_xy_high_f64x2 = zero_f64x2;
557
+ float64x2_t cov_xz_low_f64x2 = zero_f64x2, cov_xz_high_f64x2 = zero_f64x2;
558
+ float64x2_t cov_yx_low_f64x2 = zero_f64x2, cov_yx_high_f64x2 = zero_f64x2;
559
+ float64x2_t cov_yy_low_f64x2 = zero_f64x2, cov_yy_high_f64x2 = zero_f64x2;
560
+ float64x2_t cov_yz_low_f64x2 = zero_f64x2, cov_yz_high_f64x2 = zero_f64x2;
561
+ float64x2_t cov_zx_low_f64x2 = zero_f64x2, cov_zx_high_f64x2 = zero_f64x2;
562
+ float64x2_t cov_zy_low_f64x2 = zero_f64x2, cov_zy_high_f64x2 = zero_f64x2;
563
+ float64x2_t cov_zz_low_f64x2 = zero_f64x2, cov_zz_high_f64x2 = zero_f64x2;
564
+
565
+ nk_size_t index = 0;
566
+ for (; index + 4 <= n; index += 4) {
567
+ float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
568
+ nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
569
+ nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
570
+
571
+ float64x2_t a_x_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_x_f32x4));
572
+ float64x2_t a_x_high_f64x2 = vcvt_high_f64_f32(a_x_f32x4);
573
+ float64x2_t a_y_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_y_f32x4));
574
+ float64x2_t a_y_high_f64x2 = vcvt_high_f64_f32(a_y_f32x4);
575
+ float64x2_t a_z_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_z_f32x4));
576
+ float64x2_t a_z_high_f64x2 = vcvt_high_f64_f32(a_z_f32x4);
577
+ float64x2_t b_x_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_x_f32x4));
578
+ float64x2_t b_x_high_f64x2 = vcvt_high_f64_f32(b_x_f32x4);
579
+ float64x2_t b_y_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_y_f32x4));
580
+ float64x2_t b_y_high_f64x2 = vcvt_high_f64_f32(b_y_f32x4);
581
+ float64x2_t b_z_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_z_f32x4));
582
+ float64x2_t b_z_high_f64x2 = vcvt_high_f64_f32(b_z_f32x4);
583
+
584
+ // Accumulate centroids
585
+ sum_a_x_low_f64x2 = vaddq_f64(sum_a_x_low_f64x2, a_x_low_f64x2),
586
+ sum_a_x_high_f64x2 = vaddq_f64(sum_a_x_high_f64x2, a_x_high_f64x2);
587
+ sum_a_y_low_f64x2 = vaddq_f64(sum_a_y_low_f64x2, a_y_low_f64x2),
588
+ sum_a_y_high_f64x2 = vaddq_f64(sum_a_y_high_f64x2, a_y_high_f64x2);
589
+ sum_a_z_low_f64x2 = vaddq_f64(sum_a_z_low_f64x2, a_z_low_f64x2),
590
+ sum_a_z_high_f64x2 = vaddq_f64(sum_a_z_high_f64x2, a_z_high_f64x2);
591
+ sum_b_x_low_f64x2 = vaddq_f64(sum_b_x_low_f64x2, b_x_low_f64x2),
592
+ sum_b_x_high_f64x2 = vaddq_f64(sum_b_x_high_f64x2, b_x_high_f64x2);
593
+ sum_b_y_low_f64x2 = vaddq_f64(sum_b_y_low_f64x2, b_y_low_f64x2),
594
+ sum_b_y_high_f64x2 = vaddq_f64(sum_b_y_high_f64x2, b_y_high_f64x2);
595
+ sum_b_z_low_f64x2 = vaddq_f64(sum_b_z_low_f64x2, b_z_low_f64x2),
596
+ sum_b_z_high_f64x2 = vaddq_f64(sum_b_z_high_f64x2, b_z_high_f64x2);
597
+
598
+ // Accumulate raw outer products (uncentered)
599
+ cov_xx_low_f64x2 = vfmaq_f64(cov_xx_low_f64x2, a_x_low_f64x2, b_x_low_f64x2),
600
+ cov_xx_high_f64x2 = vfmaq_f64(cov_xx_high_f64x2, a_x_high_f64x2, b_x_high_f64x2);
601
+ cov_xy_low_f64x2 = vfmaq_f64(cov_xy_low_f64x2, a_x_low_f64x2, b_y_low_f64x2),
602
+ cov_xy_high_f64x2 = vfmaq_f64(cov_xy_high_f64x2, a_x_high_f64x2, b_y_high_f64x2);
603
+ cov_xz_low_f64x2 = vfmaq_f64(cov_xz_low_f64x2, a_x_low_f64x2, b_z_low_f64x2),
604
+ cov_xz_high_f64x2 = vfmaq_f64(cov_xz_high_f64x2, a_x_high_f64x2, b_z_high_f64x2);
605
+ cov_yx_low_f64x2 = vfmaq_f64(cov_yx_low_f64x2, a_y_low_f64x2, b_x_low_f64x2),
606
+ cov_yx_high_f64x2 = vfmaq_f64(cov_yx_high_f64x2, a_y_high_f64x2, b_x_high_f64x2);
607
+ cov_yy_low_f64x2 = vfmaq_f64(cov_yy_low_f64x2, a_y_low_f64x2, b_y_low_f64x2),
608
+ cov_yy_high_f64x2 = vfmaq_f64(cov_yy_high_f64x2, a_y_high_f64x2, b_y_high_f64x2);
609
+ cov_yz_low_f64x2 = vfmaq_f64(cov_yz_low_f64x2, a_y_low_f64x2, b_z_low_f64x2),
610
+ cov_yz_high_f64x2 = vfmaq_f64(cov_yz_high_f64x2, a_y_high_f64x2, b_z_high_f64x2);
611
+ cov_zx_low_f64x2 = vfmaq_f64(cov_zx_low_f64x2, a_z_low_f64x2, b_x_low_f64x2),
612
+ cov_zx_high_f64x2 = vfmaq_f64(cov_zx_high_f64x2, a_z_high_f64x2, b_x_high_f64x2);
613
+ cov_zy_low_f64x2 = vfmaq_f64(cov_zy_low_f64x2, a_z_low_f64x2, b_y_low_f64x2),
614
+ cov_zy_high_f64x2 = vfmaq_f64(cov_zy_high_f64x2, a_z_high_f64x2, b_y_high_f64x2);
615
+ cov_zz_low_f64x2 = vfmaq_f64(cov_zz_low_f64x2, a_z_low_f64x2, b_z_low_f64x2),
616
+ cov_zz_high_f64x2 = vfmaq_f64(cov_zz_high_f64x2, a_z_high_f64x2, b_z_high_f64x2);
617
+ }
618
+
619
+ // Reduce centroid accumulators
620
+ nk_f64_t sum_a_x = vaddvq_f64(vaddq_f64(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
621
+ nk_f64_t sum_a_y = vaddvq_f64(vaddq_f64(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
622
+ nk_f64_t sum_a_z = vaddvq_f64(vaddq_f64(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
623
+ nk_f64_t sum_b_x = vaddvq_f64(vaddq_f64(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
624
+ nk_f64_t sum_b_y = vaddvq_f64(vaddq_f64(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
625
+ nk_f64_t sum_b_z = vaddvq_f64(vaddq_f64(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
626
+
627
+ // Reduce covariance accumulators
628
+ nk_f64_t covariance_x_x = vaddvq_f64(vaddq_f64(cov_xx_low_f64x2, cov_xx_high_f64x2));
629
+ nk_f64_t covariance_x_y = vaddvq_f64(vaddq_f64(cov_xy_low_f64x2, cov_xy_high_f64x2));
630
+ nk_f64_t covariance_x_z = vaddvq_f64(vaddq_f64(cov_xz_low_f64x2, cov_xz_high_f64x2));
631
+ nk_f64_t covariance_y_x = vaddvq_f64(vaddq_f64(cov_yx_low_f64x2, cov_yx_high_f64x2));
632
+ nk_f64_t covariance_y_y = vaddvq_f64(vaddq_f64(cov_yy_low_f64x2, cov_yy_high_f64x2));
633
+ nk_f64_t covariance_y_z = vaddvq_f64(vaddq_f64(cov_yz_low_f64x2, cov_yz_high_f64x2));
634
+ nk_f64_t covariance_z_x = vaddvq_f64(vaddq_f64(cov_zx_low_f64x2, cov_zx_high_f64x2));
635
+ nk_f64_t covariance_z_y = vaddvq_f64(vaddq_f64(cov_zy_low_f64x2, cov_zy_high_f64x2));
636
+ nk_f64_t covariance_z_z = vaddvq_f64(vaddq_f64(cov_zz_low_f64x2, cov_zz_high_f64x2));
637
+
638
+ // Scalar tail
639
+ for (; index < n; ++index) {
640
+ nk_f64_t ax = (nk_f64_t)a[index * 3 + 0], ay = (nk_f64_t)a[index * 3 + 1], az = (nk_f64_t)a[index * 3 + 2];
641
+ nk_f64_t bx = (nk_f64_t)b[index * 3 + 0], by = (nk_f64_t)b[index * 3 + 1], bz = (nk_f64_t)b[index * 3 + 2];
642
+ sum_a_x += ax, sum_a_y += ay, sum_a_z += az;
643
+ sum_b_x += bx, sum_b_y += by, sum_b_z += bz;
644
+ covariance_x_x += ax * bx, covariance_x_y += ax * by, covariance_x_z += ax * bz;
645
+ covariance_y_x += ay * bx, covariance_y_y += ay * by, covariance_y_z += ay * bz;
646
+ covariance_z_x += az * bx, covariance_z_y += az * by, covariance_z_z += az * bz;
647
+ }
648
+
649
+ // Compute centroids
650
+ nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
651
+ nk_f64_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
652
+ nk_f64_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
812
653
  if (a_centroid)
813
654
  a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
814
655
  a_centroid[2] = (nk_f32_t)centroid_a_z;
@@ -816,6 +657,18 @@ NK_PUBLIC void nk_kabsch_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_
816
657
  b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
817
658
  b_centroid[2] = (nk_f32_t)centroid_b_z;
818
659
 
660
+ // Apply centering correction: H_centered = sum(a * bᵀ) - n * centroid_a * centroid_bᵀ
661
+ nk_f64_t h[9];
662
+ h[0] = covariance_x_x - (nk_f64_t)n * centroid_a_x * centroid_b_x;
663
+ h[1] = covariance_x_y - (nk_f64_t)n * centroid_a_x * centroid_b_y;
664
+ h[2] = covariance_x_z - (nk_f64_t)n * centroid_a_x * centroid_b_z;
665
+ h[3] = covariance_y_x - (nk_f64_t)n * centroid_a_y * centroid_b_x;
666
+ h[4] = covariance_y_y - (nk_f64_t)n * centroid_a_y * centroid_b_y;
667
+ h[5] = covariance_y_z - (nk_f64_t)n * centroid_a_y * centroid_b_z;
668
+ h[6] = covariance_z_x - (nk_f64_t)n * centroid_a_z * centroid_b_x;
669
+ h[7] = covariance_z_y - (nk_f64_t)n * centroid_a_z * centroid_b_y;
670
+ h[8] = covariance_z_z - (nk_f64_t)n * centroid_a_z * centroid_b_z;
671
+
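The single-pass formulation works because of the identity sum_i (a_i - c_a)(b_i - c_b)^T = sum_i a_i b_i^T - n * c_a * c_b^T, which the `h[...]` corrections above apply entry by entry. A tiny scalar check of one entry (illustrative names, not library API):

```c
#include <assert.h>
#include <math.h>
#include <stddef.h>

/* Verify the centering identity for the (x, x) cross-covariance entry:
 * sum (ax_i - ca)(bx_i - cb) == sum ax_i * bx_i - n * ca * cb */
int main(void) {
    double ax[4] = {1.0, 2.0, 4.0, 7.0}, bx[4] = {0.5, -1.0, 3.0, 2.5};
    size_t n = 4;
    double sum_a = 0.0, sum_b = 0.0, raw = 0.0;
    for (size_t i = 0; i < n; ++i) sum_a += ax[i], sum_b += bx[i], raw += ax[i] * bx[i];
    double ca = sum_a / (double)n, cb = sum_b / (double)n;
    double centered = 0.0;
    for (size_t i = 0; i < n; ++i) centered += (ax[i] - ca) * (bx[i] - cb);
    assert(fabs(centered - (raw - (double)n * ca * cb)) < 1e-12);
    return 0;
}
```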
819
672
  nk_f64_t svd_u[9], svd_s[9], svd_v[9];
820
673
  nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
821
674
 
@@ -844,7 +697,7 @@ NK_PUBLIC void nk_kabsch_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_
844
697
  }
845
698
 
846
699
  if (rotation)
847
- for (int index = 0; index != 9; ++index) rotation[index] = (nk_f32_t)r[index];
700
+ for (int j = 0; j != 9; ++j) rotation[j] = (nk_f32_t)r[j];
848
701
  if (scale) *scale = 1.0f;
849
702
  *result = nk_f64_sqrt_neon(nk_transformed_ssd_f32_neon_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y, centroid_a_z,
850
703
  centroid_b_x, centroid_b_y, centroid_b_z) /
@@ -1007,15 +860,15 @@ NK_PUBLIC void nk_kabsch_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_
1007
860
  if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
1008
861
 
1009
862
  // Apply centering correction: H_centered = H - n * centroid_a * centroid_bᵀ
1010
- covariance_x_x -= n * centroid_a_x * centroid_b_x;
1011
- covariance_x_y -= n * centroid_a_x * centroid_b_y;
1012
- covariance_x_z -= n * centroid_a_x * centroid_b_z;
1013
- covariance_y_x -= n * centroid_a_y * centroid_b_x;
1014
- covariance_y_y -= n * centroid_a_y * centroid_b_y;
1015
- covariance_y_z -= n * centroid_a_y * centroid_b_z;
1016
- covariance_z_x -= n * centroid_a_z * centroid_b_x;
1017
- covariance_z_y -= n * centroid_a_z * centroid_b_y;
1018
- covariance_z_z -= n * centroid_a_z * centroid_b_z;
863
+ covariance_x_x -= (nk_f64_t)n * centroid_a_x * centroid_b_x;
864
+ covariance_x_y -= (nk_f64_t)n * centroid_a_x * centroid_b_y;
865
+ covariance_x_z -= (nk_f64_t)n * centroid_a_x * centroid_b_z;
866
+ covariance_y_x -= (nk_f64_t)n * centroid_a_y * centroid_b_x;
867
+ covariance_y_y -= (nk_f64_t)n * centroid_a_y * centroid_b_y;
868
+ covariance_y_z -= (nk_f64_t)n * centroid_a_y * centroid_b_z;
869
+ covariance_z_x -= (nk_f64_t)n * centroid_a_z * centroid_b_x;
870
+ covariance_z_y -= (nk_f64_t)n * centroid_a_z * centroid_b_y;
871
+ covariance_z_z -= (nk_f64_t)n * centroid_a_z * centroid_b_z;
1019
872
 
1020
873
  // Compute SVD and optimal rotation
1021
874
  nk_f64_t cross_covariance[9] = {covariance_x_x, covariance_x_y, covariance_x_z, covariance_y_x, covariance_y_y,
@@ -1028,9 +881,7 @@ NK_PUBLIC void nk_kabsch_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_
1028
881
 
1029
882
  // Handle reflection: if det(R) < 0, negate third column of V and recompute R
1030
883
  if (nk_det3x3_f64_(r) < 0) {
1031
- svd_v[2] = -svd_v[2];
1032
- svd_v[5] = -svd_v[5];
1033
- svd_v[8] = -svd_v[8];
884
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
1034
885
  nk_rotation_from_svd_f64_neon_(svd_u, svd_v, r);
1035
886
  }
1036
887
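When `det(R) < 0` the SVD-derived matrix is a reflection rather than a proper rotation, so the third column of `V` (the one tied to the smallest singular value when they are sorted in descending order) is negated and `R` is rebuilt, exactly as in the branch above. A scalar sketch of that step, with hypothetical `det3x3` / `rotation_from_svd` helpers standing in for `nk_det3x3_f64_` and `nk_rotation_from_svd_f64_neon_`, and assuming the common `R = V * U^T` convention:

```c
/* Hypothetical helpers illustrating the reflection fix; row-major 3x3 matrices. */
static double det3x3(double const m[9]) {
    return m[0] * (m[4] * m[8] - m[5] * m[7]) - m[1] * (m[3] * m[8] - m[5] * m[6]) +
           m[2] * (m[3] * m[7] - m[4] * m[6]);
}

static void rotation_from_svd(double const u[9], double const v[9], double r[9]) {
    for (int row = 0; row < 3; ++row)
        for (int col = 0; col < 3; ++col) /* r = V * U^T */
            r[row * 3 + col] = v[row * 3 + 0] * u[col * 3 + 0] + v[row * 3 + 1] * u[col * 3 + 1] +
                               v[row * 3 + 2] * u[col * 3 + 2];
}

static void fix_reflection(double const u[9], double v[9], double r[9]) {
    if (det3x3(r) < 0) {
        v[2] = -v[2], v[5] = -v[5], v[8] = -v[8]; /* negate the third column of V */
        rotation_from_svd(u, v, r);
    }
}
```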
 
@@ -1048,11 +899,128 @@ NK_PUBLIC void nk_kabsch_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size_
1048
899
 
1049
900
  NK_PUBLIC void nk_umeyama_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
1050
901
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
1051
- nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z, h[9], variance_a;
1052
- nk_bicentroid_f32_neon_(a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y,
1053
- &centroid_b_z);
1054
- nk_cross_covariance_and_variance_f32_neon_(a, b, n, centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x,
1055
- centroid_b_y, centroid_b_z, h, &variance_a);
902
+ float64x2_t zero_f64x2 = vdupq_n_f64(0.0);
903
+
904
+ // Centroid accumulators (f64, low/high halves of each f32x4)
905
+ float64x2_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
906
+ float64x2_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
907
+ float64x2_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
908
+ float64x2_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
909
+ float64x2_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
910
+ float64x2_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
911
+
912
+ // Covariance accumulators (f64, low/high halves)
913
+ float64x2_t cov_xx_low_f64x2 = zero_f64x2, cov_xx_high_f64x2 = zero_f64x2;
914
+ float64x2_t cov_xy_low_f64x2 = zero_f64x2, cov_xy_high_f64x2 = zero_f64x2;
915
+ float64x2_t cov_xz_low_f64x2 = zero_f64x2, cov_xz_high_f64x2 = zero_f64x2;
916
+ float64x2_t cov_yx_low_f64x2 = zero_f64x2, cov_yx_high_f64x2 = zero_f64x2;
917
+ float64x2_t cov_yy_low_f64x2 = zero_f64x2, cov_yy_high_f64x2 = zero_f64x2;
918
+ float64x2_t cov_yz_low_f64x2 = zero_f64x2, cov_yz_high_f64x2 = zero_f64x2;
919
+ float64x2_t cov_zx_low_f64x2 = zero_f64x2, cov_zx_high_f64x2 = zero_f64x2;
920
+ float64x2_t cov_zy_low_f64x2 = zero_f64x2, cov_zy_high_f64x2 = zero_f64x2;
921
+ float64x2_t cov_zz_low_f64x2 = zero_f64x2, cov_zz_high_f64x2 = zero_f64x2;
922
+
923
+ // Variance of A accumulator
924
+ float64x2_t variance_low_f64x2 = zero_f64x2, variance_high_f64x2 = zero_f64x2;
925
+
926
+ nk_size_t index = 0;
927
+ for (; index + 4 <= n; index += 4) {
928
+ float32x4_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
929
+ nk_deinterleave_f32x4_neon_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4),
930
+ nk_deinterleave_f32x4_neon_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
931
+
932
+ float64x2_t a_x_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_x_f32x4));
933
+ float64x2_t a_x_high_f64x2 = vcvt_high_f64_f32(a_x_f32x4);
934
+ float64x2_t a_y_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_y_f32x4));
935
+ float64x2_t a_y_high_f64x2 = vcvt_high_f64_f32(a_y_f32x4);
936
+ float64x2_t a_z_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_z_f32x4));
937
+ float64x2_t a_z_high_f64x2 = vcvt_high_f64_f32(a_z_f32x4);
938
+ float64x2_t b_x_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_x_f32x4));
939
+ float64x2_t b_x_high_f64x2 = vcvt_high_f64_f32(b_x_f32x4);
940
+ float64x2_t b_y_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_y_f32x4));
941
+ float64x2_t b_y_high_f64x2 = vcvt_high_f64_f32(b_y_f32x4);
942
+ float64x2_t b_z_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_z_f32x4));
943
+ float64x2_t b_z_high_f64x2 = vcvt_high_f64_f32(b_z_f32x4);
944
+
945
+ // Accumulate centroids
946
+ sum_a_x_low_f64x2 = vaddq_f64(sum_a_x_low_f64x2, a_x_low_f64x2),
947
+ sum_a_x_high_f64x2 = vaddq_f64(sum_a_x_high_f64x2, a_x_high_f64x2);
948
+ sum_a_y_low_f64x2 = vaddq_f64(sum_a_y_low_f64x2, a_y_low_f64x2),
949
+ sum_a_y_high_f64x2 = vaddq_f64(sum_a_y_high_f64x2, a_y_high_f64x2);
950
+ sum_a_z_low_f64x2 = vaddq_f64(sum_a_z_low_f64x2, a_z_low_f64x2),
951
+ sum_a_z_high_f64x2 = vaddq_f64(sum_a_z_high_f64x2, a_z_high_f64x2);
952
+ sum_b_x_low_f64x2 = vaddq_f64(sum_b_x_low_f64x2, b_x_low_f64x2),
953
+ sum_b_x_high_f64x2 = vaddq_f64(sum_b_x_high_f64x2, b_x_high_f64x2);
954
+ sum_b_y_low_f64x2 = vaddq_f64(sum_b_y_low_f64x2, b_y_low_f64x2),
955
+ sum_b_y_high_f64x2 = vaddq_f64(sum_b_y_high_f64x2, b_y_high_f64x2);
956
+ sum_b_z_low_f64x2 = vaddq_f64(sum_b_z_low_f64x2, b_z_low_f64x2),
957
+ sum_b_z_high_f64x2 = vaddq_f64(sum_b_z_high_f64x2, b_z_high_f64x2);
958
+
959
+ // Accumulate raw outer products (uncentered)
960
+ cov_xx_low_f64x2 = vfmaq_f64(cov_xx_low_f64x2, a_x_low_f64x2, b_x_low_f64x2),
961
+ cov_xx_high_f64x2 = vfmaq_f64(cov_xx_high_f64x2, a_x_high_f64x2, b_x_high_f64x2);
962
+ cov_xy_low_f64x2 = vfmaq_f64(cov_xy_low_f64x2, a_x_low_f64x2, b_y_low_f64x2),
963
+ cov_xy_high_f64x2 = vfmaq_f64(cov_xy_high_f64x2, a_x_high_f64x2, b_y_high_f64x2);
964
+ cov_xz_low_f64x2 = vfmaq_f64(cov_xz_low_f64x2, a_x_low_f64x2, b_z_low_f64x2),
965
+ cov_xz_high_f64x2 = vfmaq_f64(cov_xz_high_f64x2, a_x_high_f64x2, b_z_high_f64x2);
966
+ cov_yx_low_f64x2 = vfmaq_f64(cov_yx_low_f64x2, a_y_low_f64x2, b_x_low_f64x2),
967
+ cov_yx_high_f64x2 = vfmaq_f64(cov_yx_high_f64x2, a_y_high_f64x2, b_x_high_f64x2);
968
+ cov_yy_low_f64x2 = vfmaq_f64(cov_yy_low_f64x2, a_y_low_f64x2, b_y_low_f64x2),
969
+ cov_yy_high_f64x2 = vfmaq_f64(cov_yy_high_f64x2, a_y_high_f64x2, b_y_high_f64x2);
970
+ cov_yz_low_f64x2 = vfmaq_f64(cov_yz_low_f64x2, a_y_low_f64x2, b_z_low_f64x2),
971
+ cov_yz_high_f64x2 = vfmaq_f64(cov_yz_high_f64x2, a_y_high_f64x2, b_z_high_f64x2);
972
+ cov_zx_low_f64x2 = vfmaq_f64(cov_zx_low_f64x2, a_z_low_f64x2, b_x_low_f64x2),
973
+ cov_zx_high_f64x2 = vfmaq_f64(cov_zx_high_f64x2, a_z_high_f64x2, b_x_high_f64x2);
974
+ cov_zy_low_f64x2 = vfmaq_f64(cov_zy_low_f64x2, a_z_low_f64x2, b_y_low_f64x2),
975
+ cov_zy_high_f64x2 = vfmaq_f64(cov_zy_high_f64x2, a_z_high_f64x2, b_y_high_f64x2);
976
+ cov_zz_low_f64x2 = vfmaq_f64(cov_zz_low_f64x2, a_z_low_f64x2, b_z_low_f64x2),
977
+ cov_zz_high_f64x2 = vfmaq_f64(cov_zz_high_f64x2, a_z_high_f64x2, b_z_high_f64x2);
978
+
979
+ // Accumulate variance of A (sum of squared coordinates)
980
+ variance_low_f64x2 = vfmaq_f64(variance_low_f64x2, a_x_low_f64x2, a_x_low_f64x2),
981
+ variance_high_f64x2 = vfmaq_f64(variance_high_f64x2, a_x_high_f64x2, a_x_high_f64x2);
982
+ variance_low_f64x2 = vfmaq_f64(variance_low_f64x2, a_y_low_f64x2, a_y_low_f64x2),
983
+ variance_high_f64x2 = vfmaq_f64(variance_high_f64x2, a_y_high_f64x2, a_y_high_f64x2);
984
+ variance_low_f64x2 = vfmaq_f64(variance_low_f64x2, a_z_low_f64x2, a_z_low_f64x2),
985
+ variance_high_f64x2 = vfmaq_f64(variance_high_f64x2, a_z_high_f64x2, a_z_high_f64x2);
986
+ }
987
+
988
+ // Reduce centroid accumulators
989
+ nk_f64_t sum_a_x = vaddvq_f64(vaddq_f64(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
990
+ nk_f64_t sum_a_y = vaddvq_f64(vaddq_f64(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
991
+ nk_f64_t sum_a_z = vaddvq_f64(vaddq_f64(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
992
+ nk_f64_t sum_b_x = vaddvq_f64(vaddq_f64(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
993
+ nk_f64_t sum_b_y = vaddvq_f64(vaddq_f64(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
994
+ nk_f64_t sum_b_z = vaddvq_f64(vaddq_f64(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
995
+
996
+ // Reduce covariance accumulators
997
+ nk_f64_t covariance_x_x = vaddvq_f64(vaddq_f64(cov_xx_low_f64x2, cov_xx_high_f64x2));
998
+ nk_f64_t covariance_x_y = vaddvq_f64(vaddq_f64(cov_xy_low_f64x2, cov_xy_high_f64x2));
999
+ nk_f64_t covariance_x_z = vaddvq_f64(vaddq_f64(cov_xz_low_f64x2, cov_xz_high_f64x2));
1000
+ nk_f64_t covariance_y_x = vaddvq_f64(vaddq_f64(cov_yx_low_f64x2, cov_yx_high_f64x2));
1001
+ nk_f64_t covariance_y_y = vaddvq_f64(vaddq_f64(cov_yy_low_f64x2, cov_yy_high_f64x2));
1002
+ nk_f64_t covariance_y_z = vaddvq_f64(vaddq_f64(cov_yz_low_f64x2, cov_yz_high_f64x2));
1003
+ nk_f64_t covariance_z_x = vaddvq_f64(vaddq_f64(cov_zx_low_f64x2, cov_zx_high_f64x2));
1004
+ nk_f64_t covariance_z_y = vaddvq_f64(vaddq_f64(cov_zy_low_f64x2, cov_zy_high_f64x2));
1005
+ nk_f64_t covariance_z_z = vaddvq_f64(vaddq_f64(cov_zz_low_f64x2, cov_zz_high_f64x2));
1006
+ nk_f64_t sum_sq_a = vaddvq_f64(vaddq_f64(variance_low_f64x2, variance_high_f64x2));
1007
+
1008
+ // Scalar tail
1009
+ for (; index < n; ++index) {
1010
+ nk_f64_t ax = (nk_f64_t)a[index * 3 + 0], ay = (nk_f64_t)a[index * 3 + 1], az = (nk_f64_t)a[index * 3 + 2];
1011
+ nk_f64_t bx = (nk_f64_t)b[index * 3 + 0], by = (nk_f64_t)b[index * 3 + 1], bz = (nk_f64_t)b[index * 3 + 2];
1012
+ sum_a_x += ax, sum_a_y += ay, sum_a_z += az;
1013
+ sum_b_x += bx, sum_b_y += by, sum_b_z += bz;
1014
+ covariance_x_x += ax * bx, covariance_x_y += ax * by, covariance_x_z += ax * bz;
1015
+ covariance_y_x += ay * bx, covariance_y_y += ay * by, covariance_y_z += ay * bz;
1016
+ covariance_z_x += az * bx, covariance_z_y += az * by, covariance_z_z += az * bz;
1017
+ sum_sq_a += ax * ax + ay * ay + az * az;
1018
+ }
1019
+
1020
+ // Compute centroids
1021
+ nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
1022
+ nk_f64_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
1023
+ nk_f64_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
1056
1024
  if (a_centroid)
1057
1025
  a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
1058
1026
  a_centroid[2] = (nk_f32_t)centroid_a_z;
@@ -1060,6 +1028,22 @@ NK_PUBLIC void nk_umeyama_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size
1060
1028
  b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
1061
1029
  b_centroid[2] = (nk_f32_t)centroid_b_z;
1062
1030
 
1031
+ // Compute variance of A (centered): var = sum(|a_i|^2)/n - |centroid_a|^2
1032
+ nk_f64_t variance_a = sum_sq_a * inv_n -
1033
+ (centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y + centroid_a_z * centroid_a_z);
1034
+
1035
+ // Apply centering correction: H_centered = sum(a * bᵀ) - n * centroid_a * centroid_bᵀ
1036
+ nk_f64_t h[9];
1037
+ h[0] = covariance_x_x - (nk_f64_t)n * centroid_a_x * centroid_b_x;
1038
+ h[1] = covariance_x_y - (nk_f64_t)n * centroid_a_x * centroid_b_y;
1039
+ h[2] = covariance_x_z - (nk_f64_t)n * centroid_a_x * centroid_b_z;
1040
+ h[3] = covariance_y_x - (nk_f64_t)n * centroid_a_y * centroid_b_x;
1041
+ h[4] = covariance_y_y - (nk_f64_t)n * centroid_a_y * centroid_b_y;
1042
+ h[5] = covariance_y_z - (nk_f64_t)n * centroid_a_y * centroid_b_z;
1043
+ h[6] = covariance_z_x - (nk_f64_t)n * centroid_a_z * centroid_b_x;
1044
+ h[7] = covariance_z_y - (nk_f64_t)n * centroid_a_z * centroid_b_y;
1045
+ h[8] = covariance_z_z - (nk_f64_t)n * centroid_a_z * centroid_b_z;
1046
+
1063
1047
  nk_f64_t svd_u[9], svd_s[9], svd_v[9];
1064
1048
  nk_svd3x3_f64_(h, svd_u, svd_s, svd_v);
1065
1049
 
@@ -1090,7 +1074,7 @@ NK_PUBLIC void nk_umeyama_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_size
1090
1074
 
1091
1075
  nk_f64_t applied_scale = (svd_s[0] + svd_s[4] + sign_correction * svd_s[8]) / ((nk_f64_t)n * variance_a);
1092
1076
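`applied_scale` is the Umeyama similarity scale: the trace of D * S divided by n times the variance of `a`, where S holds the singular values of the (unnormalized) centered cross-covariance and D = diag(1, 1, sign_correction) absorbs the reflection fix. With the 3x3 diagonal stored row-major, that trace is just `svd_s[0] + svd_s[4] + sign_correction * svd_s[8]`, as in the line above. Expressed compactly (illustrative names):

```c
/* c = trace(D * S) / (n * variance_a), with D = diag(1, 1, sign_correction). */
static double umeyama_scale(double const s_diag[3], double sign_correction, double n, double variance_a) {
    double trace_d_s = s_diag[0] + s_diag[1] + sign_correction * s_diag[2];
    return trace_d_s / (n * variance_a);
}
```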
  if (rotation)
1093
- for (int index = 0; index != 9; ++index) rotation[index] = (nk_f32_t)r[index];
1077
+ for (int j = 0; j != 9; ++j) rotation[j] = (nk_f32_t)r[j];
1094
1078
  if (scale) *scale = (nk_f32_t)applied_scale;
1095
1079
  *result = nk_f64_sqrt_neon(nk_transformed_ssd_f32_neon_(a, b, n, r, applied_scale, centroid_a_x, centroid_a_y,
1096
1080
  centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z) /
@@ -1273,15 +1257,15 @@ NK_PUBLIC void nk_umeyama_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size
1273
1257
  nk_f64_t var_a = sum_sq_a * inv_n - centroid_sq;
1274
1258
 
1275
1259
  // Apply centering correction: H_centered = H - n * centroid_a * centroid_bᵀ
1276
- covariance_x_x -= n * centroid_a_x * centroid_b_x;
1277
- covariance_x_y -= n * centroid_a_x * centroid_b_y;
1278
- covariance_x_z -= n * centroid_a_x * centroid_b_z;
1279
- covariance_y_x -= n * centroid_a_y * centroid_b_x;
1280
- covariance_y_y -= n * centroid_a_y * centroid_b_y;
1281
- covariance_y_z -= n * centroid_a_y * centroid_b_z;
1282
- covariance_z_x -= n * centroid_a_z * centroid_b_x;
1283
- covariance_z_y -= n * centroid_a_z * centroid_b_y;
1284
- covariance_z_z -= n * centroid_a_z * centroid_b_z;
1260
+ covariance_x_x -= (nk_f64_t)n * centroid_a_x * centroid_b_x;
1261
+ covariance_x_y -= (nk_f64_t)n * centroid_a_x * centroid_b_y;
1262
+ covariance_x_z -= (nk_f64_t)n * centroid_a_x * centroid_b_z;
1263
+ covariance_y_x -= (nk_f64_t)n * centroid_a_y * centroid_b_x;
1264
+ covariance_y_y -= (nk_f64_t)n * centroid_a_y * centroid_b_y;
1265
+ covariance_y_z -= (nk_f64_t)n * centroid_a_y * centroid_b_z;
1266
+ covariance_z_x -= (nk_f64_t)n * centroid_a_z * centroid_b_x;
1267
+ covariance_z_y -= (nk_f64_t)n * centroid_a_z * centroid_b_y;
1268
+ covariance_z_z -= (nk_f64_t)n * centroid_a_z * centroid_b_z;
1285
1269
 
1286
1270
  // Compute SVD
1287
1271
  nk_f64_t cross_covariance[9] = {covariance_x_x, covariance_x_y, covariance_x_z, covariance_y_x, covariance_y_y,
@@ -1295,12 +1279,10 @@ NK_PUBLIC void nk_umeyama_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size
1295
1279
  // Handle reflection and compute scale
1296
1280
  nk_f64_t det = nk_det3x3_f64_(r);
1297
1281
  nk_f64_t trace_d_s = svd_s[0] + svd_s[4] + (det < 0 ? -svd_s[8] : svd_s[8]);
1298
- nk_f64_t computed_scale = trace_d_s / (n * var_a);
1282
+ nk_f64_t computed_scale = trace_d_s / ((nk_f64_t)n * var_a);
1299
1283
 
1300
1284
  if (det < 0) {
1301
- svd_v[2] = -svd_v[2];
1302
- svd_v[5] = -svd_v[5];
1303
- svd_v[8] = -svd_v[8];
1285
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
1304
1286
  nk_rotation_from_svd_f64_neon_(svd_u, svd_v, r);
1305
1287
  }
1306
1288
 
@@ -1314,6 +1296,605 @@ NK_PUBLIC void nk_umeyama_f64_neon(nk_f64_t const *a, nk_f64_t const *b, nk_size
1314
1296
  *result = nk_f64_sqrt_neon(sum_squared * inv_n);
1315
1297
  }
1316
1298
 
1299
+ NK_INTERNAL void nk_deinterleave_f16x8_to_f32x4x2_neon_(nk_f16_t const *ptr, //
1300
+ float32x4_t *x_low_out, float32x4_t *x_high_out, //
1301
+ float32x4_t *y_low_out, float32x4_t *y_high_out, //
1302
+ float32x4_t *z_low_out, float32x4_t *z_high_out) {
1303
+ // Deinterleave 24 f16 values (8 xyz triplets) into separate x, y, z vectors.
1304
+ // Uses NEON vld3q_u16 for efficient stride-3 deinterleaving, then converts to f32.
1305
+ // Avoids vld3q_f16 which is unavailable on MSVC for ARM.
1306
+ //
1307
+ // Input: 24 contiguous f16 values [x0,y0,z0, ..., x7,y7,z7]
1308
+ // Output: x_low[4]+x_high[4], y_low[4]+y_high[4], z_low[4]+z_high[4] vectors in f32
1309
+ uint16x8x3_t xyz_u16x8x3 = vld3q_u16((nk_u16_t const *)ptr);
1310
+ float16x8_t x_f16x8 = vreinterpretq_f16_u16(xyz_u16x8x3.val[0]);
1311
+ float16x8_t y_f16x8 = vreinterpretq_f16_u16(xyz_u16x8x3.val[1]);
1312
+ float16x8_t z_f16x8 = vreinterpretq_f16_u16(xyz_u16x8x3.val[2]);
1313
+ *x_low_out = vcvt_f32_f16(vget_low_f16(x_f16x8));
1314
+ *x_high_out = vcvt_high_f32_f16(x_f16x8);
1315
+ *y_low_out = vcvt_f32_f16(vget_low_f16(y_f16x8));
1316
+ *y_high_out = vcvt_high_f32_f16(y_f16x8);
1317
+ *z_low_out = vcvt_f32_f16(vget_low_f16(z_f16x8));
1318
+ *z_high_out = vcvt_high_f32_f16(z_f16x8);
1319
+ }
1320
+
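A minimal usage sketch of the helper above (assumes AArch64 NEON with FP16 storage and the packed `[x, y, z]` layout described in the comment; illustrative only): widen one block of 8 points and horizontally sum the x coordinates.

```c
/* Illustrative: sum the x coordinates of 8 packed f16 xyz points. */
static float sum_x_of_8_points(nk_f16_t const *points) {
    float32x4_t x_low_f32x4, x_high_f32x4, y_low_f32x4, y_high_f32x4, z_low_f32x4, z_high_f32x4;
    nk_deinterleave_f16x8_to_f32x4x2_neon_(points, &x_low_f32x4, &x_high_f32x4, &y_low_f32x4, &y_high_f32x4,
                                           &z_low_f32x4, &z_high_f32x4);
    return vaddvq_f32(vaddq_f32(x_low_f32x4, x_high_f32x4));
}
```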
1321
+ NK_INTERNAL void nk_partial_deinterleave_f16_to_f32x4x2_neon_(nk_f16_t const *ptr, nk_size_t n_points, //
1322
+ float32x4_t *x_low_out, float32x4_t *x_high_out, //
1323
+ float32x4_t *y_low_out, float32x4_t *y_high_out, //
1324
+ float32x4_t *z_low_out, float32x4_t *z_high_out) {
1325
+ nk_u16_t buf[24] = {0};
1326
+ nk_u16_t const *src = (nk_u16_t const *)ptr;
1327
+ for (nk_size_t k = 0; k < n_points * 3; ++k) buf[k] = src[k];
1328
+ nk_deinterleave_f16x8_to_f32x4x2_neon_((nk_f16_t const *)buf, x_low_out, x_high_out, y_low_out, y_high_out,
1329
+ z_low_out, z_high_out);
1330
+ }
1331
+
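The tail helper above copies the remaining `n_points * 3` half-floats into a zeroed 24-element staging buffer so the full-width deinterleave can be reused; lanes past `n_points` come back as 0.0f. That is harmless wherever zeros cancel (plain sums, `a - b` deltas), but callers that subtract centroids inside the vector loop need to discount the padded lanes; see the note after `nk_transformed_ssd_f16_neon_` below. The padding idea in isolation:

```c
#include <stddef.h>
#include <string.h>

/* Illustrative: stage up to 8 xyz points (raw 16-bit payloads) into a
 * zero-filled 24-element buffer ahead of a full-width vector load. */
static void pad_tail_u16(unsigned short const *src, size_t n_points, unsigned short buf[24]) {
    memset(buf, 0, 24 * sizeof(unsigned short));
    memcpy(buf, src, n_points * 3 * sizeof(unsigned short));
}
```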
1332
+ NK_INTERNAL nk_f32_t nk_transformed_ssd_f16_neon_(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t const *r,
1333
+ nk_f32_t scale, nk_f32_t centroid_a_x, nk_f32_t centroid_a_y,
1334
+ nk_f32_t centroid_a_z, nk_f32_t centroid_b_x, nk_f32_t centroid_b_y,
1335
+ nk_f32_t centroid_b_z) {
1336
+ // Compute the sum of squared differences after the rotation and scale are applied.
1337
+ // Used for the final RMSD computation once the optimal rotation (and scale) is known.
1338
+ float32x4_t const centroid_a_x_f32x4 = vdupq_n_f32(centroid_a_x);
1339
+ float32x4_t const centroid_a_y_f32x4 = vdupq_n_f32(centroid_a_y);
1340
+ float32x4_t const centroid_a_z_f32x4 = vdupq_n_f32(centroid_a_z);
1341
+ float32x4_t const centroid_b_x_f32x4 = vdupq_n_f32(centroid_b_x);
1342
+ float32x4_t const centroid_b_y_f32x4 = vdupq_n_f32(centroid_b_y);
1343
+ float32x4_t const centroid_b_z_f32x4 = vdupq_n_f32(centroid_b_z);
1344
+ float32x4_t const scale_f32x4 = vdupq_n_f32(scale);
1345
+
1346
+ // Load rotation matrix elements
1347
+ float32x4_t const r00_f32x4 = vdupq_n_f32(r[0]), r01_f32x4 = vdupq_n_f32(r[1]), r02_f32x4 = vdupq_n_f32(r[2]);
1348
+ float32x4_t const r10_f32x4 = vdupq_n_f32(r[3]), r11_f32x4 = vdupq_n_f32(r[4]), r12_f32x4 = vdupq_n_f32(r[5]);
1349
+ float32x4_t const r20_f32x4 = vdupq_n_f32(r[6]), r21_f32x4 = vdupq_n_f32(r[7]), r22_f32x4 = vdupq_n_f32(r[8]);
1350
+
1351
+ float32x4_t sum_squared_f32x4 = vdupq_n_f32(0);
1352
+ float32x4_t a_x_low_f32x4, a_x_high_f32x4, a_y_low_f32x4, a_y_high_f32x4, a_z_low_f32x4, a_z_high_f32x4;
1353
+ float32x4_t b_x_low_f32x4, b_x_high_f32x4, b_y_low_f32x4, b_y_high_f32x4, b_z_low_f32x4, b_z_high_f32x4;
1354
+
1355
+ nk_size_t j = 0;
1356
+ for (; j + 8 <= n; j += 8) {
1357
+ nk_deinterleave_f16x8_to_f32x4x2_neon_(a + j * 3, &a_x_low_f32x4, &a_x_high_f32x4, &a_y_low_f32x4,
1358
+ &a_y_high_f32x4, &a_z_low_f32x4, &a_z_high_f32x4);
1359
+ nk_deinterleave_f16x8_to_f32x4x2_neon_(b + j * 3, &b_x_low_f32x4, &b_x_high_f32x4, &b_y_low_f32x4,
1360
+ &b_y_high_f32x4, &b_z_low_f32x4, &b_z_high_f32x4);
1361
+
1362
+ // Center points → low half
1363
+ float32x4_t pa_x_f32x4 = vsubq_f32(a_x_low_f32x4, centroid_a_x_f32x4);
1364
+ float32x4_t pa_y_f32x4 = vsubq_f32(a_y_low_f32x4, centroid_a_y_f32x4);
1365
+ float32x4_t pa_z_f32x4 = vsubq_f32(a_z_low_f32x4, centroid_a_z_f32x4);
1366
+ float32x4_t pb_x_f32x4 = vsubq_f32(b_x_low_f32x4, centroid_b_x_f32x4);
1367
+ float32x4_t pb_y_f32x4 = vsubq_f32(b_y_low_f32x4, centroid_b_y_f32x4);
1368
+ float32x4_t pb_z_f32x4 = vsubq_f32(b_z_low_f32x4, centroid_b_z_f32x4);
1369
+ float32x4_t ra_x_f32x4 = vmulq_f32(
1370
+ scale_f32x4,
1371
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r00_f32x4, pa_x_f32x4), r01_f32x4, pa_y_f32x4), r02_f32x4, pa_z_f32x4));
1372
+ float32x4_t ra_y_f32x4 = vmulq_f32(
1373
+ scale_f32x4,
1374
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r10_f32x4, pa_x_f32x4), r11_f32x4, pa_y_f32x4), r12_f32x4, pa_z_f32x4));
1375
+ float32x4_t ra_z_f32x4 = vmulq_f32(
1376
+ scale_f32x4,
1377
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r20_f32x4, pa_x_f32x4), r21_f32x4, pa_y_f32x4), r22_f32x4, pa_z_f32x4));
1378
+ float32x4_t delta_x_f32x4 = vsubq_f32(ra_x_f32x4, pb_x_f32x4);
1379
+ float32x4_t delta_y_f32x4 = vsubq_f32(ra_y_f32x4, pb_y_f32x4);
1380
+ float32x4_t delta_z_f32x4 = vsubq_f32(ra_z_f32x4, pb_z_f32x4);
1381
+ sum_squared_f32x4 = vfmaq_f32(sum_squared_f32x4, delta_x_f32x4, delta_x_f32x4);
1382
+ sum_squared_f32x4 = vfmaq_f32(sum_squared_f32x4, delta_y_f32x4, delta_y_f32x4);
1383
+ sum_squared_f32x4 = vfmaq_f32(sum_squared_f32x4, delta_z_f32x4, delta_z_f32x4);
1384
+
1385
+ // Center points → high half
1386
+ pa_x_f32x4 = vsubq_f32(a_x_high_f32x4, centroid_a_x_f32x4);
1387
+ pa_y_f32x4 = vsubq_f32(a_y_high_f32x4, centroid_a_y_f32x4);
1388
+ pa_z_f32x4 = vsubq_f32(a_z_high_f32x4, centroid_a_z_f32x4);
1389
+ pb_x_f32x4 = vsubq_f32(b_x_high_f32x4, centroid_b_x_f32x4);
1390
+ pb_y_f32x4 = vsubq_f32(b_y_high_f32x4, centroid_b_y_f32x4);
1391
+ pb_z_f32x4 = vsubq_f32(b_z_high_f32x4, centroid_b_z_f32x4);
1392
+ ra_x_f32x4 = vmulq_f32(
1393
+ scale_f32x4,
1394
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r00_f32x4, pa_x_f32x4), r01_f32x4, pa_y_f32x4), r02_f32x4, pa_z_f32x4));
1395
+ ra_y_f32x4 = vmulq_f32(
1396
+ scale_f32x4,
1397
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r10_f32x4, pa_x_f32x4), r11_f32x4, pa_y_f32x4), r12_f32x4, pa_z_f32x4));
1398
+ ra_z_f32x4 = vmulq_f32(
1399
+ scale_f32x4,
1400
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r20_f32x4, pa_x_f32x4), r21_f32x4, pa_y_f32x4), r22_f32x4, pa_z_f32x4));
1401
+ delta_x_f32x4 = vsubq_f32(ra_x_f32x4, pb_x_f32x4);
1402
+ delta_y_f32x4 = vsubq_f32(ra_y_f32x4, pb_y_f32x4);
1403
+ delta_z_f32x4 = vsubq_f32(ra_z_f32x4, pb_z_f32x4);
1404
+ sum_squared_f32x4 = vfmaq_f32(sum_squared_f32x4, delta_x_f32x4, delta_x_f32x4);
1405
+ sum_squared_f32x4 = vfmaq_f32(sum_squared_f32x4, delta_y_f32x4, delta_y_f32x4);
1406
+ sum_squared_f32x4 = vfmaq_f32(sum_squared_f32x4, delta_z_f32x4, delta_z_f32x4);
1407
+ }
1408
+
1409
+ // Reduce to scalar
1410
+ nk_f32_t sum_squared = vaddvq_f32(sum_squared_f32x4);
1411
+
1412
+ if (j < n) {
1413
+ nk_partial_deinterleave_f16_to_f32x4x2_neon_(a + j * 3, n - j, &a_x_low_f32x4, &a_x_high_f32x4, &a_y_low_f32x4,
1414
+ &a_y_high_f32x4, &a_z_low_f32x4, &a_z_high_f32x4);
1415
+ nk_partial_deinterleave_f16_to_f32x4x2_neon_(b + j * 3, n - j, &b_x_low_f32x4, &b_x_high_f32x4, &b_y_low_f32x4,
1416
+ &b_y_high_f32x4, &b_z_low_f32x4, &b_z_high_f32x4);
1417
+
1418
+ // Low half
1419
+ float32x4_t pa_x_f32x4 = vsubq_f32(a_x_low_f32x4, centroid_a_x_f32x4);
1420
+ float32x4_t pa_y_f32x4 = vsubq_f32(a_y_low_f32x4, centroid_a_y_f32x4);
1421
+ float32x4_t pa_z_f32x4 = vsubq_f32(a_z_low_f32x4, centroid_a_z_f32x4);
1422
+ float32x4_t pb_x_f32x4 = vsubq_f32(b_x_low_f32x4, centroid_b_x_f32x4);
1423
+ float32x4_t pb_y_f32x4 = vsubq_f32(b_y_low_f32x4, centroid_b_y_f32x4);
1424
+ float32x4_t pb_z_f32x4 = vsubq_f32(b_z_low_f32x4, centroid_b_z_f32x4);
1425
+ float32x4_t ra_x_f32x4 = vmulq_f32(
1426
+ scale_f32x4,
1427
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r00_f32x4, pa_x_f32x4), r01_f32x4, pa_y_f32x4), r02_f32x4, pa_z_f32x4));
1428
+ float32x4_t ra_y_f32x4 = vmulq_f32(
1429
+ scale_f32x4,
1430
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r10_f32x4, pa_x_f32x4), r11_f32x4, pa_y_f32x4), r12_f32x4, pa_z_f32x4));
1431
+ float32x4_t ra_z_f32x4 = vmulq_f32(
1432
+ scale_f32x4,
1433
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r20_f32x4, pa_x_f32x4), r21_f32x4, pa_y_f32x4), r22_f32x4, pa_z_f32x4));
1434
+ float32x4_t delta_x_f32x4 = vsubq_f32(ra_x_f32x4, pb_x_f32x4);
1435
+ float32x4_t delta_y_f32x4 = vsubq_f32(ra_y_f32x4, pb_y_f32x4);
1436
+ float32x4_t delta_z_f32x4 = vsubq_f32(ra_z_f32x4, pb_z_f32x4);
1437
+ float32x4_t tail_sum_f32x4 = vmulq_f32(delta_x_f32x4, delta_x_f32x4);
1438
+ tail_sum_f32x4 = vfmaq_f32(tail_sum_f32x4, delta_y_f32x4, delta_y_f32x4);
1439
+ tail_sum_f32x4 = vfmaq_f32(tail_sum_f32x4, delta_z_f32x4, delta_z_f32x4);
1440
+
1441
+ // High half
1442
+ pa_x_f32x4 = vsubq_f32(a_x_high_f32x4, centroid_a_x_f32x4);
1443
+ pa_y_f32x4 = vsubq_f32(a_y_high_f32x4, centroid_a_y_f32x4);
1444
+ pa_z_f32x4 = vsubq_f32(a_z_high_f32x4, centroid_a_z_f32x4);
1445
+ pb_x_f32x4 = vsubq_f32(b_x_high_f32x4, centroid_b_x_f32x4);
1446
+ pb_y_f32x4 = vsubq_f32(b_y_high_f32x4, centroid_b_y_f32x4);
1447
+ pb_z_f32x4 = vsubq_f32(b_z_high_f32x4, centroid_b_z_f32x4);
1448
+ ra_x_f32x4 = vmulq_f32(
1449
+ scale_f32x4,
1450
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r00_f32x4, pa_x_f32x4), r01_f32x4, pa_y_f32x4), r02_f32x4, pa_z_f32x4));
1451
+ ra_y_f32x4 = vmulq_f32(
1452
+ scale_f32x4,
1453
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r10_f32x4, pa_x_f32x4), r11_f32x4, pa_y_f32x4), r12_f32x4, pa_z_f32x4));
1454
+ ra_z_f32x4 = vmulq_f32(
1455
+ scale_f32x4,
1456
+ vfmaq_f32(vfmaq_f32(vmulq_f32(r20_f32x4, pa_x_f32x4), r21_f32x4, pa_y_f32x4), r22_f32x4, pa_z_f32x4));
1457
+ delta_x_f32x4 = vsubq_f32(ra_x_f32x4, pb_x_f32x4);
1458
+ delta_y_f32x4 = vsubq_f32(ra_y_f32x4, pb_y_f32x4);
1459
+ delta_z_f32x4 = vsubq_f32(ra_z_f32x4, pb_z_f32x4);
1460
+ tail_sum_f32x4 = vfmaq_f32(tail_sum_f32x4, delta_x_f32x4, delta_x_f32x4);
1461
+ tail_sum_f32x4 = vfmaq_f32(tail_sum_f32x4, delta_y_f32x4, delta_y_f32x4);
1462
+ tail_sum_f32x4 = vfmaq_f32(tail_sum_f32x4, delta_z_f32x4, delta_z_f32x4);
1463
+ sum_squared += vaddvq_f32(tail_sum_f32x4);
1464
+ }
1465
+
1466
+ return sum_squared;
1467
+ }
1468
+
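One subtlety of the tail path above: the staging buffer is zero-padded, and the padded lanes are then centered, rotated, and squared like real points, so when `n` is not a multiple of 8 each padded lane appears to contribute ||scale * R * (-centroid_a) + centroid_b||^2 to `sum_squared`. If that is not intended, the padded lanes can be zeroed before squaring with a lane mask; a hedged sketch (illustrative names, portable initialization):

```c
#include <arm_neon.h>
#include <stdint.h>

/* Zero the lanes of a 4-lane delta vector whose point index (lane_base + lane)
 * is beyond the number of remaining real points, so zero-padded tail points
 * stop contributing to the squared sum. */
static float32x4_t mask_tail_lanes(float32x4_t delta_f32x4, uint32_t lane_base, uint32_t remaining) {
    uint32_t const lane_indices[4] = {0, 1, 2, 3};
    uint32x4_t lanes_u32x4 = vaddq_u32(vld1q_u32(lane_indices), vdupq_n_u32(lane_base));
    uint32x4_t keep_u32x4 = vcltq_u32(lanes_u32x4, vdupq_n_u32(remaining)); /* all-ones where valid */
    return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(delta_f32x4), keep_u32x4));
}
```

Applied to the tail `delta_*` vectors with `lane_base` 0 for the low half and 4 for the high half, only the real points are kept.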
1469
+ /**
1470
+ * @brief RMSD (Root Mean Square Deviation) computation using NEON FP16 with widening to FP32.
1471
+ * Computes the RMS of distances between corresponding points after centroid alignment.
1472
+ */
1473
+ NK_PUBLIC void nk_rmsd_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
+ // RMSD uses identity rotation and scale=1.0
+ if (rotation)
+ rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
+ rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
+ if (scale) *scale = 1.0f;
+
+ float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
+
+ // Accumulators for centroids and squared differences (all in f32)
+ float32x4_t sum_a_x_f32x4 = zeros_f32x4, sum_a_y_f32x4 = zeros_f32x4, sum_a_z_f32x4 = zeros_f32x4;
+ float32x4_t sum_b_x_f32x4 = zeros_f32x4, sum_b_y_f32x4 = zeros_f32x4, sum_b_z_f32x4 = zeros_f32x4;
+ float32x4_t sum_squared_x_f32x4 = zeros_f32x4, sum_squared_y_f32x4 = zeros_f32x4, sum_squared_z_f32x4 = zeros_f32x4;
+
+ float32x4_t a_x_low_f32x4, a_x_high_f32x4, a_y_low_f32x4, a_y_high_f32x4, a_z_low_f32x4, a_z_high_f32x4;
+ float32x4_t b_x_low_f32x4, b_x_high_f32x4, b_y_low_f32x4, b_y_high_f32x4, b_z_low_f32x4, b_z_high_f32x4;
+ nk_size_t i = 0;
+
+ for (; i + 8 <= n; i += 8) {
+ nk_deinterleave_f16x8_to_f32x4x2_neon_(a + i * 3, &a_x_low_f32x4, &a_x_high_f32x4, &a_y_low_f32x4,
+ &a_y_high_f32x4, &a_z_low_f32x4, &a_z_high_f32x4);
+ nk_deinterleave_f16x8_to_f32x4x2_neon_(b + i * 3, &b_x_low_f32x4, &b_x_high_f32x4, &b_y_low_f32x4,
+ &b_y_high_f32x4, &b_z_low_f32x4, &b_z_high_f32x4);
+
+ sum_a_x_f32x4 = vaddq_f32(vaddq_f32(sum_a_x_f32x4, a_x_low_f32x4), a_x_high_f32x4);
+ sum_a_y_f32x4 = vaddq_f32(vaddq_f32(sum_a_y_f32x4, a_y_low_f32x4), a_y_high_f32x4);
+ sum_a_z_f32x4 = vaddq_f32(vaddq_f32(sum_a_z_f32x4, a_z_low_f32x4), a_z_high_f32x4);
+ sum_b_x_f32x4 = vaddq_f32(vaddq_f32(sum_b_x_f32x4, b_x_low_f32x4), b_x_high_f32x4);
+ sum_b_y_f32x4 = vaddq_f32(vaddq_f32(sum_b_y_f32x4, b_y_low_f32x4), b_y_high_f32x4);
+ sum_b_z_f32x4 = vaddq_f32(vaddq_f32(sum_b_z_f32x4, b_z_low_f32x4), b_z_high_f32x4);
+
+ float32x4_t delta_x_f32x4 = vsubq_f32(a_x_low_f32x4, b_x_low_f32x4);
+ float32x4_t delta_y_f32x4 = vsubq_f32(a_y_low_f32x4, b_y_low_f32x4);
+ float32x4_t delta_z_f32x4 = vsubq_f32(a_z_low_f32x4, b_z_low_f32x4);
+ sum_squared_x_f32x4 = vfmaq_f32(sum_squared_x_f32x4, delta_x_f32x4, delta_x_f32x4);
+ sum_squared_y_f32x4 = vfmaq_f32(sum_squared_y_f32x4, delta_y_f32x4, delta_y_f32x4);
+ sum_squared_z_f32x4 = vfmaq_f32(sum_squared_z_f32x4, delta_z_f32x4, delta_z_f32x4);
+
+ delta_x_f32x4 = vsubq_f32(a_x_high_f32x4, b_x_high_f32x4);
+ delta_y_f32x4 = vsubq_f32(a_y_high_f32x4, b_y_high_f32x4);
+ delta_z_f32x4 = vsubq_f32(a_z_high_f32x4, b_z_high_f32x4);
+ sum_squared_x_f32x4 = vfmaq_f32(sum_squared_x_f32x4, delta_x_f32x4, delta_x_f32x4);
+ sum_squared_y_f32x4 = vfmaq_f32(sum_squared_y_f32x4, delta_y_f32x4, delta_y_f32x4);
+ sum_squared_z_f32x4 = vfmaq_f32(sum_squared_z_f32x4, delta_z_f32x4, delta_z_f32x4);
+ }
+
+ if (i < n) {
+ nk_partial_deinterleave_f16_to_f32x4x2_neon_(a + i * 3, n - i, &a_x_low_f32x4, &a_x_high_f32x4, &a_y_low_f32x4,
+ &a_y_high_f32x4, &a_z_low_f32x4, &a_z_high_f32x4);
+ nk_partial_deinterleave_f16_to_f32x4x2_neon_(b + i * 3, n - i, &b_x_low_f32x4, &b_x_high_f32x4, &b_y_low_f32x4,
+ &b_y_high_f32x4, &b_z_low_f32x4, &b_z_high_f32x4);
+
+ sum_a_x_f32x4 = vaddq_f32(vaddq_f32(sum_a_x_f32x4, a_x_low_f32x4), a_x_high_f32x4);
+ sum_a_y_f32x4 = vaddq_f32(vaddq_f32(sum_a_y_f32x4, a_y_low_f32x4), a_y_high_f32x4);
+ sum_a_z_f32x4 = vaddq_f32(vaddq_f32(sum_a_z_f32x4, a_z_low_f32x4), a_z_high_f32x4);
+ sum_b_x_f32x4 = vaddq_f32(vaddq_f32(sum_b_x_f32x4, b_x_low_f32x4), b_x_high_f32x4);
+ sum_b_y_f32x4 = vaddq_f32(vaddq_f32(sum_b_y_f32x4, b_y_low_f32x4), b_y_high_f32x4);
+ sum_b_z_f32x4 = vaddq_f32(vaddq_f32(sum_b_z_f32x4, b_z_low_f32x4), b_z_high_f32x4);
+
+ float32x4_t delta_x_f32x4 = vsubq_f32(a_x_low_f32x4, b_x_low_f32x4);
+ float32x4_t delta_y_f32x4 = vsubq_f32(a_y_low_f32x4, b_y_low_f32x4);
+ float32x4_t delta_z_f32x4 = vsubq_f32(a_z_low_f32x4, b_z_low_f32x4);
+ sum_squared_x_f32x4 = vfmaq_f32(sum_squared_x_f32x4, delta_x_f32x4, delta_x_f32x4);
+ sum_squared_y_f32x4 = vfmaq_f32(sum_squared_y_f32x4, delta_y_f32x4, delta_y_f32x4);
+ sum_squared_z_f32x4 = vfmaq_f32(sum_squared_z_f32x4, delta_z_f32x4, delta_z_f32x4);
+
+ delta_x_f32x4 = vsubq_f32(a_x_high_f32x4, b_x_high_f32x4);
+ delta_y_f32x4 = vsubq_f32(a_y_high_f32x4, b_y_high_f32x4);
+ delta_z_f32x4 = vsubq_f32(a_z_high_f32x4, b_z_high_f32x4);
+ sum_squared_x_f32x4 = vfmaq_f32(sum_squared_x_f32x4, delta_x_f32x4, delta_x_f32x4);
+ sum_squared_y_f32x4 = vfmaq_f32(sum_squared_y_f32x4, delta_y_f32x4, delta_y_f32x4);
+ sum_squared_z_f32x4 = vfmaq_f32(sum_squared_z_f32x4, delta_z_f32x4, delta_z_f32x4);
+ }
+
+ // Reduce vectors to scalars
+ nk_f32_t total_ax = vaddvq_f32(sum_a_x_f32x4);
+ nk_f32_t total_ay = vaddvq_f32(sum_a_y_f32x4);
+ nk_f32_t total_az = vaddvq_f32(sum_a_z_f32x4);
+ nk_f32_t total_bx = vaddvq_f32(sum_b_x_f32x4);
+ nk_f32_t total_by = vaddvq_f32(sum_b_y_f32x4);
+ nk_f32_t total_bz = vaddvq_f32(sum_b_z_f32x4);
+ nk_f32_t total_sq_x = vaddvq_f32(sum_squared_x_f32x4);
+ nk_f32_t total_sq_y = vaddvq_f32(sum_squared_y_f32x4);
+ nk_f32_t total_sq_z = vaddvq_f32(sum_squared_z_f32x4);
+
+ // Compute centroids
+ nk_f32_t inv_n = 1.0f / (nk_f32_t)n;
+ nk_f32_t centroid_a_x = total_ax * inv_n;
+ nk_f32_t centroid_a_y = total_ay * inv_n;
+ nk_f32_t centroid_a_z = total_az * inv_n;
+ nk_f32_t centroid_b_x = total_bx * inv_n;
+ nk_f32_t centroid_b_y = total_by * inv_n;
+ nk_f32_t centroid_b_z = total_bz * inv_n;
+
+ if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
+ if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
+
+ // Compute RMSD
+ nk_f32_t mean_diff_x = centroid_a_x - centroid_b_x;
+ nk_f32_t mean_diff_y = centroid_a_y - centroid_b_y;
+ nk_f32_t mean_diff_z = centroid_a_z - centroid_b_z;
+ nk_f32_t sum_squared = total_sq_x + total_sq_y + total_sq_z;
+ nk_f32_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
+
+ *result = nk_f32_sqrt_neon(sum_squared * inv_n - mean_diff_sq);
+ }
+
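The tail of nk_rmsd_f16_neon relies on the identity (1/n)·Σ‖(aᵢ−μ_a)−(bᵢ−μ_b)‖² = (1/n)·Σ‖aᵢ−bᵢ‖² − ‖μ_a−μ_b‖², so one pass over raw sums and raw squared differences already yields the translation-only RMSD. A minimal scalar sketch of the same quantity, assuming plain f32 input and the illustrative name rmsd3_reference (not part of numkong):

    #include <math.h>
    #include <stddef.h>

    /* Translation-only RMSD of interleaved xyz points: accumulate raw squared
       differences and both centroid sums in one pass, then subtract the squared
       centroid offset -- algebraically the same formula the NEON kernel uses. */
    static float rmsd3_reference(float const *a, float const *b, size_t n) {
        double sum_squared = 0.0, mean_a[3] = {0.0, 0.0, 0.0}, mean_b[3] = {0.0, 0.0, 0.0};
        for (size_t i = 0; i < n; ++i)
            for (int d = 0; d < 3; ++d) {
                double diff = (double)a[i * 3 + d] - (double)b[i * 3 + d];
                sum_squared += diff * diff;
                mean_a[d] += a[i * 3 + d], mean_b[d] += b[i * 3 + d];
            }
        double mean_diff_squared = 0.0;
        for (int d = 0; d < 3; ++d) {
            double mean_diff = (mean_a[d] - mean_b[d]) / (double)n;
            mean_diff_squared += mean_diff * mean_diff;
        }
        return (float)sqrt(sum_squared / (double)n - mean_diff_squared);
    }

Accumulating in double here sidesteps the f32 rounding that the NEON kernel accepts for speed, so agreement within a few ULPs is a reasonable cross-check.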
+ /**
+ * @brief Kabsch algorithm for optimal rigid body superposition using NEON FP16 with widening to FP32.
+ * Finds the rotation matrix R that minimizes RMSD between two point sets.
+ */
+ NK_PUBLIC void nk_kabsch_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
+ // Fused single-pass: load f16, convert to f32, compute centroids and covariance
+ float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
+
+ // Accumulators for centroids (f32)
+ float32x4_t sum_a_x_f32x4 = zeros_f32x4, sum_a_y_f32x4 = zeros_f32x4, sum_a_z_f32x4 = zeros_f32x4;
+ float32x4_t sum_b_x_f32x4 = zeros_f32x4, sum_b_y_f32x4 = zeros_f32x4, sum_b_z_f32x4 = zeros_f32x4;
+
+ // Accumulators for covariance matrix (sum of outer products)
+ float32x4_t cov_xx_f32x4 = zeros_f32x4, cov_xy_f32x4 = zeros_f32x4, cov_xz_f32x4 = zeros_f32x4;
+ float32x4_t cov_yx_f32x4 = zeros_f32x4, cov_yy_f32x4 = zeros_f32x4, cov_yz_f32x4 = zeros_f32x4;
+ float32x4_t cov_zx_f32x4 = zeros_f32x4, cov_zy_f32x4 = zeros_f32x4, cov_zz_f32x4 = zeros_f32x4;
+
+ nk_size_t i = 0;
+ float32x4_t a_x_low_f32x4, a_x_high_f32x4, a_y_low_f32x4, a_y_high_f32x4, a_z_low_f32x4, a_z_high_f32x4;
+ float32x4_t b_x_low_f32x4, b_x_high_f32x4, b_y_low_f32x4, b_y_high_f32x4, b_z_low_f32x4, b_z_high_f32x4;
+
+ for (; i + 8 <= n; i += 8) {
+ nk_deinterleave_f16x8_to_f32x4x2_neon_(a + i * 3, &a_x_low_f32x4, &a_x_high_f32x4, &a_y_low_f32x4,
+ &a_y_high_f32x4, &a_z_low_f32x4, &a_z_high_f32x4);
+ nk_deinterleave_f16x8_to_f32x4x2_neon_(b + i * 3, &b_x_low_f32x4, &b_x_high_f32x4, &b_y_low_f32x4,
+ &b_y_high_f32x4, &b_z_low_f32x4, &b_z_high_f32x4);
+
+ sum_a_x_f32x4 = vaddq_f32(vaddq_f32(sum_a_x_f32x4, a_x_low_f32x4), a_x_high_f32x4);
+ sum_a_y_f32x4 = vaddq_f32(vaddq_f32(sum_a_y_f32x4, a_y_low_f32x4), a_y_high_f32x4);
+ sum_a_z_f32x4 = vaddq_f32(vaddq_f32(sum_a_z_f32x4, a_z_low_f32x4), a_z_high_f32x4);
+ sum_b_x_f32x4 = vaddq_f32(vaddq_f32(sum_b_x_f32x4, b_x_low_f32x4), b_x_high_f32x4);
+ sum_b_y_f32x4 = vaddq_f32(vaddq_f32(sum_b_y_f32x4, b_y_low_f32x4), b_y_high_f32x4);
+ sum_b_z_f32x4 = vaddq_f32(vaddq_f32(sum_b_z_f32x4, b_z_low_f32x4), b_z_high_f32x4);
+
+ cov_xx_f32x4 = vfmaq_f32(vfmaq_f32(cov_xx_f32x4, a_x_low_f32x4, b_x_low_f32x4), a_x_high_f32x4, b_x_high_f32x4);
+ cov_xy_f32x4 = vfmaq_f32(vfmaq_f32(cov_xy_f32x4, a_x_low_f32x4, b_y_low_f32x4), a_x_high_f32x4, b_y_high_f32x4);
+ cov_xz_f32x4 = vfmaq_f32(vfmaq_f32(cov_xz_f32x4, a_x_low_f32x4, b_z_low_f32x4), a_x_high_f32x4, b_z_high_f32x4);
+ cov_yx_f32x4 = vfmaq_f32(vfmaq_f32(cov_yx_f32x4, a_y_low_f32x4, b_x_low_f32x4), a_y_high_f32x4, b_x_high_f32x4);
+ cov_yy_f32x4 = vfmaq_f32(vfmaq_f32(cov_yy_f32x4, a_y_low_f32x4, b_y_low_f32x4), a_y_high_f32x4, b_y_high_f32x4);
+ cov_yz_f32x4 = vfmaq_f32(vfmaq_f32(cov_yz_f32x4, a_y_low_f32x4, b_z_low_f32x4), a_y_high_f32x4, b_z_high_f32x4);
+ cov_zx_f32x4 = vfmaq_f32(vfmaq_f32(cov_zx_f32x4, a_z_low_f32x4, b_x_low_f32x4), a_z_high_f32x4, b_x_high_f32x4);
+ cov_zy_f32x4 = vfmaq_f32(vfmaq_f32(cov_zy_f32x4, a_z_low_f32x4, b_y_low_f32x4), a_z_high_f32x4, b_y_high_f32x4);
+ cov_zz_f32x4 = vfmaq_f32(vfmaq_f32(cov_zz_f32x4, a_z_low_f32x4, b_z_low_f32x4), a_z_high_f32x4, b_z_high_f32x4);
+ }
+
+ if (i < n) {
+ nk_partial_deinterleave_f16_to_f32x4x2_neon_(a + i * 3, n - i, &a_x_low_f32x4, &a_x_high_f32x4, &a_y_low_f32x4,
+ &a_y_high_f32x4, &a_z_low_f32x4, &a_z_high_f32x4);
+ nk_partial_deinterleave_f16_to_f32x4x2_neon_(b + i * 3, n - i, &b_x_low_f32x4, &b_x_high_f32x4, &b_y_low_f32x4,
+ &b_y_high_f32x4, &b_z_low_f32x4, &b_z_high_f32x4);
+
+ sum_a_x_f32x4 = vaddq_f32(vaddq_f32(sum_a_x_f32x4, a_x_low_f32x4), a_x_high_f32x4);
+ sum_a_y_f32x4 = vaddq_f32(vaddq_f32(sum_a_y_f32x4, a_y_low_f32x4), a_y_high_f32x4);
+ sum_a_z_f32x4 = vaddq_f32(vaddq_f32(sum_a_z_f32x4, a_z_low_f32x4), a_z_high_f32x4);
+ sum_b_x_f32x4 = vaddq_f32(vaddq_f32(sum_b_x_f32x4, b_x_low_f32x4), b_x_high_f32x4);
+ sum_b_y_f32x4 = vaddq_f32(vaddq_f32(sum_b_y_f32x4, b_y_low_f32x4), b_y_high_f32x4);
+ sum_b_z_f32x4 = vaddq_f32(vaddq_f32(sum_b_z_f32x4, b_z_low_f32x4), b_z_high_f32x4);
+
+ cov_xx_f32x4 = vfmaq_f32(vfmaq_f32(cov_xx_f32x4, a_x_low_f32x4, b_x_low_f32x4), a_x_high_f32x4, b_x_high_f32x4);
+ cov_xy_f32x4 = vfmaq_f32(vfmaq_f32(cov_xy_f32x4, a_x_low_f32x4, b_y_low_f32x4), a_x_high_f32x4, b_y_high_f32x4);
+ cov_xz_f32x4 = vfmaq_f32(vfmaq_f32(cov_xz_f32x4, a_x_low_f32x4, b_z_low_f32x4), a_x_high_f32x4, b_z_high_f32x4);
+ cov_yx_f32x4 = vfmaq_f32(vfmaq_f32(cov_yx_f32x4, a_y_low_f32x4, b_x_low_f32x4), a_y_high_f32x4, b_x_high_f32x4);
+ cov_yy_f32x4 = vfmaq_f32(vfmaq_f32(cov_yy_f32x4, a_y_low_f32x4, b_y_low_f32x4), a_y_high_f32x4, b_y_high_f32x4);
+ cov_yz_f32x4 = vfmaq_f32(vfmaq_f32(cov_yz_f32x4, a_y_low_f32x4, b_z_low_f32x4), a_y_high_f32x4, b_z_high_f32x4);
+ cov_zx_f32x4 = vfmaq_f32(vfmaq_f32(cov_zx_f32x4, a_z_low_f32x4, b_x_low_f32x4), a_z_high_f32x4, b_x_high_f32x4);
+ cov_zy_f32x4 = vfmaq_f32(vfmaq_f32(cov_zy_f32x4, a_z_low_f32x4, b_y_low_f32x4), a_z_high_f32x4, b_y_high_f32x4);
+ cov_zz_f32x4 = vfmaq_f32(vfmaq_f32(cov_zz_f32x4, a_z_low_f32x4, b_z_low_f32x4), a_z_high_f32x4, b_z_high_f32x4);
+ }
+
+ // Reduce vector accumulators
+ nk_f32_t sum_a_x = vaddvq_f32(sum_a_x_f32x4);
+ nk_f32_t sum_a_y = vaddvq_f32(sum_a_y_f32x4);
+ nk_f32_t sum_a_z = vaddvq_f32(sum_a_z_f32x4);
+ nk_f32_t sum_b_x = vaddvq_f32(sum_b_x_f32x4);
+ nk_f32_t sum_b_y = vaddvq_f32(sum_b_y_f32x4);
+ nk_f32_t sum_b_z = vaddvq_f32(sum_b_z_f32x4);
+
+ nk_f32_t covariance_x_x = vaddvq_f32(cov_xx_f32x4);
+ nk_f32_t covariance_x_y = vaddvq_f32(cov_xy_f32x4);
+ nk_f32_t covariance_x_z = vaddvq_f32(cov_xz_f32x4);
+ nk_f32_t covariance_y_x = vaddvq_f32(cov_yx_f32x4);
+ nk_f32_t covariance_y_y = vaddvq_f32(cov_yy_f32x4);
+ nk_f32_t covariance_y_z = vaddvq_f32(cov_yz_f32x4);
+ nk_f32_t covariance_z_x = vaddvq_f32(cov_zx_f32x4);
+ nk_f32_t covariance_z_y = vaddvq_f32(cov_zy_f32x4);
+ nk_f32_t covariance_z_z = vaddvq_f32(cov_zz_f32x4);
+
+ // Compute centroids
+ nk_f32_t inv_n = 1.0f / (nk_f32_t)n;
+ nk_f32_t centroid_a_x = sum_a_x * inv_n;
+ nk_f32_t centroid_a_y = sum_a_y * inv_n;
+ nk_f32_t centroid_a_z = sum_a_z * inv_n;
+ nk_f32_t centroid_b_x = sum_b_x * inv_n;
+ nk_f32_t centroid_b_y = sum_b_y * inv_n;
+ nk_f32_t centroid_b_z = sum_b_z * inv_n;
+
+ if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
+ if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
+
+ // Compute centered covariance: H = (A - centroid_A)ᵀ * (B - centroid_B)
+ // H = sum(a * bᵀ) - n * centroid_a * centroid_bᵀ
+ nk_f32_t h[9];
+ h[0] = covariance_x_x - (nk_f32_t)n * centroid_a_x * centroid_b_x;
+ h[1] = covariance_x_y - (nk_f32_t)n * centroid_a_x * centroid_b_y;
+ h[2] = covariance_x_z - (nk_f32_t)n * centroid_a_x * centroid_b_z;
+ h[3] = covariance_y_x - (nk_f32_t)n * centroid_a_y * centroid_b_x;
+ h[4] = covariance_y_y - (nk_f32_t)n * centroid_a_y * centroid_b_y;
+ h[5] = covariance_y_z - (nk_f32_t)n * centroid_a_y * centroid_b_z;
+ h[6] = covariance_z_x - (nk_f32_t)n * centroid_a_z * centroid_b_x;
+ h[7] = covariance_z_y - (nk_f32_t)n * centroid_a_z * centroid_b_y;
+ h[8] = covariance_z_z - (nk_f32_t)n * centroid_a_z * centroid_b_z;
+
+ // SVD of H = U * S * Vᵀ
+ nk_f32_t svd_u[9], svd_s[9], svd_v[9];
+ nk_svd3x3_f32_(h, svd_u, svd_s, svd_v);
+
+ // R = V * Uᵀ
+ nk_f32_t r[9];
+ r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
+ r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
+ r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
+ r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
+ r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
+ r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
+ r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
+ r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
+ r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
+
+ // Handle reflection: if det(R) < 0, negate third column of V and recompute
+ nk_f32_t det_r = nk_det3x3_f32_(r);
+ if (det_r < 0) {
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
+ r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
+ r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
+ r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
+ r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
+ r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
+ r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
+ r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
+ r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
+ r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
+ }
+
+ if (rotation)
+ for (int j = 0; j < 9; ++j) rotation[j] = r[j];
+ if (scale) *scale = 1.0f;
+
+ // Compute RMSD after rotation
+ nk_f32_t sum_squared = nk_transformed_ssd_f16_neon_(a, b, n, r, 1.0f, centroid_a_x, centroid_a_y, centroid_a_z,
+ centroid_b_x, centroid_b_y, centroid_b_z);
+ *result = nk_f32_sqrt_neon(sum_squared * inv_n);
+ }
+
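The covariance block above uses the shift identity Σ aᵢbᵢᵀ − n·μ_a·μ_bᵀ = Σ (aᵢ−μ_a)(bᵢ−μ_b)ᵀ, so the data never has to be re-read for centering, and the rotation is then the usual Kabsch closed form R = V·Uᵀ with the third column of V flipped whenever det(R) < 0 to exclude reflections. A small row-major sketch of that closing step, using the illustrative helpers mat3_mul_bt, mat3_det, and kabsch_rotation_from_svd (not part of numkong):

    /* out = m * t^T for row-major 3x3 matrices. */
    static void mat3_mul_bt(float const m[9], float const t[9], float out[9]) {
        for (int i = 0; i < 3; ++i)
            for (int j = 0; j < 3; ++j)
                out[i * 3 + j] = m[i * 3 + 0] * t[j * 3 + 0] + m[i * 3 + 1] * t[j * 3 + 1] + m[i * 3 + 2] * t[j * 3 + 2];
    }

    static float mat3_det(float const m[9]) {
        return m[0] * (m[4] * m[8] - m[5] * m[7]) - m[1] * (m[3] * m[8] - m[5] * m[6]) +
               m[2] * (m[3] * m[7] - m[4] * m[6]);
    }

    /* Kabsch rotation from the SVD factors of the centered covariance H = U * S * V^T,
       following the same convention as the code above: compute R = V * U^T, and if it
       turns out to be a reflection, negate the third column of V and recompute. */
    static void kabsch_rotation_from_svd(float v[9], float const u[9], float r[9]) {
        mat3_mul_bt(v, u, r);
        if (mat3_det(r) < 0.0f) {
            v[2] = -v[2], v[5] = -v[5], v[8] = -v[8];
            mat3_mul_bt(v, u, r);
        }
    }

Flipping the column of V paired with the smallest singular value is the standard correction, so the recomputed R is the nearest proper rotation rather than a reflection.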
+ NK_PUBLIC void nk_umeyama_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
+ // Fused single-pass: load f16, convert to f32, compute centroids, covariance, and variance
+ float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
+
+ float32x4_t sum_a_x_f32x4 = zeros_f32x4, sum_a_y_f32x4 = zeros_f32x4, sum_a_z_f32x4 = zeros_f32x4;
+ float32x4_t sum_b_x_f32x4 = zeros_f32x4, sum_b_y_f32x4 = zeros_f32x4, sum_b_z_f32x4 = zeros_f32x4;
+ float32x4_t cov_xx_f32x4 = zeros_f32x4, cov_xy_f32x4 = zeros_f32x4, cov_xz_f32x4 = zeros_f32x4;
+ float32x4_t cov_yx_f32x4 = zeros_f32x4, cov_yy_f32x4 = zeros_f32x4, cov_yz_f32x4 = zeros_f32x4;
+ float32x4_t cov_zx_f32x4 = zeros_f32x4, cov_zy_f32x4 = zeros_f32x4, cov_zz_f32x4 = zeros_f32x4;
+ float32x4_t variance_a_f32x4 = zeros_f32x4;
+
+ nk_size_t i = 0;
+ float32x4_t a_x_low_f32x4, a_x_high_f32x4, a_y_low_f32x4, a_y_high_f32x4, a_z_low_f32x4, a_z_high_f32x4;
+ float32x4_t b_x_low_f32x4, b_x_high_f32x4, b_y_low_f32x4, b_y_high_f32x4, b_z_low_f32x4, b_z_high_f32x4;
+
+ for (; i + 8 <= n; i += 8) {
+ nk_deinterleave_f16x8_to_f32x4x2_neon_(a + i * 3, &a_x_low_f32x4, &a_x_high_f32x4, &a_y_low_f32x4,
+ &a_y_high_f32x4, &a_z_low_f32x4, &a_z_high_f32x4);
+ nk_deinterleave_f16x8_to_f32x4x2_neon_(b + i * 3, &b_x_low_f32x4, &b_x_high_f32x4, &b_y_low_f32x4,
+ &b_y_high_f32x4, &b_z_low_f32x4, &b_z_high_f32x4);
+
+ sum_a_x_f32x4 = vaddq_f32(vaddq_f32(sum_a_x_f32x4, a_x_low_f32x4), a_x_high_f32x4);
+ sum_a_y_f32x4 = vaddq_f32(vaddq_f32(sum_a_y_f32x4, a_y_low_f32x4), a_y_high_f32x4);
+ sum_a_z_f32x4 = vaddq_f32(vaddq_f32(sum_a_z_f32x4, a_z_low_f32x4), a_z_high_f32x4);
+ sum_b_x_f32x4 = vaddq_f32(vaddq_f32(sum_b_x_f32x4, b_x_low_f32x4), b_x_high_f32x4);
+ sum_b_y_f32x4 = vaddq_f32(vaddq_f32(sum_b_y_f32x4, b_y_low_f32x4), b_y_high_f32x4);
+ sum_b_z_f32x4 = vaddq_f32(vaddq_f32(sum_b_z_f32x4, b_z_low_f32x4), b_z_high_f32x4);
+
+ cov_xx_f32x4 = vfmaq_f32(vfmaq_f32(cov_xx_f32x4, a_x_low_f32x4, b_x_low_f32x4), a_x_high_f32x4, b_x_high_f32x4);
+ cov_xy_f32x4 = vfmaq_f32(vfmaq_f32(cov_xy_f32x4, a_x_low_f32x4, b_y_low_f32x4), a_x_high_f32x4, b_y_high_f32x4);
+ cov_xz_f32x4 = vfmaq_f32(vfmaq_f32(cov_xz_f32x4, a_x_low_f32x4, b_z_low_f32x4), a_x_high_f32x4, b_z_high_f32x4);
+ cov_yx_f32x4 = vfmaq_f32(vfmaq_f32(cov_yx_f32x4, a_y_low_f32x4, b_x_low_f32x4), a_y_high_f32x4, b_x_high_f32x4);
+ cov_yy_f32x4 = vfmaq_f32(vfmaq_f32(cov_yy_f32x4, a_y_low_f32x4, b_y_low_f32x4), a_y_high_f32x4, b_y_high_f32x4);
+ cov_yz_f32x4 = vfmaq_f32(vfmaq_f32(cov_yz_f32x4, a_y_low_f32x4, b_z_low_f32x4), a_y_high_f32x4, b_z_high_f32x4);
+ cov_zx_f32x4 = vfmaq_f32(vfmaq_f32(cov_zx_f32x4, a_z_low_f32x4, b_x_low_f32x4), a_z_high_f32x4, b_x_high_f32x4);
+ cov_zy_f32x4 = vfmaq_f32(vfmaq_f32(cov_zy_f32x4, a_z_low_f32x4, b_y_low_f32x4), a_z_high_f32x4, b_y_high_f32x4);
+ cov_zz_f32x4 = vfmaq_f32(vfmaq_f32(cov_zz_f32x4, a_z_low_f32x4, b_z_low_f32x4), a_z_high_f32x4, b_z_high_f32x4);
+
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_x_low_f32x4, a_x_low_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_y_low_f32x4, a_y_low_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_z_low_f32x4, a_z_low_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_x_high_f32x4, a_x_high_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_y_high_f32x4, a_y_high_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_z_high_f32x4, a_z_high_f32x4);
+ }
+
+ if (i < n) {
+ nk_partial_deinterleave_f16_to_f32x4x2_neon_(a + i * 3, n - i, &a_x_low_f32x4, &a_x_high_f32x4, &a_y_low_f32x4,
+ &a_y_high_f32x4, &a_z_low_f32x4, &a_z_high_f32x4);
+ nk_partial_deinterleave_f16_to_f32x4x2_neon_(b + i * 3, n - i, &b_x_low_f32x4, &b_x_high_f32x4, &b_y_low_f32x4,
+ &b_y_high_f32x4, &b_z_low_f32x4, &b_z_high_f32x4);
+
+ sum_a_x_f32x4 = vaddq_f32(vaddq_f32(sum_a_x_f32x4, a_x_low_f32x4), a_x_high_f32x4);
+ sum_a_y_f32x4 = vaddq_f32(vaddq_f32(sum_a_y_f32x4, a_y_low_f32x4), a_y_high_f32x4);
+ sum_a_z_f32x4 = vaddq_f32(vaddq_f32(sum_a_z_f32x4, a_z_low_f32x4), a_z_high_f32x4);
+ sum_b_x_f32x4 = vaddq_f32(vaddq_f32(sum_b_x_f32x4, b_x_low_f32x4), b_x_high_f32x4);
+ sum_b_y_f32x4 = vaddq_f32(vaddq_f32(sum_b_y_f32x4, b_y_low_f32x4), b_y_high_f32x4);
+ sum_b_z_f32x4 = vaddq_f32(vaddq_f32(sum_b_z_f32x4, b_z_low_f32x4), b_z_high_f32x4);
+
+ cov_xx_f32x4 = vfmaq_f32(vfmaq_f32(cov_xx_f32x4, a_x_low_f32x4, b_x_low_f32x4), a_x_high_f32x4, b_x_high_f32x4);
+ cov_xy_f32x4 = vfmaq_f32(vfmaq_f32(cov_xy_f32x4, a_x_low_f32x4, b_y_low_f32x4), a_x_high_f32x4, b_y_high_f32x4);
+ cov_xz_f32x4 = vfmaq_f32(vfmaq_f32(cov_xz_f32x4, a_x_low_f32x4, b_z_low_f32x4), a_x_high_f32x4, b_z_high_f32x4);
+ cov_yx_f32x4 = vfmaq_f32(vfmaq_f32(cov_yx_f32x4, a_y_low_f32x4, b_x_low_f32x4), a_y_high_f32x4, b_x_high_f32x4);
+ cov_yy_f32x4 = vfmaq_f32(vfmaq_f32(cov_yy_f32x4, a_y_low_f32x4, b_y_low_f32x4), a_y_high_f32x4, b_y_high_f32x4);
+ cov_yz_f32x4 = vfmaq_f32(vfmaq_f32(cov_yz_f32x4, a_y_low_f32x4, b_z_low_f32x4), a_y_high_f32x4, b_z_high_f32x4);
+ cov_zx_f32x4 = vfmaq_f32(vfmaq_f32(cov_zx_f32x4, a_z_low_f32x4, b_x_low_f32x4), a_z_high_f32x4, b_x_high_f32x4);
+ cov_zy_f32x4 = vfmaq_f32(vfmaq_f32(cov_zy_f32x4, a_z_low_f32x4, b_y_low_f32x4), a_z_high_f32x4, b_y_high_f32x4);
+ cov_zz_f32x4 = vfmaq_f32(vfmaq_f32(cov_zz_f32x4, a_z_low_f32x4, b_z_low_f32x4), a_z_high_f32x4, b_z_high_f32x4);
+
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_x_low_f32x4, a_x_low_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_y_low_f32x4, a_y_low_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_z_low_f32x4, a_z_low_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_x_high_f32x4, a_x_high_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_y_high_f32x4, a_y_high_f32x4);
+ variance_a_f32x4 = vfmaq_f32(variance_a_f32x4, a_z_high_f32x4, a_z_high_f32x4);
+ }
+
+ // Reduce vector accumulators
+ nk_f32_t sum_a_x = vaddvq_f32(sum_a_x_f32x4);
+ nk_f32_t sum_a_y = vaddvq_f32(sum_a_y_f32x4);
+ nk_f32_t sum_a_z = vaddvq_f32(sum_a_z_f32x4);
+ nk_f32_t sum_b_x = vaddvq_f32(sum_b_x_f32x4);
+ nk_f32_t sum_b_y = vaddvq_f32(sum_b_y_f32x4);
+ nk_f32_t sum_b_z = vaddvq_f32(sum_b_z_f32x4);
+ nk_f32_t covariance_x_x = vaddvq_f32(cov_xx_f32x4);
+ nk_f32_t covariance_x_y = vaddvq_f32(cov_xy_f32x4);
+ nk_f32_t covariance_x_z = vaddvq_f32(cov_xz_f32x4);
+ nk_f32_t covariance_y_x = vaddvq_f32(cov_yx_f32x4);
+ nk_f32_t covariance_y_y = vaddvq_f32(cov_yy_f32x4);
+ nk_f32_t covariance_y_z = vaddvq_f32(cov_yz_f32x4);
+ nk_f32_t covariance_z_x = vaddvq_f32(cov_zx_f32x4);
+ nk_f32_t covariance_z_y = vaddvq_f32(cov_zy_f32x4);
+ nk_f32_t covariance_z_z = vaddvq_f32(cov_zz_f32x4);
+ nk_f32_t variance_a_sum = vaddvq_f32(variance_a_f32x4);
+
+ // Compute centroids
+ nk_f32_t inv_n = 1.0f / (nk_f32_t)n;
+ nk_f32_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
+ nk_f32_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
+
+ if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
+ if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
+
+ // Compute centered covariance and variance
+ nk_f32_t variance_a = variance_a_sum * inv_n -
+ (centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y + centroid_a_z * centroid_a_z);
+
+ nk_f32_t h[9];
+ h[0] = covariance_x_x - (nk_f32_t)n * centroid_a_x * centroid_b_x;
+ h[1] = covariance_x_y - (nk_f32_t)n * centroid_a_x * centroid_b_y;
+ h[2] = covariance_x_z - (nk_f32_t)n * centroid_a_x * centroid_b_z;
+ h[3] = covariance_y_x - (nk_f32_t)n * centroid_a_y * centroid_b_x;
+ h[4] = covariance_y_y - (nk_f32_t)n * centroid_a_y * centroid_b_y;
+ h[5] = covariance_y_z - (nk_f32_t)n * centroid_a_y * centroid_b_z;
+ h[6] = covariance_z_x - (nk_f32_t)n * centroid_a_z * centroid_b_x;
+ h[7] = covariance_z_y - (nk_f32_t)n * centroid_a_z * centroid_b_y;
+ h[8] = covariance_z_z - (nk_f32_t)n * centroid_a_z * centroid_b_z;
+
+ // SVD of H = U * S * Vᵀ
+ nk_f32_t svd_u[9], svd_s[9], svd_v[9];
+ nk_svd3x3_f32_(h, svd_u, svd_s, svd_v);
+
+ // R = V * Uᵀ
+ nk_f32_t r[9];
+ r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
+ r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
+ r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
+ r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
+ r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
+ r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
+ r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
+ r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
+ r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
+
+ // Handle reflection and compute scale: c = trace(D × S) / variance(a)
+ nk_f32_t det_r = nk_det3x3_f32_(r);
+ nk_f32_t sign_det = det_r < 0 ? -1.0f : 1.0f;
+ nk_f32_t trace_scaled_s = svd_s[0] + svd_s[4] + sign_det * svd_s[8];
+ nk_f32_t scale_factor = trace_scaled_s / ((nk_f32_t)n * variance_a);
+ if (scale) *scale = scale_factor;
+
+ if (det_r < 0) {
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
+ r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
+ r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
+ r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
+ r[3] = svd_v[3] * svd_u[0] + svd_v[4] * svd_u[1] + svd_v[5] * svd_u[2];
+ r[4] = svd_v[3] * svd_u[3] + svd_v[4] * svd_u[4] + svd_v[5] * svd_u[5];
+ r[5] = svd_v[3] * svd_u[6] + svd_v[4] * svd_u[7] + svd_v[5] * svd_u[8];
+ r[6] = svd_v[6] * svd_u[0] + svd_v[7] * svd_u[1] + svd_v[8] * svd_u[2];
+ r[7] = svd_v[6] * svd_u[3] + svd_v[7] * svd_u[4] + svd_v[8] * svd_u[5];
+ r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
+ }
+
+ if (rotation)
+ for (int j = 0; j < 9; ++j) rotation[j] = r[j];
+
+ // Compute RMSD after similarity transform
+ nk_f32_t sum_squared = nk_transformed_ssd_f16_neon_(a, b, n, r, scale_factor, centroid_a_x, centroid_a_y,
+ centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
+ *result = nk_f32_sqrt_neon(sum_squared * inv_n);
+ }
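In nk_umeyama_f16_neon the SVD is taken of the unnormalized centered covariance H (not divided by n), so its singular values carry an extra factor of n relative to Umeyama's formulation; the similarity scale c = tr(D·S)/σ_a² therefore becomes trace_scaled_s / (n · variance_a), exactly as written above, with D = diag(1, 1, sign(det R)). A standalone sketch of just that formula, with the illustrative name umeyama_scale (not part of numkong):

    #include <stddef.h>

    /* Umeyama similarity scale from the SVD of the unnormalized centered covariance H.
       s is the diagonal 3x3 matrix of singular values in row-major order (diagonal at
       indices 0, 4, 8), det_sign is +1.0f or -1.0f depending on det(V * U^T), and
       variance_a is the per-point variance of the first point set about its centroid. */
    static float umeyama_scale(float const s[9], float det_sign, float variance_a, size_t n) {
        float trace_scaled_s = s[0] + s[4] + det_sign * s[8]; /* trace(D * S) */
        return trace_scaled_s / ((float)n * variance_a);
    }

Dividing by n · variance_a rather than variance_a alone is what reconciles the unnormalized H with the normalized covariance used in Umeyama's derivation.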
  #if defined(__clang__)
  #pragma clang attribute pop
  #elif defined(__GNUC__)