numkong 7.0.0 → 7.4.1

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -56,14 +56,14 @@ NK_INTERNAL void nk_deinterleave_f32x4_v128relaxed_(nk_f32_t const *ptr, v128_t
56
56
  v128_t v1_f32x4 = wasm_v128_load(ptr + 4); // y1 z1 x2 y2
57
57
  v128_t v2_f32x4 = wasm_v128_load(ptr + 8); // z2 x3 y3 z3
58
58
  // x0 x1 x2 x3
59
- v128_t tmp01 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 0, 3, 6, 0); // x0 x1 x2 _
60
- *xs_f32x4 = wasm_i32x4_shuffle(tmp01, v2_f32x4, 0, 1, 2, 5); // x0 x1 x2 x3
59
+ v128_t x_partial_f32x4 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 0, 3, 6, 0); // x0 x1 x2 _
60
+ *xs_f32x4 = wasm_i32x4_shuffle(x_partial_f32x4, v2_f32x4, 0, 1, 2, 5); // x0 x1 x2 x3
61
61
  // y0 y1 y2 y3
62
- v128_t tmp23 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 1, 4, 7, 0); // y0 y1 y2 _
63
- *ys_f32x4 = wasm_i32x4_shuffle(tmp23, v2_f32x4, 0, 1, 2, 6); // y0 y1 y2 y3
62
+ v128_t y_partial_f32x4 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 1, 4, 7, 0); // y0 y1 y2 _
63
+ *ys_f32x4 = wasm_i32x4_shuffle(y_partial_f32x4, v2_f32x4, 0, 1, 2, 6); // y0 y1 y2 y3
64
64
  // z0 z1 z2 z3
65
- v128_t tmp45 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 2, 5, 0, 0); // z0 z1 _ _
66
- *zs_f32x4 = wasm_i32x4_shuffle(tmp45, v2_f32x4, 0, 1, 4, 7); // z0 z1 z2 z3
65
+ v128_t z_partial_f32x4 = wasm_i32x4_shuffle(v0_f32x4, v1_f32x4, 2, 5, 0, 0); // z0 z1 _ _
66
+ *zs_f32x4 = wasm_i32x4_shuffle(z_partial_f32x4, v2_f32x4, 0, 1, 4, 7); // z0 z1 z2 z3
67
67
  }
68
68
 
69
69
  /* Deinterleave 6 contiguous f64 values (2 XYZ triplets) into separate x, y, z vectors.
@@ -120,89 +120,27 @@ NK_INTERNAL void nk_accumulate_square_f64x2_v128relaxed_(v128_t *sum_f64x2, v128
120
120
  *compensation_f64x2 = wasm_f64x2_add(*compensation_f64x2, wasm_f64x2_add(sum_error_f64x2, product_error_f64x2));
121
121
  }
122
122
 
123
- NK_INTERNAL void nk_bicentroid_f32_v128relaxed_( //
124
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
125
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
126
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z) { //
127
- v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
128
- v128_t sum_a_x_lower_f64x2 = zero_f64x2, sum_a_x_upper_f64x2 = zero_f64x2;
129
- v128_t sum_a_y_lower_f64x2 = zero_f64x2, sum_a_y_upper_f64x2 = zero_f64x2;
130
- v128_t sum_a_z_lower_f64x2 = zero_f64x2, sum_a_z_upper_f64x2 = zero_f64x2;
131
- v128_t sum_b_x_lower_f64x2 = zero_f64x2, sum_b_x_upper_f64x2 = zero_f64x2;
132
- v128_t sum_b_y_lower_f64x2 = zero_f64x2, sum_b_y_upper_f64x2 = zero_f64x2;
133
- v128_t sum_b_z_lower_f64x2 = zero_f64x2, sum_b_z_upper_f64x2 = zero_f64x2;
134
- nk_size_t index = 0;
135
-
136
- for (; index + 4 <= n; index += 4) {
137
- v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
138
- nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
139
- nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
140
-
141
- v128_t a_x_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
142
- v128_t a_x_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
143
- v128_t a_y_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
144
- v128_t a_y_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
145
- v128_t a_z_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
146
- v128_t a_z_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
147
- v128_t b_x_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
148
- v128_t b_x_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
149
- v128_t b_y_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
150
- v128_t b_y_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
151
- v128_t b_z_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
152
- v128_t b_z_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
153
-
154
- sum_a_x_lower_f64x2 = wasm_f64x2_add(sum_a_x_lower_f64x2, a_x_lower_f64x2),
155
- sum_a_x_upper_f64x2 = wasm_f64x2_add(sum_a_x_upper_f64x2, a_x_upper_f64x2);
156
- sum_a_y_lower_f64x2 = wasm_f64x2_add(sum_a_y_lower_f64x2, a_y_lower_f64x2),
157
- sum_a_y_upper_f64x2 = wasm_f64x2_add(sum_a_y_upper_f64x2, a_y_upper_f64x2);
158
- sum_a_z_lower_f64x2 = wasm_f64x2_add(sum_a_z_lower_f64x2, a_z_lower_f64x2),
159
- sum_a_z_upper_f64x2 = wasm_f64x2_add(sum_a_z_upper_f64x2, a_z_upper_f64x2);
160
- sum_b_x_lower_f64x2 = wasm_f64x2_add(sum_b_x_lower_f64x2, b_x_lower_f64x2),
161
- sum_b_x_upper_f64x2 = wasm_f64x2_add(sum_b_x_upper_f64x2, b_x_upper_f64x2);
162
- sum_b_y_lower_f64x2 = wasm_f64x2_add(sum_b_y_lower_f64x2, b_y_lower_f64x2),
163
- sum_b_y_upper_f64x2 = wasm_f64x2_add(sum_b_y_upper_f64x2, b_y_upper_f64x2);
164
- sum_b_z_lower_f64x2 = wasm_f64x2_add(sum_b_z_lower_f64x2, b_z_lower_f64x2),
165
- sum_b_z_upper_f64x2 = wasm_f64x2_add(sum_b_z_upper_f64x2, b_z_upper_f64x2);
166
- }
167
-
168
- nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_lower_f64x2, sum_a_x_upper_f64x2));
169
- nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_lower_f64x2, sum_a_y_upper_f64x2));
170
- nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_lower_f64x2, sum_a_z_upper_f64x2));
171
- nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_lower_f64x2, sum_b_x_upper_f64x2));
172
- nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_lower_f64x2, sum_b_y_upper_f64x2));
173
- nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_lower_f64x2, sum_b_z_upper_f64x2));
174
-
175
- for (; index < n; ++index) {
176
- sum_a_x += a[index * 3 + 0], sum_a_y += a[index * 3 + 1], sum_a_z += a[index * 3 + 2];
177
- sum_b_x += b[index * 3 + 0], sum_b_y += b[index * 3 + 1], sum_b_z += b[index * 3 + 2];
178
- }
179
-
180
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
181
- *ca_x = sum_a_x * inv_n, *ca_y = sum_a_y * inv_n, *ca_z = sum_a_z * inv_n;
182
- *cb_x = sum_b_x * inv_n, *cb_y = sum_b_y * inv_n, *cb_z = sum_b_z * inv_n;
183
- }
184
-
185
- NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
186
- nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
187
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
188
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z, //
123
+ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
124
+ nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
125
+ nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
126
+ nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
189
127
  nk_f64_t h[9]) {
190
128
  v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
191
- v128_t sum_a_x_lower_f64x2 = zero_f64x2, sum_a_x_upper_f64x2 = zero_f64x2;
192
- v128_t sum_a_y_lower_f64x2 = zero_f64x2, sum_a_y_upper_f64x2 = zero_f64x2;
193
- v128_t sum_a_z_lower_f64x2 = zero_f64x2, sum_a_z_upper_f64x2 = zero_f64x2;
194
- v128_t sum_b_x_lower_f64x2 = zero_f64x2, sum_b_x_upper_f64x2 = zero_f64x2;
195
- v128_t sum_b_y_lower_f64x2 = zero_f64x2, sum_b_y_upper_f64x2 = zero_f64x2;
196
- v128_t sum_b_z_lower_f64x2 = zero_f64x2, sum_b_z_upper_f64x2 = zero_f64x2;
197
- v128_t cross_00_lower_f64x2 = zero_f64x2, cross_00_upper_f64x2 = zero_f64x2;
198
- v128_t cross_01_lower_f64x2 = zero_f64x2, cross_01_upper_f64x2 = zero_f64x2;
199
- v128_t cross_02_lower_f64x2 = zero_f64x2, cross_02_upper_f64x2 = zero_f64x2;
200
- v128_t cross_10_lower_f64x2 = zero_f64x2, cross_10_upper_f64x2 = zero_f64x2;
201
- v128_t cross_11_lower_f64x2 = zero_f64x2, cross_11_upper_f64x2 = zero_f64x2;
202
- v128_t cross_12_lower_f64x2 = zero_f64x2, cross_12_upper_f64x2 = zero_f64x2;
203
- v128_t cross_20_lower_f64x2 = zero_f64x2, cross_20_upper_f64x2 = zero_f64x2;
204
- v128_t cross_21_lower_f64x2 = zero_f64x2, cross_21_upper_f64x2 = zero_f64x2;
205
- v128_t cross_22_lower_f64x2 = zero_f64x2, cross_22_upper_f64x2 = zero_f64x2;
129
+ v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
130
+ v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
131
+ v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
132
+ v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
133
+ v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
134
+ v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
135
+ v128_t cross_00_low_f64x2 = zero_f64x2, cross_00_high_f64x2 = zero_f64x2;
136
+ v128_t cross_01_low_f64x2 = zero_f64x2, cross_01_high_f64x2 = zero_f64x2;
137
+ v128_t cross_02_low_f64x2 = zero_f64x2, cross_02_high_f64x2 = zero_f64x2;
138
+ v128_t cross_10_low_f64x2 = zero_f64x2, cross_10_high_f64x2 = zero_f64x2;
139
+ v128_t cross_11_low_f64x2 = zero_f64x2, cross_11_high_f64x2 = zero_f64x2;
140
+ v128_t cross_12_low_f64x2 = zero_f64x2, cross_12_high_f64x2 = zero_f64x2;
141
+ v128_t cross_20_low_f64x2 = zero_f64x2, cross_20_high_f64x2 = zero_f64x2;
142
+ v128_t cross_21_low_f64x2 = zero_f64x2, cross_21_high_f64x2 = zero_f64x2;
143
+ v128_t cross_22_low_f64x2 = zero_f64x2, cross_22_high_f64x2 = zero_f64x2;
206
144
  nk_size_t index = 0;
207
145
 
208
146
  for (; index + 4 <= n; index += 4) {
@@ -210,67 +148,67 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
210
148
  nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
211
149
  nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
212
150
 
213
- v128_t a_x_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
214
- v128_t a_x_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
215
- v128_t a_y_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
216
- v128_t a_y_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
217
- v128_t a_z_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
218
- v128_t a_z_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
219
- v128_t b_x_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
220
- v128_t b_x_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
221
- v128_t b_y_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
222
- v128_t b_y_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
223
- v128_t b_z_lower_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
224
- v128_t b_z_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
225
-
226
- sum_a_x_lower_f64x2 = wasm_f64x2_add(sum_a_x_lower_f64x2, a_x_lower_f64x2),
227
- sum_a_x_upper_f64x2 = wasm_f64x2_add(sum_a_x_upper_f64x2, a_x_upper_f64x2);
228
- sum_a_y_lower_f64x2 = wasm_f64x2_add(sum_a_y_lower_f64x2, a_y_lower_f64x2),
229
- sum_a_y_upper_f64x2 = wasm_f64x2_add(sum_a_y_upper_f64x2, a_y_upper_f64x2);
230
- sum_a_z_lower_f64x2 = wasm_f64x2_add(sum_a_z_lower_f64x2, a_z_lower_f64x2),
231
- sum_a_z_upper_f64x2 = wasm_f64x2_add(sum_a_z_upper_f64x2, a_z_upper_f64x2);
232
- sum_b_x_lower_f64x2 = wasm_f64x2_add(sum_b_x_lower_f64x2, b_x_lower_f64x2),
233
- sum_b_x_upper_f64x2 = wasm_f64x2_add(sum_b_x_upper_f64x2, b_x_upper_f64x2);
234
- sum_b_y_lower_f64x2 = wasm_f64x2_add(sum_b_y_lower_f64x2, b_y_lower_f64x2),
235
- sum_b_y_upper_f64x2 = wasm_f64x2_add(sum_b_y_upper_f64x2, b_y_upper_f64x2);
236
- sum_b_z_lower_f64x2 = wasm_f64x2_add(sum_b_z_lower_f64x2, b_z_lower_f64x2),
237
- sum_b_z_upper_f64x2 = wasm_f64x2_add(sum_b_z_upper_f64x2, b_z_upper_f64x2);
238
-
239
- cross_00_lower_f64x2 = wasm_f64x2_relaxed_madd(a_x_lower_f64x2, b_x_lower_f64x2, cross_00_lower_f64x2),
240
- cross_00_upper_f64x2 = wasm_f64x2_relaxed_madd(a_x_upper_f64x2, b_x_upper_f64x2, cross_00_upper_f64x2);
241
- cross_01_lower_f64x2 = wasm_f64x2_relaxed_madd(a_x_lower_f64x2, b_y_lower_f64x2, cross_01_lower_f64x2),
242
- cross_01_upper_f64x2 = wasm_f64x2_relaxed_madd(a_x_upper_f64x2, b_y_upper_f64x2, cross_01_upper_f64x2);
243
- cross_02_lower_f64x2 = wasm_f64x2_relaxed_madd(a_x_lower_f64x2, b_z_lower_f64x2, cross_02_lower_f64x2),
244
- cross_02_upper_f64x2 = wasm_f64x2_relaxed_madd(a_x_upper_f64x2, b_z_upper_f64x2, cross_02_upper_f64x2);
245
- cross_10_lower_f64x2 = wasm_f64x2_relaxed_madd(a_y_lower_f64x2, b_x_lower_f64x2, cross_10_lower_f64x2),
246
- cross_10_upper_f64x2 = wasm_f64x2_relaxed_madd(a_y_upper_f64x2, b_x_upper_f64x2, cross_10_upper_f64x2);
247
- cross_11_lower_f64x2 = wasm_f64x2_relaxed_madd(a_y_lower_f64x2, b_y_lower_f64x2, cross_11_lower_f64x2),
248
- cross_11_upper_f64x2 = wasm_f64x2_relaxed_madd(a_y_upper_f64x2, b_y_upper_f64x2, cross_11_upper_f64x2);
249
- cross_12_lower_f64x2 = wasm_f64x2_relaxed_madd(a_y_lower_f64x2, b_z_lower_f64x2, cross_12_lower_f64x2),
250
- cross_12_upper_f64x2 = wasm_f64x2_relaxed_madd(a_y_upper_f64x2, b_z_upper_f64x2, cross_12_upper_f64x2);
251
- cross_20_lower_f64x2 = wasm_f64x2_relaxed_madd(a_z_lower_f64x2, b_x_lower_f64x2, cross_20_lower_f64x2),
252
- cross_20_upper_f64x2 = wasm_f64x2_relaxed_madd(a_z_upper_f64x2, b_x_upper_f64x2, cross_20_upper_f64x2);
253
- cross_21_lower_f64x2 = wasm_f64x2_relaxed_madd(a_z_lower_f64x2, b_y_lower_f64x2, cross_21_lower_f64x2),
254
- cross_21_upper_f64x2 = wasm_f64x2_relaxed_madd(a_z_upper_f64x2, b_y_upper_f64x2, cross_21_upper_f64x2);
255
- cross_22_lower_f64x2 = wasm_f64x2_relaxed_madd(a_z_lower_f64x2, b_z_lower_f64x2, cross_22_lower_f64x2),
256
- cross_22_upper_f64x2 = wasm_f64x2_relaxed_madd(a_z_upper_f64x2, b_z_upper_f64x2, cross_22_upper_f64x2);
151
+ v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
152
+ v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
153
+ v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
154
+ v128_t a_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
155
+ v128_t a_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
156
+ v128_t a_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
157
+ v128_t b_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
158
+ v128_t b_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
159
+ v128_t b_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
160
+ v128_t b_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
161
+ v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
162
+ v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
163
+
164
+ sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2),
165
+ sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
166
+ sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2),
167
+ sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
168
+ sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2),
169
+ sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
170
+ sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2),
171
+ sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
172
+ sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2),
173
+ sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
174
+ sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2),
175
+ sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
176
+
177
+ cross_00_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_x_low_f64x2, cross_00_low_f64x2),
178
+ cross_00_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_x_high_f64x2, cross_00_high_f64x2);
179
+ cross_01_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_y_low_f64x2, cross_01_low_f64x2),
180
+ cross_01_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_y_high_f64x2, cross_01_high_f64x2);
181
+ cross_02_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_z_low_f64x2, cross_02_low_f64x2),
182
+ cross_02_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_z_high_f64x2, cross_02_high_f64x2);
183
+ cross_10_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_x_low_f64x2, cross_10_low_f64x2),
184
+ cross_10_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_x_high_f64x2, cross_10_high_f64x2);
185
+ cross_11_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_y_low_f64x2, cross_11_low_f64x2),
186
+ cross_11_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_y_high_f64x2, cross_11_high_f64x2);
187
+ cross_12_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_z_low_f64x2, cross_12_low_f64x2),
188
+ cross_12_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_z_high_f64x2, cross_12_high_f64x2);
189
+ cross_20_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_x_low_f64x2, cross_20_low_f64x2),
190
+ cross_20_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_x_high_f64x2, cross_20_high_f64x2);
191
+ cross_21_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_y_low_f64x2, cross_21_low_f64x2),
192
+ cross_21_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_y_high_f64x2, cross_21_high_f64x2);
193
+ cross_22_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_z_low_f64x2, cross_22_low_f64x2),
194
+ cross_22_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_z_high_f64x2, cross_22_high_f64x2);
257
195
  }
258
196
 
259
- nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_lower_f64x2, sum_a_x_upper_f64x2));
260
- nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_lower_f64x2, sum_a_y_upper_f64x2));
261
- nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_lower_f64x2, sum_a_z_upper_f64x2));
262
- nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_lower_f64x2, sum_b_x_upper_f64x2));
263
- nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_lower_f64x2, sum_b_y_upper_f64x2));
264
- nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_lower_f64x2, sum_b_z_upper_f64x2));
265
- nk_f64_t cross_00 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_00_lower_f64x2, cross_00_upper_f64x2));
266
- nk_f64_t cross_01 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_01_lower_f64x2, cross_01_upper_f64x2));
267
- nk_f64_t cross_02 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_02_lower_f64x2, cross_02_upper_f64x2));
268
- nk_f64_t cross_10 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_10_lower_f64x2, cross_10_upper_f64x2));
269
- nk_f64_t cross_11 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_11_lower_f64x2, cross_11_upper_f64x2));
270
- nk_f64_t cross_12 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_12_lower_f64x2, cross_12_upper_f64x2));
271
- nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_lower_f64x2, cross_20_upper_f64x2));
272
- nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_lower_f64x2, cross_21_upper_f64x2));
273
- nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_lower_f64x2, cross_22_upper_f64x2));
197
+ nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
198
+ nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
199
+ nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
200
+ nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
201
+ nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
202
+ nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
203
+ nk_f64_t cross_00 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_00_low_f64x2, cross_00_high_f64x2));
204
+ nk_f64_t cross_01 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_01_low_f64x2, cross_01_high_f64x2));
205
+ nk_f64_t cross_02 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_02_low_f64x2, cross_02_high_f64x2));
206
+ nk_f64_t cross_10 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_10_low_f64x2, cross_10_high_f64x2));
207
+ nk_f64_t cross_11 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_11_low_f64x2, cross_11_high_f64x2));
208
+ nk_f64_t cross_12 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_12_low_f64x2, cross_12_high_f64x2));
209
+ nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_low_f64x2, cross_20_high_f64x2));
210
+ nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_low_f64x2, cross_21_high_f64x2));
211
+ nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_low_f64x2, cross_22_high_f64x2));
274
212
 
275
213
  for (; index < n; ++index) {
276
214
  nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
@@ -282,59 +220,157 @@ NK_INTERNAL void nk_centroid_and_cross_covariance_f32_v128relaxed_( //
282
220
  cross_20 += a_z * b_x, cross_21 += a_z * b_y, cross_22 += a_z * b_z;
283
221
  }
284
222
 
285
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
286
- *ca_x = sum_a_x * inv_n, *ca_y = sum_a_y * inv_n, *ca_z = sum_a_z * inv_n;
287
- *cb_x = sum_b_x * inv_n, *cb_y = sum_b_y * inv_n, *cb_z = sum_b_z * inv_n;
223
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
224
+ *centroid_a_x = sum_a_x * inv_points_count, *centroid_a_y = sum_a_y * inv_points_count,
225
+ *centroid_a_z = sum_a_z * inv_points_count;
226
+ *centroid_b_x = sum_b_x * inv_points_count, *centroid_b_y = sum_b_y * inv_points_count,
227
+ *centroid_b_z = sum_b_z * inv_points_count;
288
228
 
289
229
  nk_f64_t n_f64 = (nk_f64_t)n;
290
- h[0] = cross_00 - n_f64 * (*ca_x) * (*cb_x), h[1] = cross_01 - n_f64 * (*ca_x) * (*cb_y),
291
- h[2] = cross_02 - n_f64 * (*ca_x) * (*cb_z);
292
- h[3] = cross_10 - n_f64 * (*ca_y) * (*cb_x), h[4] = cross_11 - n_f64 * (*ca_y) * (*cb_y),
293
- h[5] = cross_12 - n_f64 * (*ca_y) * (*cb_z);
294
- h[6] = cross_20 - n_f64 * (*ca_z) * (*cb_x), h[7] = cross_21 - n_f64 * (*ca_z) * (*cb_y),
295
- h[8] = cross_22 - n_f64 * (*ca_z) * (*cb_z);
230
+ h[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
231
+ h[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
232
+ h[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
233
+ h[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
234
+ h[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
235
+ h[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
236
+ h[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
237
+ h[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
238
+ h[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
296
239
  }
297
240
 
298
241
  NK_INTERNAL void nk_centroid_and_cross_covariance_and_variance_f32_v128relaxed_( //
299
242
  nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, //
300
- nk_f64_t *ca_x, nk_f64_t *ca_y, nk_f64_t *ca_z, //
301
- nk_f64_t *cb_x, nk_f64_t *cb_y, nk_f64_t *cb_z, //
243
+ nk_f64_t *centroid_a_x, nk_f64_t *centroid_a_y, nk_f64_t *centroid_a_z, //
244
+ nk_f64_t *centroid_b_x, nk_f64_t *centroid_b_y, nk_f64_t *centroid_b_z, //
302
245
  nk_f64_t h[9], nk_f64_t *variance_a) {
303
246
  v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
304
- v128_t sum_norm_squared_lower_f64x2 = zero_f64x2, sum_norm_squared_upper_f64x2 = zero_f64x2;
305
- nk_centroid_and_cross_covariance_f32_v128relaxed_(a, b, n, ca_x, ca_y, ca_z, cb_x, cb_y, cb_z, h);
306
-
247
+ v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
248
+ v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
249
+ v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
250
+ v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
251
+ v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
252
+ v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
253
+ v128_t cross_00_low_f64x2 = zero_f64x2, cross_00_high_f64x2 = zero_f64x2;
254
+ v128_t cross_01_low_f64x2 = zero_f64x2, cross_01_high_f64x2 = zero_f64x2;
255
+ v128_t cross_02_low_f64x2 = zero_f64x2, cross_02_high_f64x2 = zero_f64x2;
256
+ v128_t cross_10_low_f64x2 = zero_f64x2, cross_10_high_f64x2 = zero_f64x2;
257
+ v128_t cross_11_low_f64x2 = zero_f64x2, cross_11_high_f64x2 = zero_f64x2;
258
+ v128_t cross_12_low_f64x2 = zero_f64x2, cross_12_high_f64x2 = zero_f64x2;
259
+ v128_t cross_20_low_f64x2 = zero_f64x2, cross_20_high_f64x2 = zero_f64x2;
260
+ v128_t cross_21_low_f64x2 = zero_f64x2, cross_21_high_f64x2 = zero_f64x2;
261
+ v128_t cross_22_low_f64x2 = zero_f64x2, cross_22_high_f64x2 = zero_f64x2;
262
+ v128_t sum_norm_squared_low_f64x2 = zero_f64x2, sum_norm_squared_high_f64x2 = zero_f64x2;
307
263
  nk_size_t index = 0;
264
+
308
265
  for (; index + 4 <= n; index += 4) {
309
- v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4;
266
+ v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
310
267
  nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
268
+ nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
311
269
 
312
- v128_t a_x_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
313
- v128_t a_x_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
314
- v128_t a_y_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
315
- v128_t a_y_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
316
- v128_t a_z_lower_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
317
- v128_t a_z_upper_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
318
-
319
- v128_t norm_squared_lower_f64x2 = wasm_f64x2_relaxed_madd(a_y_lower_f64x2, a_y_lower_f64x2,
320
- wasm_f64x2_mul(a_x_lower_f64x2, a_x_lower_f64x2));
321
- v128_t norm_squared_upper_f64x2 = wasm_f64x2_relaxed_madd(a_y_upper_f64x2, a_y_upper_f64x2,
322
- wasm_f64x2_mul(a_x_upper_f64x2, a_x_upper_f64x2));
323
- norm_squared_lower_f64x2 = wasm_f64x2_relaxed_madd(a_z_lower_f64x2, a_z_lower_f64x2, norm_squared_lower_f64x2);
324
- norm_squared_upper_f64x2 = wasm_f64x2_relaxed_madd(a_z_upper_f64x2, a_z_upper_f64x2, norm_squared_upper_f64x2);
325
- sum_norm_squared_lower_f64x2 = wasm_f64x2_add(sum_norm_squared_lower_f64x2, norm_squared_lower_f64x2),
326
- sum_norm_squared_upper_f64x2 = wasm_f64x2_add(sum_norm_squared_upper_f64x2, norm_squared_upper_f64x2);
270
+ v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
271
+ v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
272
+ v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
273
+ v128_t a_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
274
+ v128_t a_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
275
+ v128_t a_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
276
+ v128_t b_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
277
+ v128_t b_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
278
+ v128_t b_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
279
+ v128_t b_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
280
+ v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
281
+ v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
282
+
283
+ sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2),
284
+ sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
285
+ sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2),
286
+ sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
287
+ sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2),
288
+ sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
289
+ sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2),
290
+ sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
291
+ sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2),
292
+ sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
293
+ sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2),
294
+ sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
295
+
296
+ cross_00_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_x_low_f64x2, cross_00_low_f64x2),
297
+ cross_00_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_x_high_f64x2, cross_00_high_f64x2);
298
+ cross_01_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_y_low_f64x2, cross_01_low_f64x2),
299
+ cross_01_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_y_high_f64x2, cross_01_high_f64x2);
300
+ cross_02_low_f64x2 = wasm_f64x2_relaxed_madd(a_x_low_f64x2, b_z_low_f64x2, cross_02_low_f64x2),
301
+ cross_02_high_f64x2 = wasm_f64x2_relaxed_madd(a_x_high_f64x2, b_z_high_f64x2, cross_02_high_f64x2);
302
+ cross_10_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_x_low_f64x2, cross_10_low_f64x2),
303
+ cross_10_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_x_high_f64x2, cross_10_high_f64x2);
304
+ cross_11_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_y_low_f64x2, cross_11_low_f64x2),
305
+ cross_11_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_y_high_f64x2, cross_11_high_f64x2);
306
+ cross_12_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, b_z_low_f64x2, cross_12_low_f64x2),
307
+ cross_12_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, b_z_high_f64x2, cross_12_high_f64x2);
308
+ cross_20_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_x_low_f64x2, cross_20_low_f64x2),
309
+ cross_20_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_x_high_f64x2, cross_20_high_f64x2);
310
+ cross_21_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_y_low_f64x2, cross_21_low_f64x2),
311
+ cross_21_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_y_high_f64x2, cross_21_high_f64x2);
312
+ cross_22_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, b_z_low_f64x2, cross_22_low_f64x2),
313
+ cross_22_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, b_z_high_f64x2, cross_22_high_f64x2);
314
+
315
+ // Variance: accumulate ||a||^2.
316
+ v128_t norm_squared_low_f64x2 = wasm_f64x2_relaxed_madd(a_y_low_f64x2, a_y_low_f64x2,
317
+ wasm_f64x2_mul(a_x_low_f64x2, a_x_low_f64x2));
318
+ v128_t norm_squared_high_f64x2 = wasm_f64x2_relaxed_madd(a_y_high_f64x2, a_y_high_f64x2,
319
+ wasm_f64x2_mul(a_x_high_f64x2, a_x_high_f64x2));
320
+ norm_squared_low_f64x2 = wasm_f64x2_relaxed_madd(a_z_low_f64x2, a_z_low_f64x2, norm_squared_low_f64x2);
321
+ norm_squared_high_f64x2 = wasm_f64x2_relaxed_madd(a_z_high_f64x2, a_z_high_f64x2, norm_squared_high_f64x2);
322
+ sum_norm_squared_low_f64x2 = wasm_f64x2_add(sum_norm_squared_low_f64x2, norm_squared_low_f64x2);
323
+ sum_norm_squared_high_f64x2 = wasm_f64x2_add(sum_norm_squared_high_f64x2, norm_squared_high_f64x2);
327
324
  }
328
325
 
326
+ nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
327
+ nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
328
+ nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
329
+ nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
330
+ nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
331
+ nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
332
+ nk_f64_t cross_00 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_00_low_f64x2, cross_00_high_f64x2));
333
+ nk_f64_t cross_01 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_01_low_f64x2, cross_01_high_f64x2));
334
+ nk_f64_t cross_02 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_02_low_f64x2, cross_02_high_f64x2));
335
+ nk_f64_t cross_10 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_10_low_f64x2, cross_10_high_f64x2));
336
+ nk_f64_t cross_11 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_11_low_f64x2, cross_11_high_f64x2));
337
+ nk_f64_t cross_12 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_12_low_f64x2, cross_12_high_f64x2));
338
+ nk_f64_t cross_20 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_20_low_f64x2, cross_20_high_f64x2));
339
+ nk_f64_t cross_21 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_21_low_f64x2, cross_21_high_f64x2));
340
+ nk_f64_t cross_22 = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(cross_22_low_f64x2, cross_22_high_f64x2));
329
341
  nk_f64_t sum_norm_squared = nk_hsum_f64x2_v128relaxed_(
330
- wasm_f64x2_add(sum_norm_squared_lower_f64x2, sum_norm_squared_upper_f64x2));
342
+ wasm_f64x2_add(sum_norm_squared_low_f64x2, sum_norm_squared_high_f64x2));
343
+
331
344
  for (; index < n; ++index) {
332
345
  nk_f64_t a_x = a[index * 3 + 0], a_y = a[index * 3 + 1], a_z = a[index * 3 + 2];
346
+ nk_f64_t b_x = b[index * 3 + 0], b_y = b[index * 3 + 1], b_z = b[index * 3 + 2];
347
+ sum_a_x += a_x, sum_a_y += a_y, sum_a_z += a_z;
348
+ sum_b_x += b_x, sum_b_y += b_y, sum_b_z += b_z;
349
+ cross_00 += a_x * b_x, cross_01 += a_x * b_y, cross_02 += a_x * b_z;
350
+ cross_10 += a_y * b_x, cross_11 += a_y * b_y, cross_12 += a_y * b_z;
351
+ cross_20 += a_z * b_x, cross_21 += a_z * b_y, cross_22 += a_z * b_z;
333
352
  sum_norm_squared += a_x * a_x + a_y * a_y + a_z * a_z;
334
353
  }
335
354
 
336
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
337
- *variance_a = sum_norm_squared * inv_n - ((*ca_x) * (*ca_x) + (*ca_y) * (*ca_y) + (*ca_z) * (*ca_z));
355
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
356
+ *centroid_a_x = sum_a_x * inv_points_count, *centroid_a_y = sum_a_y * inv_points_count,
357
+ *centroid_a_z = sum_a_z * inv_points_count;
358
+ *centroid_b_x = sum_b_x * inv_points_count, *centroid_b_y = sum_b_y * inv_points_count,
359
+ *centroid_b_z = sum_b_z * inv_points_count;
360
+
361
+ nk_f64_t n_f64 = (nk_f64_t)n;
362
+ h[0] = cross_00 - n_f64 * (*centroid_a_x) * (*centroid_b_x),
363
+ h[1] = cross_01 - n_f64 * (*centroid_a_x) * (*centroid_b_y),
364
+ h[2] = cross_02 - n_f64 * (*centroid_a_x) * (*centroid_b_z);
365
+ h[3] = cross_10 - n_f64 * (*centroid_a_y) * (*centroid_b_x),
366
+ h[4] = cross_11 - n_f64 * (*centroid_a_y) * (*centroid_b_y),
367
+ h[5] = cross_12 - n_f64 * (*centroid_a_y) * (*centroid_b_z);
368
+ h[6] = cross_20 - n_f64 * (*centroid_a_z) * (*centroid_b_x),
369
+ h[7] = cross_21 - n_f64 * (*centroid_a_z) * (*centroid_b_y),
370
+ h[8] = cross_22 - n_f64 * (*centroid_a_z) * (*centroid_b_z);
371
+ *variance_a = sum_norm_squared * inv_points_count -
372
+ ((*centroid_a_x) * (*centroid_a_x) + (*centroid_a_y) * (*centroid_a_y) +
373
+ (*centroid_a_z) * (*centroid_a_z));
338
374
  }
339
375
 
340
376
  NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_v128relaxed_( //
@@ -352,7 +388,7 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_v128relaxed_( //
352
388
  v128_t centroid_a_x_f64x2 = wasm_f64x2_splat(centroid_a_x), centroid_a_y_f64x2 = wasm_f64x2_splat(centroid_a_y);
353
389
  v128_t centroid_a_z_f64x2 = wasm_f64x2_splat(centroid_a_z), centroid_b_x_f64x2 = wasm_f64x2_splat(centroid_b_x);
354
390
  v128_t centroid_b_y_f64x2 = wasm_f64x2_splat(centroid_b_y), centroid_b_z_f64x2 = wasm_f64x2_splat(centroid_b_z);
355
- v128_t sum_squared_lower_f64x2 = wasm_f64x2_splat(0.0), sum_squared_upper_f64x2 = wasm_f64x2_splat(0.0);
391
+ v128_t sum_squared_low_f64x2 = wasm_f64x2_splat(0.0), sum_squared_high_f64x2 = wasm_f64x2_splat(0.0);
356
392
  nk_size_t index = 0;
357
393
 
358
394
  for (; index + 4 <= n; index += 4) {
@@ -360,82 +396,79 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f32_v128relaxed_( //
360
396
  nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
361
397
  nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
362
398
 
363
- v128_t centered_a_x_lower_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_x_f32x4), centroid_a_x_f64x2);
364
- v128_t centered_a_x_upper_f64x2 = wasm_f64x2_sub(
399
+ v128_t centered_a_x_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_x_f32x4), centroid_a_x_f64x2);
400
+ v128_t centered_a_x_high_f64x2 = wasm_f64x2_sub(
365
401
  wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1)), centroid_a_x_f64x2);
366
- v128_t centered_a_y_lower_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_y_f32x4), centroid_a_y_f64x2);
367
- v128_t centered_a_y_upper_f64x2 = wasm_f64x2_sub(
402
+ v128_t centered_a_y_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_y_f32x4), centroid_a_y_f64x2);
403
+ v128_t centered_a_y_high_f64x2 = wasm_f64x2_sub(
368
404
  wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1)), centroid_a_y_f64x2);
369
- v128_t centered_a_z_lower_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_z_f32x4), centroid_a_z_f64x2);
370
- v128_t centered_a_z_upper_f64x2 = wasm_f64x2_sub(
405
+ v128_t centered_a_z_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(a_z_f32x4), centroid_a_z_f64x2);
406
+ v128_t centered_a_z_high_f64x2 = wasm_f64x2_sub(
371
407
  wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1)), centroid_a_z_f64x2);
372
- v128_t centered_b_x_lower_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_x_f32x4), centroid_b_x_f64x2);
373
- v128_t centered_b_x_upper_f64x2 = wasm_f64x2_sub(
408
+ v128_t centered_b_x_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_x_f32x4), centroid_b_x_f64x2);
409
+ v128_t centered_b_x_high_f64x2 = wasm_f64x2_sub(
374
410
  wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1)), centroid_b_x_f64x2);
375
- v128_t centered_b_y_lower_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_y_f32x4), centroid_b_y_f64x2);
376
- v128_t centered_b_y_upper_f64x2 = wasm_f64x2_sub(
411
+ v128_t centered_b_y_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_y_f32x4), centroid_b_y_f64x2);
412
+ v128_t centered_b_y_high_f64x2 = wasm_f64x2_sub(
377
413
  wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1)), centroid_b_y_f64x2);
378
- v128_t centered_b_z_lower_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_z_f32x4), centroid_b_z_f64x2);
379
- v128_t centered_b_z_upper_f64x2 = wasm_f64x2_sub(
414
+ v128_t centered_b_z_low_f64x2 = wasm_f64x2_sub(wasm_f64x2_promote_low_f32x4(b_z_f32x4), centroid_b_z_f64x2);
415
+ v128_t centered_b_z_high_f64x2 = wasm_f64x2_sub(
380
416
  wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1)), centroid_b_z_f64x2);
381
417
 
382
- v128_t rotated_a_x_lower_f64x2 = wasm_f64x2_relaxed_madd(
383
- scaled_rotation_x_z_f64x2, centered_a_z_lower_f64x2,
384
- wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_lower_f64x2,
385
- wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_lower_f64x2)));
386
- v128_t rotated_a_x_upper_f64x2 = wasm_f64x2_relaxed_madd(
387
- scaled_rotation_x_z_f64x2, centered_a_z_upper_f64x2,
388
- wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_upper_f64x2,
389
- wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_upper_f64x2)));
390
- v128_t rotated_a_y_lower_f64x2 = wasm_f64x2_relaxed_madd(
391
- scaled_rotation_y_z_f64x2, centered_a_z_lower_f64x2,
392
- wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_lower_f64x2,
393
- wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_lower_f64x2)));
394
- v128_t rotated_a_y_upper_f64x2 = wasm_f64x2_relaxed_madd(
395
- scaled_rotation_y_z_f64x2, centered_a_z_upper_f64x2,
396
- wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_upper_f64x2,
397
- wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_upper_f64x2)));
398
- v128_t rotated_a_z_lower_f64x2 = wasm_f64x2_relaxed_madd(
399
- scaled_rotation_z_z_f64x2, centered_a_z_lower_f64x2,
400
- wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_lower_f64x2,
401
- wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_lower_f64x2)));
402
- v128_t rotated_a_z_upper_f64x2 = wasm_f64x2_relaxed_madd(
403
- scaled_rotation_z_z_f64x2, centered_a_z_upper_f64x2,
404
- wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_upper_f64x2,
405
- wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_upper_f64x2)));
406
-
407
- v128_t delta_x_lower_f64x2 = wasm_f64x2_sub(rotated_a_x_lower_f64x2, centered_b_x_lower_f64x2);
408
- v128_t delta_x_upper_f64x2 = wasm_f64x2_sub(rotated_a_x_upper_f64x2, centered_b_x_upper_f64x2);
409
- v128_t delta_y_lower_f64x2 = wasm_f64x2_sub(rotated_a_y_lower_f64x2, centered_b_y_lower_f64x2);
410
- v128_t delta_y_upper_f64x2 = wasm_f64x2_sub(rotated_a_y_upper_f64x2, centered_b_y_upper_f64x2);
411
- v128_t delta_z_lower_f64x2 = wasm_f64x2_sub(rotated_a_z_lower_f64x2, centered_b_z_lower_f64x2);
412
- v128_t delta_z_upper_f64x2 = wasm_f64x2_sub(rotated_a_z_upper_f64x2, centered_b_z_upper_f64x2);
413
-
414
- sum_squared_lower_f64x2 = wasm_f64x2_relaxed_madd(delta_x_lower_f64x2, delta_x_lower_f64x2,
415
- sum_squared_lower_f64x2);
416
- sum_squared_upper_f64x2 = wasm_f64x2_relaxed_madd(delta_x_upper_f64x2, delta_x_upper_f64x2,
417
- sum_squared_upper_f64x2);
418
- sum_squared_lower_f64x2 = wasm_f64x2_relaxed_madd(delta_y_lower_f64x2, delta_y_lower_f64x2,
419
- sum_squared_lower_f64x2);
420
- sum_squared_upper_f64x2 = wasm_f64x2_relaxed_madd(delta_y_upper_f64x2, delta_y_upper_f64x2,
421
- sum_squared_upper_f64x2);
422
- sum_squared_lower_f64x2 = wasm_f64x2_relaxed_madd(delta_z_lower_f64x2, delta_z_lower_f64x2,
423
- sum_squared_lower_f64x2);
424
- sum_squared_upper_f64x2 = wasm_f64x2_relaxed_madd(delta_z_upper_f64x2, delta_z_upper_f64x2,
425
- sum_squared_upper_f64x2);
418
+ v128_t rotated_a_x_low_f64x2 = wasm_f64x2_relaxed_madd(
419
+ scaled_rotation_x_z_f64x2, centered_a_z_low_f64x2,
420
+ wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_low_f64x2,
421
+ wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_low_f64x2)));
422
+ v128_t rotated_a_x_high_f64x2 = wasm_f64x2_relaxed_madd(
423
+ scaled_rotation_x_z_f64x2, centered_a_z_high_f64x2,
424
+ wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_high_f64x2,
425
+ wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_high_f64x2)));
426
+ v128_t rotated_a_y_low_f64x2 = wasm_f64x2_relaxed_madd(
427
+ scaled_rotation_y_z_f64x2, centered_a_z_low_f64x2,
428
+ wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_low_f64x2,
429
+ wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_low_f64x2)));
430
+ v128_t rotated_a_y_high_f64x2 = wasm_f64x2_relaxed_madd(
431
+ scaled_rotation_y_z_f64x2, centered_a_z_high_f64x2,
432
+ wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_high_f64x2,
433
+ wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_high_f64x2)));
434
+ v128_t rotated_a_z_low_f64x2 = wasm_f64x2_relaxed_madd(
435
+ scaled_rotation_z_z_f64x2, centered_a_z_low_f64x2,
436
+ wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_low_f64x2,
437
+ wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_low_f64x2)));
438
+ v128_t rotated_a_z_high_f64x2 = wasm_f64x2_relaxed_madd(
439
+ scaled_rotation_z_z_f64x2, centered_a_z_high_f64x2,
440
+ wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_high_f64x2,
441
+ wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_high_f64x2)));
442
+
443
+ v128_t delta_x_low_f64x2 = wasm_f64x2_sub(rotated_a_x_low_f64x2, centered_b_x_low_f64x2);
444
+ v128_t delta_x_high_f64x2 = wasm_f64x2_sub(rotated_a_x_high_f64x2, centered_b_x_high_f64x2);
445
+ v128_t delta_y_low_f64x2 = wasm_f64x2_sub(rotated_a_y_low_f64x2, centered_b_y_low_f64x2);
446
+ v128_t delta_y_high_f64x2 = wasm_f64x2_sub(rotated_a_y_high_f64x2, centered_b_y_high_f64x2);
447
+ v128_t delta_z_low_f64x2 = wasm_f64x2_sub(rotated_a_z_low_f64x2, centered_b_z_low_f64x2);
448
+ v128_t delta_z_high_f64x2 = wasm_f64x2_sub(rotated_a_z_high_f64x2, centered_b_z_high_f64x2);
449
+
450
+ sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_x_low_f64x2, delta_x_low_f64x2, sum_squared_low_f64x2);
451
+ sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_x_high_f64x2, delta_x_high_f64x2,
452
+ sum_squared_high_f64x2);
453
+ sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_y_low_f64x2, delta_y_low_f64x2, sum_squared_low_f64x2);
454
+ sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_y_high_f64x2, delta_y_high_f64x2,
455
+ sum_squared_high_f64x2);
456
+ sum_squared_low_f64x2 = wasm_f64x2_relaxed_madd(delta_z_low_f64x2, delta_z_low_f64x2, sum_squared_low_f64x2);
457
+ sum_squared_high_f64x2 = wasm_f64x2_relaxed_madd(delta_z_high_f64x2, delta_z_high_f64x2,
458
+ sum_squared_high_f64x2);
426
459
  }
427
460
 
428
- nk_f64_t sum_squared = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_squared_lower_f64x2, sum_squared_upper_f64x2));
461
+ nk_f64_t sum_squared = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_squared_low_f64x2, sum_squared_high_f64x2));
429
462
  for (; index < n; ++index) {
430
- nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x;
431
- nk_f64_t centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y;
432
- nk_f64_t centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
433
- nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x;
434
- nk_f64_t centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y;
435
- nk_f64_t centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
436
- nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z);
437
- nk_f64_t rotated_a_y = scale * (r[3] * centered_a_x + r[4] * centered_a_y + r[5] * centered_a_z);
438
- nk_f64_t rotated_a_z = scale * (r[6] * centered_a_x + r[7] * centered_a_y + r[8] * centered_a_z);
463
+ nk_f64_t centered_a_x = (nk_f64_t)a[index * 3 + 0] - centroid_a_x,
464
+ centered_a_y = (nk_f64_t)a[index * 3 + 1] - centroid_a_y,
465
+ centered_a_z = (nk_f64_t)a[index * 3 + 2] - centroid_a_z;
466
+ nk_f64_t centered_b_x = (nk_f64_t)b[index * 3 + 0] - centroid_b_x,
467
+ centered_b_y = (nk_f64_t)b[index * 3 + 1] - centroid_b_y,
468
+ centered_b_z = (nk_f64_t)b[index * 3 + 2] - centroid_b_z;
469
+ nk_f64_t rotated_a_x = scale * (r[0] * centered_a_x + r[1] * centered_a_y + r[2] * centered_a_z),
470
+ rotated_a_y = scale * (r[3] * centered_a_x + r[4] * centered_a_y + r[5] * centered_a_z),
471
+ rotated_a_z = scale * (r[6] * centered_a_x + r[7] * centered_a_y + r[8] * centered_a_z);
439
472
  nk_f64_t delta_x = rotated_a_x - centered_b_x, delta_y = rotated_a_y - centered_b_y,
440
473
  delta_z = rotated_a_z - centered_b_z;
441
474
  sum_squared += delta_x * delta_x + delta_y * delta_y + delta_z * delta_z;
@@ -474,35 +507,38 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_v128relaxed_(nk_f64_t const *a, nk_f
474
507
 
475
508
  // Main loop: process 2 points per iteration
476
509
  for (; j + 2 <= n; j += 2) {
477
- v128_t a_x, a_y, a_z, b_x, b_y, b_z;
478
- nk_deinterleave_f64x2_v128relaxed_(a + j * 3, &a_x, &a_y, &a_z);
479
- nk_deinterleave_f64x2_v128relaxed_(b + j * 3, &b_x, &b_y, &b_z);
510
+ v128_t a_x_f64x2, a_y_f64x2, a_z_f64x2, b_x_f64x2, b_y_f64x2, b_z_f64x2;
511
+ nk_deinterleave_f64x2_v128relaxed_(a + j * 3, &a_x_f64x2, &a_y_f64x2, &a_z_f64x2);
512
+ nk_deinterleave_f64x2_v128relaxed_(b + j * 3, &b_x_f64x2, &b_y_f64x2, &b_z_f64x2);
480
513
 
481
- v128_t pa_x = wasm_f64x2_sub(a_x, centroid_a_x_f64x2);
482
- v128_t pa_y = wasm_f64x2_sub(a_y, centroid_a_y_f64x2);
483
- v128_t pa_z = wasm_f64x2_sub(a_z, centroid_a_z_f64x2);
484
- v128_t pb_x = wasm_f64x2_sub(b_x, centroid_b_x_f64x2);
485
- v128_t pb_y = wasm_f64x2_sub(b_y, centroid_b_y_f64x2);
486
- v128_t pb_z = wasm_f64x2_sub(b_z, centroid_b_z_f64x2);
514
+ v128_t centered_a_x_f64x2 = wasm_f64x2_sub(a_x_f64x2, centroid_a_x_f64x2);
515
+ v128_t centered_a_y_f64x2 = wasm_f64x2_sub(a_y_f64x2, centroid_a_y_f64x2);
516
+ v128_t centered_a_z_f64x2 = wasm_f64x2_sub(a_z_f64x2, centroid_a_z_f64x2);
517
+ v128_t centered_b_x_f64x2 = wasm_f64x2_sub(b_x_f64x2, centroid_b_x_f64x2);
518
+ v128_t centered_b_y_f64x2 = wasm_f64x2_sub(b_y_f64x2, centroid_b_y_f64x2);
519
+ v128_t centered_b_z_f64x2 = wasm_f64x2_sub(b_z_f64x2, centroid_b_z_f64x2);
487
520
 
488
521
  // Rotate and scale: ra = scale * R * pa
489
- v128_t ra_x = wasm_f64x2_relaxed_madd(
490
- scaled_rotation_x_z_f64x2, pa_z,
491
- wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, pa_y, wasm_f64x2_mul(scaled_rotation_x_x_f64x2, pa_x)));
492
- v128_t ra_y = wasm_f64x2_relaxed_madd(
493
- scaled_rotation_y_z_f64x2, pa_z,
494
- wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, pa_y, wasm_f64x2_mul(scaled_rotation_y_x_f64x2, pa_x)));
495
- v128_t ra_z = wasm_f64x2_relaxed_madd(
496
- scaled_rotation_z_z_f64x2, pa_z,
497
- wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, pa_y, wasm_f64x2_mul(scaled_rotation_z_x_f64x2, pa_x)));
498
-
499
- v128_t delta_x = wasm_f64x2_sub(ra_x, pb_x);
500
- v128_t delta_y = wasm_f64x2_sub(ra_y, pb_y);
501
- v128_t delta_z = wasm_f64x2_sub(ra_z, pb_z);
502
-
503
- nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_x);
504
- nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_y);
505
- nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_z);
522
+ v128_t rotated_a_x_f64x2 = wasm_f64x2_relaxed_madd(
523
+ scaled_rotation_x_z_f64x2, centered_a_z_f64x2,
524
+ wasm_f64x2_relaxed_madd(scaled_rotation_x_y_f64x2, centered_a_y_f64x2,
525
+ wasm_f64x2_mul(scaled_rotation_x_x_f64x2, centered_a_x_f64x2)));
526
+ v128_t rotated_a_y_f64x2 = wasm_f64x2_relaxed_madd(
527
+ scaled_rotation_y_z_f64x2, centered_a_z_f64x2,
528
+ wasm_f64x2_relaxed_madd(scaled_rotation_y_y_f64x2, centered_a_y_f64x2,
529
+ wasm_f64x2_mul(scaled_rotation_y_x_f64x2, centered_a_x_f64x2)));
530
+ v128_t rotated_a_z_f64x2 = wasm_f64x2_relaxed_madd(
531
+ scaled_rotation_z_z_f64x2, centered_a_z_f64x2,
532
+ wasm_f64x2_relaxed_madd(scaled_rotation_z_y_f64x2, centered_a_y_f64x2,
533
+ wasm_f64x2_mul(scaled_rotation_z_x_f64x2, centered_a_x_f64x2)));
534
+
535
+ v128_t delta_x_f64x2 = wasm_f64x2_sub(rotated_a_x_f64x2, centered_b_x_f64x2);
536
+ v128_t delta_y_f64x2 = wasm_f64x2_sub(rotated_a_y_f64x2, centered_b_y_f64x2);
537
+ v128_t delta_z_f64x2 = wasm_f64x2_sub(rotated_a_z_f64x2, centered_b_z_f64x2);
538
+
539
+ nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_x_f64x2);
540
+ nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_y_f64x2);
541
+ nk_accumulate_square_f64x2_v128relaxed_(&sum_squared_f64x2, &sum_squared_compensation_f64x2, delta_z_f64x2);
506
542
  }
507
543
 
508
544
  nk_f64_t sum_squared = nk_dot_stable_sum_f64x2_v128relaxed_(sum_squared_f64x2, sum_squared_compensation_f64x2);
@@ -510,20 +546,16 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_v128relaxed_(nk_f64_t const *a, nk_f
510
546
 
511
547
  // Scalar tail
512
548
  for (; j < n; ++j) {
513
- nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x;
514
- nk_f64_t pa_y = a[j * 3 + 1] - centroid_a_y;
515
- nk_f64_t pa_z = a[j * 3 + 2] - centroid_a_z;
516
- nk_f64_t pb_x = b[j * 3 + 0] - centroid_b_x;
517
- nk_f64_t pb_y = b[j * 3 + 1] - centroid_b_y;
518
- nk_f64_t pb_z = b[j * 3 + 2] - centroid_b_z;
519
-
520
- nk_f64_t ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z);
521
- nk_f64_t ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z);
522
- nk_f64_t ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);
523
-
524
- nk_f64_t delta_x = ra_x - pb_x;
525
- nk_f64_t delta_y = ra_y - pb_y;
526
- nk_f64_t delta_z = ra_z - pb_z;
549
+ nk_f64_t pa_x = a[j * 3 + 0] - centroid_a_x, pa_y = a[j * 3 + 1] - centroid_a_y,
550
+ pa_z = a[j * 3 + 2] - centroid_a_z;
551
+ nk_f64_t pb_x = b[j * 3 + 0] - centroid_b_x, pb_y = b[j * 3 + 1] - centroid_b_y,
552
+ pb_z = b[j * 3 + 2] - centroid_b_z;
553
+
554
+ nk_f64_t ra_x = scale * (r[0] * pa_x + r[1] * pa_y + r[2] * pa_z),
555
+ ra_y = scale * (r[3] * pa_x + r[4] * pa_y + r[5] * pa_z),
556
+ ra_z = scale * (r[6] * pa_x + r[7] * pa_y + r[8] * pa_z);
557
+
558
+ nk_f64_t delta_x = ra_x - pb_x, delta_y = ra_y - pb_y, delta_z = ra_z - pb_z;
527
559
  nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_x);
528
560
  nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_y);
529
561
  nk_accumulate_square_f64_(&sum_squared, &sum_squared_compensation, delta_z);
@@ -534,37 +566,121 @@ NK_INTERNAL nk_f64_t nk_transformed_ssd_f64_v128relaxed_(nk_f64_t const *a, nk_f
534
566
 
535
567
  NK_PUBLIC void nk_rmsd_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
536
568
  nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
537
- // RMSD uses identity rotation and scale=1.0.
538
- if (rotation) {
539
- rotation[0] = 1, rotation[1] = 0, rotation[2] = 0;
540
- rotation[3] = 0, rotation[4] = 1, rotation[5] = 0;
569
+ if (rotation)
570
+ rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
541
571
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
542
- }
543
572
  if (scale) *scale = 1.0f;
544
- nk_f64_t centroid_a_x, centroid_a_y, centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z;
545
- nk_f64_t identity[9] = {1, 0, 0, 0, 1, 0, 0, 0, 1};
546
- nk_bicentroid_f32_v128relaxed_(a, b, n, &centroid_a_x, &centroid_a_y, &centroid_a_z, &centroid_b_x, &centroid_b_y,
547
- &centroid_b_z);
573
+
574
+ // Fused single-pass: accumulate centroids and squared differences simultaneously.
575
+ // RMSD = √(E[(a−b)²] − (ā − b̄)²)
576
+ v128_t zero_f64x2 = wasm_f64x2_splat(0.0);
577
+ v128_t sum_a_x_low_f64x2 = zero_f64x2, sum_a_x_high_f64x2 = zero_f64x2;
578
+ v128_t sum_a_y_low_f64x2 = zero_f64x2, sum_a_y_high_f64x2 = zero_f64x2;
579
+ v128_t sum_a_z_low_f64x2 = zero_f64x2, sum_a_z_high_f64x2 = zero_f64x2;
580
+ v128_t sum_b_x_low_f64x2 = zero_f64x2, sum_b_x_high_f64x2 = zero_f64x2;
581
+ v128_t sum_b_y_low_f64x2 = zero_f64x2, sum_b_y_high_f64x2 = zero_f64x2;
582
+ v128_t sum_b_z_low_f64x2 = zero_f64x2, sum_b_z_high_f64x2 = zero_f64x2;
583
+ v128_t sum_sq_x_low_f64x2 = zero_f64x2, sum_sq_x_high_f64x2 = zero_f64x2;
584
+ v128_t sum_sq_y_low_f64x2 = zero_f64x2, sum_sq_y_high_f64x2 = zero_f64x2;
585
+ v128_t sum_sq_z_low_f64x2 = zero_f64x2, sum_sq_z_high_f64x2 = zero_f64x2;
586
+ nk_size_t index = 0;
587
+
588
+ for (; index + 4 <= n; index += 4) {
589
+ v128_t a_x_f32x4, a_y_f32x4, a_z_f32x4, b_x_f32x4, b_y_f32x4, b_z_f32x4;
590
+ nk_deinterleave_f32x4_v128relaxed_(a + index * 3, &a_x_f32x4, &a_y_f32x4, &a_z_f32x4);
591
+ nk_deinterleave_f32x4_v128relaxed_(b + index * 3, &b_x_f32x4, &b_y_f32x4, &b_z_f32x4);
592
+
593
+ // Promote lower and upper halves to f64. Deltas computed in f64 to avoid
594
+ // f32 cancellation in the single-pass formula RMSD = √(E[(a−b)²] − (ā − b̄)²).
595
+ v128_t a_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_x_f32x4);
596
+ v128_t a_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_x_f32x4, a_x_f32x4, 2, 3, 0, 1));
597
+ v128_t a_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_y_f32x4);
598
+ v128_t a_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_y_f32x4, a_y_f32x4, 2, 3, 0, 1));
599
+ v128_t a_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(a_z_f32x4);
600
+ v128_t a_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(a_z_f32x4, a_z_f32x4, 2, 3, 0, 1));
601
+ v128_t b_x_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_x_f32x4);
602
+ v128_t b_x_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_x_f32x4, b_x_f32x4, 2, 3, 0, 1));
603
+ v128_t b_y_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_y_f32x4);
604
+ v128_t b_y_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_y_f32x4, b_y_f32x4, 2, 3, 0, 1));
605
+ v128_t b_z_low_f64x2 = wasm_f64x2_promote_low_f32x4(b_z_f32x4);
606
+ v128_t b_z_high_f64x2 = wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(b_z_f32x4, b_z_f32x4, 2, 3, 0, 1));
607
+
608
+ // Accumulate centroids.
609
+ sum_a_x_low_f64x2 = wasm_f64x2_add(sum_a_x_low_f64x2, a_x_low_f64x2);
610
+ sum_a_x_high_f64x2 = wasm_f64x2_add(sum_a_x_high_f64x2, a_x_high_f64x2);
611
+ sum_a_y_low_f64x2 = wasm_f64x2_add(sum_a_y_low_f64x2, a_y_low_f64x2);
612
+ sum_a_y_high_f64x2 = wasm_f64x2_add(sum_a_y_high_f64x2, a_y_high_f64x2);
613
+ sum_a_z_low_f64x2 = wasm_f64x2_add(sum_a_z_low_f64x2, a_z_low_f64x2);
614
+ sum_a_z_high_f64x2 = wasm_f64x2_add(sum_a_z_high_f64x2, a_z_high_f64x2);
615
+ sum_b_x_low_f64x2 = wasm_f64x2_add(sum_b_x_low_f64x2, b_x_low_f64x2);
616
+ sum_b_x_high_f64x2 = wasm_f64x2_add(sum_b_x_high_f64x2, b_x_high_f64x2);
617
+ sum_b_y_low_f64x2 = wasm_f64x2_add(sum_b_y_low_f64x2, b_y_low_f64x2);
618
+ sum_b_y_high_f64x2 = wasm_f64x2_add(sum_b_y_high_f64x2, b_y_high_f64x2);
619
+ sum_b_z_low_f64x2 = wasm_f64x2_add(sum_b_z_low_f64x2, b_z_low_f64x2);
620
+ sum_b_z_high_f64x2 = wasm_f64x2_add(sum_b_z_high_f64x2, b_z_high_f64x2);
621
+
622
+ // Accumulate squared differences in f64 — deltas computed in f64 for precision.
623
+ v128_t dx_low_f64x2 = wasm_f64x2_sub(a_x_low_f64x2, b_x_low_f64x2);
624
+ v128_t dx_high_f64x2 = wasm_f64x2_sub(a_x_high_f64x2, b_x_high_f64x2);
625
+ v128_t dy_low_f64x2 = wasm_f64x2_sub(a_y_low_f64x2, b_y_low_f64x2);
626
+ v128_t dy_high_f64x2 = wasm_f64x2_sub(a_y_high_f64x2, b_y_high_f64x2);
627
+ v128_t dz_low_f64x2 = wasm_f64x2_sub(a_z_low_f64x2, b_z_low_f64x2);
628
+ v128_t dz_high_f64x2 = wasm_f64x2_sub(a_z_high_f64x2, b_z_high_f64x2);
629
+
630
+ sum_sq_x_low_f64x2 = wasm_f64x2_relaxed_madd(dx_low_f64x2, dx_low_f64x2, sum_sq_x_low_f64x2);
631
+ sum_sq_x_high_f64x2 = wasm_f64x2_relaxed_madd(dx_high_f64x2, dx_high_f64x2, sum_sq_x_high_f64x2);
632
+ sum_sq_y_low_f64x2 = wasm_f64x2_relaxed_madd(dy_low_f64x2, dy_low_f64x2, sum_sq_y_low_f64x2);
633
+ sum_sq_y_high_f64x2 = wasm_f64x2_relaxed_madd(dy_high_f64x2, dy_high_f64x2, sum_sq_y_high_f64x2);
634
+ sum_sq_z_low_f64x2 = wasm_f64x2_relaxed_madd(dz_low_f64x2, dz_low_f64x2, sum_sq_z_low_f64x2);
635
+ sum_sq_z_high_f64x2 = wasm_f64x2_relaxed_madd(dz_high_f64x2, dz_high_f64x2, sum_sq_z_high_f64x2);
636
+ }
637
+
638
+ nk_f64_t sum_a_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_x_low_f64x2, sum_a_x_high_f64x2));
639
+ nk_f64_t sum_a_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_y_low_f64x2, sum_a_y_high_f64x2));
640
+ nk_f64_t sum_a_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_a_z_low_f64x2, sum_a_z_high_f64x2));
641
+ nk_f64_t sum_b_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_x_low_f64x2, sum_b_x_high_f64x2));
642
+ nk_f64_t sum_b_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_y_low_f64x2, sum_b_y_high_f64x2));
643
+ nk_f64_t sum_b_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_b_z_low_f64x2, sum_b_z_high_f64x2));
644
+ nk_f64_t sum_sq_x = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_x_low_f64x2, sum_sq_x_high_f64x2));
645
+ nk_f64_t sum_sq_y = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_y_low_f64x2, sum_sq_y_high_f64x2));
646
+ nk_f64_t sum_sq_z = nk_hsum_f64x2_v128relaxed_(wasm_f64x2_add(sum_sq_z_low_f64x2, sum_sq_z_high_f64x2));
647
+
648
+ // Scalar tail.
649
+ for (; index < n; ++index) {
650
+ nk_f64_t ax = a[index * 3 + 0], ay = a[index * 3 + 1], az = a[index * 3 + 2];
651
+ nk_f64_t bx = b[index * 3 + 0], by = b[index * 3 + 1], bz = b[index * 3 + 2];
652
+ sum_a_x += ax, sum_a_y += ay, sum_a_z += az;
653
+ sum_b_x += bx, sum_b_y += by, sum_b_z += bz;
654
+ nk_f64_t dx = ax - bx, dy = ay - by, dz = az - bz;
655
+ sum_sq_x += dx * dx, sum_sq_y += dy * dy, sum_sq_z += dz * dz;
656
+ }
657
+
658
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
659
+ nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
660
+ centroid_a_z = sum_a_z * inv_points_count;
661
+ nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
662
+ centroid_b_z = sum_b_z * inv_points_count;
548
663
  if (a_centroid)
549
664
  a_centroid[0] = (nk_f32_t)centroid_a_x, a_centroid[1] = (nk_f32_t)centroid_a_y,
550
665
  a_centroid[2] = (nk_f32_t)centroid_a_z;
551
666
  if (b_centroid)
552
667
  b_centroid[0] = (nk_f32_t)centroid_b_x, b_centroid[1] = (nk_f32_t)centroid_b_y,
553
668
  b_centroid[2] = (nk_f32_t)centroid_b_z;
554
- *result = nk_f64_sqrt_v128relaxed(nk_transformed_ssd_f32_v128relaxed_(a, b, n, identity, 1.0, centroid_a_x,
555
- centroid_a_y, centroid_a_z, centroid_b_x,
556
- centroid_b_y, centroid_b_z) /
557
- (nk_f64_t)n);
669
+
670
+ nk_f64_t sum_squared = sum_sq_x + sum_sq_y + sum_sq_z;
671
+ nk_f64_t mean_diff_x = centroid_a_x - centroid_b_x;
672
+ nk_f64_t mean_diff_y = centroid_a_y - centroid_b_y;
673
+ nk_f64_t mean_diff_z = centroid_a_z - centroid_b_z;
674
+ nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
675
+ *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
558
676
  }
559
677
 
560
678
  NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
561
679
  nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
562
680
  // RMSD uses identity rotation and scale=1.0
563
- if (rotation) {
564
- rotation[0] = 1, rotation[1] = 0, rotation[2] = 0;
565
- rotation[3] = 0, rotation[4] = 1, rotation[5] = 0;
681
+ if (rotation)
682
+ rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
566
683
  rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
567
- }
568
684
  if (scale) *scale = 1.0;
569
685
 
570
686
  v128_t const zeros_f64x2 = wasm_f64x2_splat(0);
@@ -634,9 +750,11 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
634
750
  total_squared_z += total_squared_z_compensation;
635
751
 
636
752
  // Compute centroids
637
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
638
- nk_f64_t centroid_a_x = total_ax * inv_n, centroid_a_y = total_ay * inv_n, centroid_a_z = total_az * inv_n;
639
- nk_f64_t centroid_b_x = total_bx * inv_n, centroid_b_y = total_by * inv_n, centroid_b_z = total_bz * inv_n;
753
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
754
+ nk_f64_t centroid_a_x = total_ax * inv_points_count, centroid_a_y = total_ay * inv_points_count,
755
+ centroid_a_z = total_az * inv_points_count;
756
+ nk_f64_t centroid_b_x = total_bx * inv_points_count, centroid_b_y = total_by * inv_points_count,
757
+ centroid_b_z = total_bz * inv_points_count;
640
758
  if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
641
759
  if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
642
760
 
@@ -647,7 +765,7 @@ NK_PUBLIC void nk_rmsd_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, nk_
647
765
  nk_f64_t sum_squared = total_squared_x + total_squared_y + total_squared_z;
648
766
  nk_f64_t mean_diff_sq = mean_diff_x * mean_diff_x + mean_diff_y * mean_diff_y + mean_diff_z * mean_diff_z;
649
767
 
650
- *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_n - mean_diff_sq);
768
+ *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count - mean_diff_sq);
651
769
  }
652
770
 
653
771
  NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
@@ -678,9 +796,7 @@ NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, n
678
796
 
679
797
  // Handle reflection: if det(R) < 0, negate third column of V and recompute R.
680
798
  if (nk_det3x3_f64_(r) < 0) {
681
- svd_v[2] = -svd_v[2];
682
- svd_v[5] = -svd_v[5];
683
- svd_v[8] = -svd_v[8];
799
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
684
800
  r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
685
801
  r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
686
802
  r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
@@ -692,9 +808,8 @@ NK_PUBLIC void nk_kabsch_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, n
692
808
  r[8] = svd_v[6] * svd_u[6] + svd_v[7] * svd_u[7] + svd_v[8] * svd_u[8];
693
809
  }
694
810
 
695
- if (rotation) {
811
+ if (rotation)
696
812
  for (int j = 0; j < 9; ++j) rotation[j] = (nk_f32_t)r[j];
697
- }
698
813
  if (scale) *scale = 1.0f;
699
814
 
700
815
  *result = nk_f64_sqrt_v128relaxed(nk_transformed_ssd_f32_v128relaxed_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y,
@@ -790,9 +905,11 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
790
905
  covariance_z_z += covariance_z_z_compensation;
791
906
 
792
907
  // Compute centroids
793
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
794
- nk_f64_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
795
- nk_f64_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
908
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
909
+ nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
910
+ centroid_a_z = sum_a_z * inv_points_count;
911
+ nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
912
+ centroid_b_z = sum_b_z * inv_points_count;
796
913
  if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
797
914
  if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
798
915
 
@@ -818,9 +935,7 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
818
935
 
819
936
  // Handle reflection: if det(R) < 0, negate third column of V and recompute R
820
937
  if (nk_det3x3_f64_(r) < 0) {
821
- svd_v[2] = -svd_v[2];
822
- svd_v[5] = -svd_v[5];
823
- svd_v[8] = -svd_v[8];
938
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
824
939
  nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
825
940
  }
826
941
 
@@ -833,7 +948,7 @@ NK_PUBLIC void nk_kabsch_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b, n
833
948
  // Compute RMSD after optimal rotation
834
949
  nk_f64_t sum_squared = nk_transformed_ssd_f64_v128relaxed_(a, b, n, r, 1.0, centroid_a_x, centroid_a_y,
835
950
  centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
836
- *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_n);
951
+ *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count);
837
952
  }
838
953
 
839
954
  NK_PUBLIC void nk_umeyama_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
@@ -866,9 +981,7 @@ NK_PUBLIC void nk_umeyama_f32_v128relaxed(nk_f32_t const *a, nk_f32_t const *b,
866
981
 
867
982
  nk_f64_t det = nk_det3x3_f64_(r);
868
983
  if (det < 0) {
869
- svd_v[2] = -svd_v[2];
870
- svd_v[5] = -svd_v[5];
871
- svd_v[8] = -svd_v[8];
984
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
872
985
  r[0] = svd_v[0] * svd_u[0] + svd_v[1] * svd_u[1] + svd_v[2] * svd_u[2];
873
986
  r[1] = svd_v[0] * svd_u[3] + svd_v[1] * svd_u[4] + svd_v[2] * svd_u[5];
874
987
  r[2] = svd_v[0] * svd_u[6] + svd_v[1] * svd_u[7] + svd_v[2] * svd_u[8];
@@ -988,15 +1101,17 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
988
1101
  sum_sq_a += sum_sq_a_compensation;
989
1102
 
990
1103
  // Compute centroids
991
- nk_f64_t inv_n = 1.0 / (nk_f64_t)n;
992
- nk_f64_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
993
- nk_f64_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
1104
+ nk_f64_t inv_points_count = 1.0 / (nk_f64_t)n;
1105
+ nk_f64_t centroid_a_x = sum_a_x * inv_points_count, centroid_a_y = sum_a_y * inv_points_count,
1106
+ centroid_a_z = sum_a_z * inv_points_count;
1107
+ nk_f64_t centroid_b_x = sum_b_x * inv_points_count, centroid_b_y = sum_b_y * inv_points_count,
1108
+ centroid_b_z = sum_b_z * inv_points_count;
994
1109
  if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
995
1110
  if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
996
1111
 
997
1112
  // Compute variance of A (centered)
998
1113
  nk_f64_t centroid_sq = centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y + centroid_a_z * centroid_a_z;
999
- nk_f64_t var_a = sum_sq_a * inv_n - centroid_sq;
1114
+ nk_f64_t var_a = sum_sq_a * inv_points_count - centroid_sq;
1000
1115
 
1001
1116
  // Apply centering correction: H_centered = H - n * centroid_a * centroid_bT
1002
1117
  covariance_x_x -= n * centroid_a_x * centroid_b_x;
@@ -1024,9 +1139,7 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
1024
1139
  nk_f64_t computed_scale = trace_d_s / (n * var_a);
1025
1140
 
1026
1141
  if (det < 0) {
1027
- svd_v[2] = -svd_v[2];
1028
- svd_v[5] = -svd_v[5];
1029
- svd_v[8] = -svd_v[8];
1142
+ svd_v[2] = -svd_v[2], svd_v[5] = -svd_v[5], svd_v[8] = -svd_v[8];
1030
1143
  nk_rotation_from_svd_f64_v128relaxed_(svd_u, svd_v, r);
1031
1144
  }
1032
1145
 
@@ -1037,7 +1150,7 @@ NK_PUBLIC void nk_umeyama_f64_v128relaxed(nk_f64_t const *a, nk_f64_t const *b,
1037
1150
  // Compute RMSD after transformation
1038
1151
  nk_f64_t sum_squared = nk_transformed_ssd_f64_v128relaxed_(a, b, n, r, computed_scale, centroid_a_x, centroid_a_y,
1039
1152
  centroid_a_z, centroid_b_x, centroid_b_y, centroid_b_z);
1040
- *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_n);
1153
+ *result = nk_f64_sqrt_v128relaxed(sum_squared * inv_points_count);
1041
1154
  }
1042
1155
 
1043
1156
  #if defined(__clang__)