numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,652 @@
1
+ /**
2
+ * @brief SIMD-accelerated Point Cloud Alignment.
3
+ * @file include/numkong/mesh.h
4
+ * @author Ash Vardanian
5
+ * @date June 19, 2024
6
+ *
7
+ * Contains:
8
+ *
9
+ * - Root Mean Square Deviation (RMSD) for rigid body superposition
10
+ * - Kabsch algorithm for optimal rigid body alignment (rotation only)
11
+ * - Umeyama algorithm for similarity transform (rotation + uniform scaling)
12
+ *
13
+ * Precision policy is intentionally mixed across algorithm phases:
14
+ *
15
+ * - `f64` inputs keep both the geometric transform and the scalar fit metric in `f64`
16
+ * - `f32` inputs keep transform outputs narrow (`a_centroid`, `b_centroid`, `rotation`, `scale`) but widen
17
+ * the scalar fit metric to `f64`
18
+ * - `f16` and `bf16` inputs keep transform and metric outputs in `f32`
19
+ *
20
+ * This keeps `f32` mesh kernels materially faster than `f64` kernels by preserving narrower input bandwidth,
21
+ * while still widening the numerically sensitive stages that dominate alignment quality.
22
+ *
23
+ * For hardware architectures:
24
+ *
25
+ * - x86 (AVX2, AVX512)
26
+ * - Arm (NEON), RISC-V (RVV), WebAssembly (relaxed SIMD)
27
+ *
28
+ * @section applications Applications
29
+ *
30
+ * These routines are the core of point-cloud alignment pipelines:
31
+ *
32
+ * - Structural biology: protein backbone or ligand alignment (RMSD, Kabsch)
33
+ * - Computer graphics: mesh registration and deformation transfer
34
+ * - Robotics/SLAM: point-cloud registration and tracking
35
+ *
36
+ * @section transformation_convention Transformation Convention
37
+ *
38
+ * All functions compute a transformation that aligns the FIRST point cloud (a) to the SECOND (b).
39
+ * The transformation to apply is:
40
+ *
41
+ * a′ᵢ = scale × R × (aᵢ - ā) + b̄
42
+ *
43
+ * Where:
44
+ *
45
+ * - R is a 3×3 rotation matrix (row-major, 9 values)
46
+ * - scale is a uniform scaling factor (1.0 for RMSD and Kabsch)
47
+ * - ā, b̄ are the centroids of the respective point clouds
48
+ *
49
+ * @section algorithm_overview Algorithm Overview
50
+ *
51
+ * - RMSD: Simple root mean square deviation without alignment. R = identity, scale = 1.0
52
+ * - Kabsch: Finds optimal rotation R minimizing ‖R × (a - ā) - (b - b̄)‖. scale = 1.0
53
+ * - Umeyama: Finds optimal rotation R and scale c minimizing ‖c × R × (a - ā) - (b - b̄)‖
54
+ *
55
+ * Kabsch and Umeyama compute a 3×3 cross-covariance matrix H = Σ(aᵢ - ā)(bᵢ - b̄)ᵀ
56
+ * and recover R from the SVD of H. Umeyama additionally estimates a uniform scale from the
57
+ * singular values and the variance of the centered source points.
58
+ *
59
+ * The 3×3 SVD implementation is based on the McAdams et al. paper:
60
+ * "Computing the Singular Value Decomposition of 3×3 matrices with minimal branching
61
+ * and elementary floating point operations", University of Wisconsin - Madison TR1690, 2011.
62
+ *
63
+ * @section numerical_notes Numerical Notes
64
+ *
65
+ * Let `n` be the number of 3D points:
66
+ *
67
+ * - `O(n)` stages are the point-cloud passes for centroids, cross-covariance, source variance, and transformed SSD.
68
+ * - `O(1)` stages are the fixed-size 3×3 SVD/eigensolve, determinant/reflection fix, and scale construction.
69
+ *
70
+ * Kernel policy:
71
+ *
72
+ * - `f64`: both `O(n)` reductions and the `O(1)` 3×3 solve run in `f64`.
73
+ * - `f32`: point coordinates load as `f32`, widen before arithmetic, keep `O(n)` reductions in `f64`,
74
+ * keep the `O(1)` 3×3 solve in `f64`, and only narrow public transform outputs on store.
75
+ * - `f16`/`bf16`: keep both `O(n)` and `O(1)` stages in `f32`.
76
+ *
77
+ * - `f32` transform outputs stay narrow because they are typically applied back onto `f32` point clouds.
78
+ * - Reflections are handled by flipping the last singular vector when det(R) < 0.
79
+ * - For very small point sets, the loops are scalar-heavy and dominate over SIMD setup costs.
80
+ *
81
+ * @section x86_instructions Relevant x86 Instructions
82
+ *
83
+ * The SIMD kernels are dominated by FMA, permutes, and gathers:
84
+ *
85
+ * Intrinsic                 | Instruction | Notes
86
+ * _mm256_fmadd_ps/pd        | VFMADD*     | FMA on FP ports (Haswell/Skylake: ports 0/1)
87
+ * _mm256_i32gather_ps       | VGATHERDPS  | High-latency; memory-bound
88
+ * _mm512_permutex2var_ps/pd | VPERMT2*    | Shuffle-heavy; can bottleneck on shuffle ports
89
+ * _mm512_reduce_add_ps/pd   | (sequence)  | Implemented via shuffles + adds
90
+ *
91
+ * Gather-heavy tails are intentionally isolated to keep the steady-state loop on contiguous loads.
92
+ *
93
+ * @section references References
94
+ *
95
+ * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
96
+ * - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
97
+ *
98
+ */
99
+ #ifndef NK_MESH_H
100
+ #define NK_MESH_H
101
+
102
+ #include "numkong/types.h"
103
+
104
+ #if defined(__cplusplus)
105
+ extern "C" {
106
+ #endif
107
+
108
+ /**
109
+ * @brief Root Mean Square Deviation (RMSD) between two point clouds, without rotational alignment or scaling.
110
+ *
111
+ * The transformation aligns a to b: a′ᵢ = scale × R × (aᵢ - ā) + b̄
112
+ *
113
+ * @param[in] a First point cloud (source), n×3 interleaved [x0,y0,z0, x1,y1,z1, ...].
114
+ * @param[in] b Second point cloud (target), n×3 interleaved [x0,y0,z0, x1,y1,z1, ...].
115
+ * @param[in] n Number of 3D points in each cloud.
116
+ * @param[out] a_centroid Centroid of first cloud (3 values). Can be NULL.
117
+ * @param[out] b_centroid Centroid of second cloud (3 values). Can be NULL.
118
+ * @param[out] rotation Row-major 3×3 rotation matrix (9 values), always identity. Can be NULL.
119
+ * @param[out] scale Scale factor applied, always 1. Can be NULL.
120
+ * @param[out] result RMSD after applying the transformation.
121
+ */
122
+ NK_DYNAMIC void nk_rmsd_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
123
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
124
+ /** @copydoc nk_rmsd_f64 */
125
+ NK_DYNAMIC void nk_rmsd_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
126
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
127
+ /** @copydoc nk_rmsd_f64 */
128
+ NK_DYNAMIC void nk_rmsd_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
129
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
130
+ /** @copydoc nk_rmsd_f64 */
131
+ NK_DYNAMIC void nk_rmsd_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
132
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
133
+
134
+ /**
135
+ * @brief Kabsch mesh superposition function.
136
+ *
137
+ * The transformation aligns a to b: a′ᵢ = scale × R × (aᵢ - ā) + b̄
138
+ *
139
+ * @param[in] a First point cloud (source), n×3 interleaved [x0,y0,z0, x1,y1,z1, ...].
140
+ * @param[in] b Second point cloud (target), n×3 interleaved [x0,y0,z0, x1,y1,z1, ...].
141
+ * @param[in] n Number of 3D points in each cloud.
142
+ * @param[out] a_centroid Centroid of first cloud (3 values). Can be NULL.
143
+ * @param[out] b_centroid Centroid of second cloud (3 values). Can be NULL.
144
+ * @param[out] rotation Row-major 3×3 rotation matrix (9 values). Can be NULL.
145
+ * @param[out] scale Scale factor applied, always 1. Can be NULL.
146
+ * @param[out] result RMSD after applying the transformation.
147
+ */
148
+ NK_DYNAMIC void nk_kabsch_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
149
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
150
+ /** @copydoc nk_kabsch_f64 */
151
+ NK_DYNAMIC void nk_kabsch_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
152
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
153
+ /** @copydoc nk_kabsch_f64 */
154
+ NK_DYNAMIC void nk_kabsch_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
155
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
156
+ /** @copydoc nk_kabsch_f64 */
157
+ NK_DYNAMIC void nk_kabsch_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
158
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
159
+
160
+ /**
161
+ * @brief Umeyama mesh superposition function.
162
+ *
163
+ * The transformation aligns a to b: a′ᵢ = scale × R × (aᵢ - ā) + b̄
164
+ *
165
+ * @param[in] a First point cloud (source), n×3 interleaved [x0,y0,z0, x1,y1,z1, ...].
166
+ * @param[in] b Second point cloud (target), n×3 interleaved [x0,y0,z0, x1,y1,z1, ...].
167
+ * @param[in] n Number of 3D points in each cloud.
168
+ * @param[out] a_centroid Centroid of first cloud (3 values). Can be NULL.
169
+ * @param[out] b_centroid Centroid of second cloud (3 values). Can be NULL.
170
+ * @param[out] rotation Row-major 3×3 rotation matrix (9 values). Can be NULL.
171
+ * @param[out] scale Scale factor applied. Can be NULL.
172
+ * @param[out] result RMSD after applying the transformation.
173
+ */
174
+ NK_DYNAMIC void nk_umeyama_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
175
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
176
+ /** @copydoc nk_umeyama_f64 */
177
+ NK_DYNAMIC void nk_umeyama_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
178
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
179
+ /** @copydoc nk_umeyama_f64 */
180
+ NK_DYNAMIC void nk_umeyama_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
181
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
182
+ /** @copydoc nk_umeyama_f64 */
183
+ NK_DYNAMIC void nk_umeyama_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
184
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
185
+
186
+ /** @copydoc nk_rmsd_f64 */
187
+ NK_PUBLIC void nk_rmsd_f64_serial(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
188
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
189
+ /** @copydoc nk_kabsch_f64 */
190
+ NK_PUBLIC void nk_kabsch_f64_serial(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
191
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
192
+ /** @copydoc nk_umeyama_f64 */
193
+ NK_PUBLIC void nk_umeyama_f64_serial(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
194
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
195
+
196
+ /** @copydoc nk_rmsd_f32 */
197
+ NK_PUBLIC void nk_rmsd_f32_serial(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
198
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
199
+ /** @copydoc nk_kabsch_f32 */
200
+ NK_PUBLIC void nk_kabsch_f32_serial(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
201
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
202
+ /** @copydoc nk_umeyama_f32 */
203
+ NK_PUBLIC void nk_umeyama_f32_serial(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
204
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
205
+
206
+ /** @copydoc nk_rmsd_f16 */
207
+ NK_PUBLIC void nk_rmsd_f16_serial(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
208
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
209
+ /** @copydoc nk_kabsch_f16 */
210
+ NK_PUBLIC void nk_kabsch_f16_serial(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
211
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
212
+ /** @copydoc nk_umeyama_f16 */
213
+ NK_PUBLIC void nk_umeyama_f16_serial(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
214
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
215
+
216
+ /** @copydoc nk_rmsd_bf16 */
217
+ NK_PUBLIC void nk_rmsd_bf16_serial(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
218
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
219
+ /** @copydoc nk_kabsch_bf16 */
220
+ NK_PUBLIC void nk_kabsch_bf16_serial(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
221
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
222
+ /** @copydoc nk_umeyama_bf16 */
223
+ NK_PUBLIC void nk_umeyama_bf16_serial(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
224
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
225
+
226
+ /* SIMD-powered backends for AVX512 CPUs of Skylake generation and newer.
227
+ */
228
+ #if NK_TARGET_SKYLAKE
229
+ /** @copydoc nk_rmsd_f32 */
230
+ NK_PUBLIC void nk_rmsd_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
231
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
232
+ /** @copydoc nk_kabsch_f32 */
233
+ NK_PUBLIC void nk_kabsch_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
234
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
235
+ /** @copydoc nk_umeyama_f32 */
236
+ NK_PUBLIC void nk_umeyama_f32_skylake(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
237
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
238
+
239
+ /** @copydoc nk_rmsd_f64 */
240
+ NK_PUBLIC void nk_rmsd_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
241
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
242
+ /** @copydoc nk_kabsch_f64 */
243
+ NK_PUBLIC void nk_kabsch_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
244
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
245
+ /** @copydoc nk_umeyama_f64 */
246
+ NK_PUBLIC void nk_umeyama_f64_skylake(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
247
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
248
+ #endif // NK_TARGET_SKYLAKE
249
+
250
/*  Haswell backends (AVX2 CPUs of the Haswell generation and newer), declared only when
 *  `NK_TARGET_HASWELL` is non-zero. RMSD, Kabsch, and Umeyama are each declared for `f32`, `f64`,
 *  `f16`, and `bf16` inputs. Every prototype lists the const inputs and the count first, followed
 *  by the non-const pointers. The `f32` and `f64` variants take `result` as `nk_f64_t *`;
 *  the `f16` and `bf16` variants take it as `nk_f32_t *`.
 */
#if NK_TARGET_HASWELL

/** @copydoc nk_rmsd_f32 */
NK_PUBLIC void nk_rmsd_f32_haswell(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_kabsch_f32 */
NK_PUBLIC void nk_kabsch_f32_haswell(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_umeyama_f32 */
NK_PUBLIC void nk_umeyama_f32_haswell(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);

/** @copydoc nk_rmsd_f64 */
NK_PUBLIC void nk_rmsd_f64_haswell(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_kabsch_f64 */
NK_PUBLIC void nk_kabsch_f64_haswell(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_umeyama_f64 */
NK_PUBLIC void nk_umeyama_f64_haswell(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);

/** @copydoc nk_rmsd_f16 */
NK_PUBLIC void nk_rmsd_f16_haswell(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_kabsch_f16 */
NK_PUBLIC void nk_kabsch_f16_haswell(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_umeyama_f16 */
NK_PUBLIC void nk_umeyama_f16_haswell(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);

/** @copydoc nk_rmsd_bf16 */
NK_PUBLIC void nk_rmsd_bf16_haswell(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_kabsch_bf16 */
NK_PUBLIC void nk_kabsch_bf16_haswell(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_umeyama_bf16 */
NK_PUBLIC void nk_umeyama_bf16_haswell(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);

#endif // NK_TARGET_HASWELL
293
+
294
/*  Arm NEON backends, declared only when `NK_TARGET_NEON` is non-zero.
 *  RMSD, Kabsch, and Umeyama are declared for `f32` and `f64` inputs; both families take
 *  `result` as `nk_f64_t *`. Const inputs and the count come first, then the non-const pointers.
 */
#if NK_TARGET_NEON

/** @copydoc nk_rmsd_f32 */
NK_PUBLIC void nk_rmsd_f32_neon(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_kabsch_f32 */
NK_PUBLIC void nk_kabsch_f32_neon(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_umeyama_f32 */
NK_PUBLIC void nk_umeyama_f32_neon(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);

/** @copydoc nk_rmsd_f64 */
NK_PUBLIC void nk_rmsd_f64_neon(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_kabsch_f64 */
NK_PUBLIC void nk_kabsch_f64_neon(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_umeyama_f64 */
NK_PUBLIC void nk_umeyama_f64_neon(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);

#endif // NK_TARGET_NEON
317
+
318
/*  Arm NEON FP16 backends, declared only when `NK_TARGET_NEONHALF` is non-zero.
 *  RMSD, Kabsch, and Umeyama are declared for `f16` inputs; every non-const pointer,
 *  including `result`, is `nk_f32_t *`.
 */
#if NK_TARGET_NEONHALF

/** @copydoc nk_rmsd_f16 */
NK_PUBLIC void nk_rmsd_f16_neonhalf(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_kabsch_f16 */
NK_PUBLIC void nk_kabsch_f16_neonhalf(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_umeyama_f16 */
NK_PUBLIC void nk_umeyama_f16_neonhalf(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);

#endif // NK_TARGET_NEONHALF
331
+
332
/*  Arm NEON BF16 backends, declared only when `NK_TARGET_NEONBFDOT` is non-zero.
 *  RMSD, Kabsch, and Umeyama are declared for `bf16` inputs; every non-const pointer,
 *  including `result`, is `nk_f32_t *`.
 */
#if NK_TARGET_NEONBFDOT

/** @copydoc nk_rmsd_bf16 */
NK_PUBLIC void nk_rmsd_bf16_neonbfdot(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_kabsch_bf16 */
NK_PUBLIC void nk_kabsch_bf16_neonbfdot(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_umeyama_bf16 */
NK_PUBLIC void nk_umeyama_bf16_neonbfdot(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);

#endif // NK_TARGET_NEONBFDOT
345
+
346
/*  RVV backends, declared only when `NK_TARGET_RVV` is non-zero.
 *  RMSD, Kabsch, and Umeyama are each declared for `f32`, `f64`, `f16`, and `bf16` inputs.
 *  The `f32` and `f64` variants take `result` as `nk_f64_t *`; the `f16` and `bf16` variants
 *  take it as `nk_f32_t *`.
 */
#if NK_TARGET_RVV

/** @copydoc nk_rmsd_f32 */
NK_PUBLIC void nk_rmsd_f32_rvv(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_rmsd_f64 */
NK_PUBLIC void nk_rmsd_f64_rvv(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_rmsd_f16 */
NK_PUBLIC void nk_rmsd_f16_rvv(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_rmsd_bf16 */
NK_PUBLIC void nk_rmsd_bf16_rvv(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);

/** @copydoc nk_kabsch_f32 */
NK_PUBLIC void nk_kabsch_f32_rvv(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_kabsch_f64 */
NK_PUBLIC void nk_kabsch_f64_rvv(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_kabsch_f16 */
NK_PUBLIC void nk_kabsch_f16_rvv(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_kabsch_bf16 */
NK_PUBLIC void nk_kabsch_bf16_rvv(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);

/** @copydoc nk_umeyama_f32 */
NK_PUBLIC void nk_umeyama_f32_rvv(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_umeyama_f64 */
NK_PUBLIC void nk_umeyama_f64_rvv(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_umeyama_f16 */
NK_PUBLIC void nk_umeyama_f16_rvv(
    nk_f16_t const *a, nk_f16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);
/** @copydoc nk_umeyama_bf16 */
NK_PUBLIC void nk_umeyama_bf16_rvv(
    nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result);

#endif // NK_TARGET_RVV
384
+
385
/*  WASM Relaxed SIMD backends using `wasm_f32x4_relaxed_madd` for FMA, declared only when
 *  `NK_TARGET_V128RELAXED` is non-zero. RMSD, Kabsch, and Umeyama are declared for `f32` and
 *  `f64` inputs; both families take `result` as `nk_f64_t *`.
 */
#if NK_TARGET_V128RELAXED

/** @copydoc nk_rmsd_f32 */
NK_PUBLIC void nk_rmsd_f32_v128relaxed(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_kabsch_f32 */
NK_PUBLIC void nk_kabsch_f32_v128relaxed(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);
/** @copydoc nk_umeyama_f32 */
NK_PUBLIC void nk_umeyama_f32_v128relaxed(
    nk_f32_t const *a, nk_f32_t const *b, nk_size_t n,
    nk_f32_t *a_centroid, nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result);

/** @copydoc nk_rmsd_f64 */
NK_PUBLIC void nk_rmsd_f64_v128relaxed(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_kabsch_f64 */
NK_PUBLIC void nk_kabsch_f64_v128relaxed(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);
/** @copydoc nk_umeyama_f64 */
NK_PUBLIC void nk_umeyama_f64_v128relaxed(
    nk_f64_t const *a, nk_f64_t const *b, nk_size_t n,
    nk_f64_t *a_centroid, nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result);

#endif // NK_TARGET_V128RELAXED
407
+
408
+ /**
409
+ * @brief Returns the output dtype for RMSD.
410
+ */
411
+ NK_INTERNAL nk_dtype_t nk_rmsd_output_dtype(nk_dtype_t dtype) {
412
+ switch (dtype) {
413
+ case nk_f64_k: return nk_f64_k;
414
+ case nk_f32_k: return nk_f64_k;
415
+ case nk_f16_k: return nk_f32_k;
416
+ case nk_bf16_k: return nk_f32_k;
417
+ default: return nk_dtype_unknown_k;
418
+ }
419
+ }
420
+
421
+ /**
422
+ * @brief Returns the output dtype for Kabsch alignment.
423
+ */
424
+ NK_INTERNAL nk_dtype_t nk_kabsch_output_dtype(nk_dtype_t dtype) {
425
+ switch (dtype) {
426
+ case nk_f64_k: return nk_f64_k;
427
+ case nk_f32_k: return nk_f64_k;
428
+ case nk_f16_k: return nk_f32_k;
429
+ case nk_bf16_k: return nk_f32_k;
430
+ default: return nk_dtype_unknown_k;
431
+ }
432
+ }
433
+
434
+ /**
435
+ * @brief Returns the output dtype for Umeyama alignment.
436
+ */
437
+ NK_INTERNAL nk_dtype_t nk_umeyama_output_dtype(nk_dtype_t dtype) {
438
+ switch (dtype) {
439
+ case nk_f64_k: return nk_f64_k;
440
+ case nk_f32_k: return nk_f64_k;
441
+ case nk_f16_k: return nk_f32_k;
442
+ case nk_bf16_k: return nk_f32_k;
443
+ default: return nk_dtype_unknown_k;
444
+ }
445
+ }
446
+
447
+ #if defined(__cplusplus)
448
+ } // extern "C"
449
+ #endif
450
+
451
+ #include "numkong/mesh/serial.h"
452
+ #include "numkong/mesh/neon.h"
453
+ #include "numkong/mesh/neonhalf.h"
454
+ #include "numkong/mesh/neonbfdot.h"
455
+ #include "numkong/mesh/haswell.h"
456
+ #include "numkong/mesh/skylake.h"
457
+ #include "numkong/mesh/rvv.h"
458
+ #include "numkong/mesh/v128relaxed.h"
459
+
460
+ #if defined(__cplusplus)
461
+ extern "C" {
462
+ #endif
463
+
464
+ #if !NK_DYNAMIC_DISPATCH
465
+
466
+ NK_PUBLIC void nk_rmsd_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
467
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
468
+ #if NK_TARGET_SKYLAKE
469
+ nk_rmsd_f64_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
470
+ #elif NK_TARGET_HASWELL
471
+ nk_rmsd_f64_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
472
+ #elif NK_TARGET_NEON
473
+ nk_rmsd_f64_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
474
+ #elif NK_TARGET_RVV
475
+ nk_rmsd_f64_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
476
+ #elif NK_TARGET_V128RELAXED
477
+ nk_rmsd_f64_v128relaxed(a, b, n, a_centroid, b_centroid, rotation, scale, result);
478
+ #else
479
+ nk_rmsd_f64_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
480
+ #endif
481
+ }
482
+
483
+ NK_PUBLIC void nk_rmsd_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
484
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
485
+ #if NK_TARGET_SKYLAKE
486
+ nk_rmsd_f32_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
487
+ #elif NK_TARGET_HASWELL
488
+ nk_rmsd_f32_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
489
+ #elif NK_TARGET_NEON
490
+ nk_rmsd_f32_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
491
+ #elif NK_TARGET_RVV
492
+ nk_rmsd_f32_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
493
+ #elif NK_TARGET_V128RELAXED
494
+ nk_rmsd_f32_v128relaxed(a, b, n, a_centroid, b_centroid, rotation, scale, result);
495
+ #else
496
+ nk_rmsd_f32_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
497
+ #endif
498
+ }
499
+
500
+ NK_PUBLIC void nk_rmsd_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
501
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
502
+ #if NK_TARGET_HASWELL
503
+ nk_rmsd_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
504
+ #elif NK_TARGET_NEONHALF
505
+ nk_rmsd_f16_neonhalf(a, b, n, a_centroid, b_centroid, rotation, scale, result);
506
+ #elif NK_TARGET_RVV
507
+ nk_rmsd_f16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
508
+ #else
509
+ nk_rmsd_f16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
510
+ #endif
511
+ }
512
+
513
+ NK_PUBLIC void nk_rmsd_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
514
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
515
+ #if NK_TARGET_HASWELL
516
+ nk_rmsd_bf16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
517
+ #elif NK_TARGET_NEONBFDOT
518
+ nk_rmsd_bf16_neonbfdot(a, b, n, a_centroid, b_centroid, rotation, scale, result);
519
+ #elif NK_TARGET_RVV
520
+ nk_rmsd_bf16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
521
+ #else
522
+ nk_rmsd_bf16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
523
+ #endif
524
+ }
525
+
526
+ NK_PUBLIC void nk_kabsch_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
527
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
528
+ #if NK_TARGET_SKYLAKE
529
+ nk_kabsch_f64_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
530
+ #elif NK_TARGET_HASWELL
531
+ nk_kabsch_f64_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
532
+ #elif NK_TARGET_NEON
533
+ nk_kabsch_f64_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
534
+ #elif NK_TARGET_RVV
535
+ nk_kabsch_f64_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
536
+ #elif NK_TARGET_V128RELAXED
537
+ nk_kabsch_f64_v128relaxed(a, b, n, a_centroid, b_centroid, rotation, scale, result);
538
+ #else
539
+ nk_kabsch_f64_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
540
+ #endif
541
+ }
542
+
543
+ NK_PUBLIC void nk_kabsch_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
544
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
545
+ #if NK_TARGET_SKYLAKE
546
+ nk_kabsch_f32_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
547
+ #elif NK_TARGET_HASWELL
548
+ nk_kabsch_f32_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
549
+ #elif NK_TARGET_NEON
550
+ nk_kabsch_f32_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
551
+ #elif NK_TARGET_RVV
552
+ nk_kabsch_f32_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
553
+ #elif NK_TARGET_V128RELAXED
554
+ nk_kabsch_f32_v128relaxed(a, b, n, a_centroid, b_centroid, rotation, scale, result);
555
+ #else
556
+ nk_kabsch_f32_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
557
+ #endif
558
+ }
559
+
560
+ NK_PUBLIC void nk_kabsch_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
561
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
562
+ #if NK_TARGET_HASWELL
563
+ nk_kabsch_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
564
+ #elif NK_TARGET_NEONHALF
565
+ nk_kabsch_f16_neonhalf(a, b, n, a_centroid, b_centroid, rotation, scale, result);
566
+ #elif NK_TARGET_RVV
567
+ nk_kabsch_f16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
568
+ #else
569
+ nk_kabsch_f16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
570
+ #endif
571
+ }
572
+
573
+ NK_PUBLIC void nk_kabsch_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
574
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
575
+ #if NK_TARGET_HASWELL
576
+ nk_kabsch_bf16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
577
+ #elif NK_TARGET_NEONBFDOT
578
+ nk_kabsch_bf16_neonbfdot(a, b, n, a_centroid, b_centroid, rotation, scale, result);
579
+ #elif NK_TARGET_RVV
580
+ nk_kabsch_bf16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
581
+ #else
582
+ nk_kabsch_bf16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
583
+ #endif
584
+ }
585
+
586
+ NK_PUBLIC void nk_umeyama_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *a_centroid,
587
+ nk_f64_t *b_centroid, nk_f64_t *rotation, nk_f64_t *scale, nk_f64_t *result) {
588
+ #if NK_TARGET_SKYLAKE
589
+ nk_umeyama_f64_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
590
+ #elif NK_TARGET_HASWELL
591
+ nk_umeyama_f64_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
592
+ #elif NK_TARGET_NEON
593
+ nk_umeyama_f64_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
594
+ #elif NK_TARGET_RVV
595
+ nk_umeyama_f64_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
596
+ #elif NK_TARGET_V128RELAXED
597
+ nk_umeyama_f64_v128relaxed(a, b, n, a_centroid, b_centroid, rotation, scale, result);
598
+ #else
599
+ nk_umeyama_f64_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
600
+ #endif
601
+ }
602
+
603
+ NK_PUBLIC void nk_umeyama_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f32_t *a_centroid,
604
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f64_t *result) {
605
+ #if NK_TARGET_SKYLAKE
606
+ nk_umeyama_f32_skylake(a, b, n, a_centroid, b_centroid, rotation, scale, result);
607
+ #elif NK_TARGET_HASWELL
608
+ nk_umeyama_f32_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
609
+ #elif NK_TARGET_NEON
610
+ nk_umeyama_f32_neon(a, b, n, a_centroid, b_centroid, rotation, scale, result);
611
+ #elif NK_TARGET_RVV
612
+ nk_umeyama_f32_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
613
+ #elif NK_TARGET_V128RELAXED
614
+ nk_umeyama_f32_v128relaxed(a, b, n, a_centroid, b_centroid, rotation, scale, result);
615
+ #else
616
+ nk_umeyama_f32_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
617
+ #endif
618
+ }
619
+
620
+ NK_PUBLIC void nk_umeyama_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
621
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
622
+ #if NK_TARGET_HASWELL
623
+ nk_umeyama_f16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
624
+ #elif NK_TARGET_NEONHALF
625
+ nk_umeyama_f16_neonhalf(a, b, n, a_centroid, b_centroid, rotation, scale, result);
626
+ #elif NK_TARGET_RVV
627
+ nk_umeyama_f16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
628
+ #else
629
+ nk_umeyama_f16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
630
+ #endif
631
+ }
632
+
633
+ NK_PUBLIC void nk_umeyama_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
634
+ nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
635
+ #if NK_TARGET_HASWELL
636
+ nk_umeyama_bf16_haswell(a, b, n, a_centroid, b_centroid, rotation, scale, result);
637
+ #elif NK_TARGET_NEONBFDOT
638
+ nk_umeyama_bf16_neonbfdot(a, b, n, a_centroid, b_centroid, rotation, scale, result);
639
+ #elif NK_TARGET_RVV
640
+ nk_umeyama_bf16_rvv(a, b, n, a_centroid, b_centroid, rotation, scale, result);
641
+ #else
642
+ nk_umeyama_bf16_serial(a, b, n, a_centroid, b_centroid, rotation, scale, result);
643
+ #endif
644
+ }
645
+
646
+ #endif // !NK_DYNAMIC_DISPATCH
647
+
648
+ #if defined(__cplusplus)
649
+ } // extern "C"
650
+ #endif
651
+
652
+ #endif // NK_MESH_H