numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -46,7 +46,7 @@ NK_PUBLIC nk_size_t nk_maxsim_packed_size_f16_v128relaxed(nk_size_t vector_count
46
46
  }
47
47
 
48
48
  NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed( //
49
- nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
49
+ nk_bf16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride_in_bytes, void *packed) {
50
50
 
51
51
  nk_size_t const element_bytes = sizeof(nk_bf16_t);
52
52
  nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
@@ -58,7 +58,7 @@ NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed( //
58
58
  nk_size_t const original_stride = header->original_stride_bytes;
59
59
 
60
60
  for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
61
- char const *source_row = (char const *)vectors + vector_index * stride;
61
+ char const *source_row = (char const *)vectors + vector_index * stride_in_bytes;
62
62
  nk_f32_t norm_sq;
63
63
  nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f,
64
64
  (nk_maxsim_to_f32_t)nk_bf16_to_f32_serial,
@@ -72,7 +72,7 @@ NK_PUBLIC void nk_maxsim_pack_bf16_v128relaxed( //
72
72
  }
73
73
 
74
74
  NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed( //
75
- nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
75
+ nk_f32_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride_in_bytes, void *packed) {
76
76
 
77
77
  nk_size_t const element_bytes = sizeof(nk_f32_t);
78
78
  nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
@@ -84,7 +84,7 @@ NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed( //
84
84
  nk_size_t const original_stride = header->original_stride_bytes;
85
85
 
86
86
  for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
87
- char const *source_row = (char const *)vectors + vector_index * stride;
87
+ char const *source_row = (char const *)vectors + vector_index * stride_in_bytes;
88
88
  nk_f32_t norm_sq;
89
89
  nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f, nk_f32_to_f32_,
90
90
  &quantized_i8[vector_index * depth_i8_padded], &metadata[vector_index], &norm_sq);
@@ -97,7 +97,7 @@ NK_PUBLIC void nk_maxsim_pack_f32_v128relaxed( //
97
97
  }
98
98
 
99
99
  NK_PUBLIC void nk_maxsim_pack_f16_v128relaxed( //
100
- nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride, void *packed) {
100
+ nk_f16_t const *vectors, nk_size_t vector_count, nk_size_t depth, nk_size_t stride_in_bytes, void *packed) {
101
101
 
102
102
  nk_size_t const element_bytes = sizeof(nk_f16_t);
103
103
  nk_size_t depth_i8_padded = nk_maxsim_packed_header_setup_(packed, vector_count, depth, 16, element_bytes);
@@ -109,7 +109,7 @@ NK_PUBLIC void nk_maxsim_pack_f16_v128relaxed( //
109
109
  nk_size_t const original_stride = header->original_stride_bytes;
110
110
 
111
111
  for (nk_size_t vector_index = 0; vector_index < vector_count; vector_index++) {
112
- char const *source_row = (char const *)vectors + vector_index * stride;
112
+ char const *source_row = (char const *)vectors + vector_index * stride_in_bytes;
113
113
  nk_f32_t norm_sq;
114
114
  nk_maxsim_quantize_vector_(source_row, element_bytes, depth, depth_i8_padded, 63.0f,
115
115
  (nk_maxsim_to_f32_t)nk_f16_to_f32_serial,
@@ -6,33 +6,33 @@ Used in structural biology (protein alignment), robotics (point cloud registrati
6
6
 
7
7
  Centroid:
8
8
 
9
- ```math
9
+ $$
10
10
  \bar{a} = \frac{1}{n}\sum a_i
11
- ```
11
+ $$
12
12
 
13
13
  Cross-covariance matrix:
14
14
 
15
- ```math
15
+ $$
16
16
  H = \sum (a_i - \bar{a})(b_i - \bar{b})^T
17
- ```
17
+ $$
18
18
 
19
19
  SVD-based rotation:
20
20
 
21
- ```math
21
+ $$
22
22
  H = U \Sigma V^T, \quad R = V U^T
23
- ```
23
+ $$
24
24
 
25
25
  Umeyama scale factor:
26
26
 
27
- ```math
27
+ $$
28
28
  s = \frac{\text{tr}(\Sigma)}{n \cdot \sigma_a^2}
29
- ```
29
+ $$
30
30
 
31
31
  RMSD after alignment:
32
32
 
33
- ```math
33
+ $$
34
34
  \text{RMSD} = \sqrt{\frac{1}{n}\sum \|s \cdot R(a_i - \bar{a}) - (b_i - \bar{b})\|^2}
35
- ```
35
+ $$
36
36
 
37
37
  Reformulating as Python pseudocode:
38
38
 
@@ -87,14 +87,14 @@ RVV uses indexed loads with dynamic stride to adapt to variable vector length.
87
87
 
88
88
  ### Reflection Correction
89
89
 
90
- `nk_kabsch_f32_haswell`, `nk_kabsch_f64_skylake` check for improper rotations (det(R) = -1, reflections) after computing R = V·Uᵀ.
91
- If det(R) is negative, the last column of V is flipped.
92
- This ensures the output is always a proper rotation matrix (det = +1).
90
+ `nk_kabsch_f32_haswell`, `nk_kabsch_f64_skylake` check for improper rotations after computing $R = V U^T$ from the SVD of the cross-covariance matrix $H = U \Sigma V^T$.
91
+ If $\det(R) = -1$ (a reflection rather than a rotation), the last column of $V$ is negated before recomputing $R$.
92
+ This ensures the output is always a proper rotation matrix with $\det(R) = +1$.
93
93
 
94
94
  ### Pre-Scaled Rotation for Umeyama
95
95
 
96
- `nk_umeyama_f32_haswell`, `nk_umeyama_f64_skylake` fold the computed scale factor into the rotation matrix before applying to points.
97
- `sr[i] = scale * r[i]` is computed once and broadcast avoiding a per-point scalar multiply.
96
+ `nk_umeyama_f32_haswell`, `nk_umeyama_f64_skylake` fold the computed scale factor $s$ into the rotation matrix before applying to points.
97
+ The Umeyama transform is $b_i = s R a_i + t$; by precomputing $R' = s R$ once, the per-point operation reduces to $b_i = R' a_i + t$, avoiding a per-point scalar multiply.
98
98
 
99
99
  ### Why SME and SVE Were Removed
100
100
 
@@ -142,17 +142,23 @@ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run
142
142
  | `nk_rmsd_f32_haswell` | 447 mp/s, 0.3 ulp | 484 mp/s, 0.3 ulp | 350 mp/s, 0.4 ulp |
143
143
  | `nk_kabsch_f32_haswell` | 101 mp/s, 0.7 ulp | 192 mp/s, 0.9 ulp | 213 mp/s, 1.3 ulp |
144
144
  | `nk_umeyama_f32_haswell` | 97.4 mp/s, 0.3 ulp | 155 mp/s, 0.4 ulp | 207 mp/s, 0.8 ulp |
145
- | `nk_rmsd_f32_skylake` | 936 mp/s, 0.3 ulp | 970 mp/s, 0.3 ulp | 426 mp/s, 0.3 ulp |
146
- | `nk_kabsch_f32_skylake` | 122 mp/s, 0.7 ulp | 258 mp/s, 0.7 ulp | 290 mp/s, 0.9 ulp |
147
- | `nk_umeyama_f32_skylake` | 133 mp/s, 0.2 ulp | 231 mp/s, 0.3 ulp | 285 mp/s, 0.5 ulp |
145
+ | `nk_rmsd_f32_skylake` | 1,000 mp/s, 0.7 ulp | 974 mp/s, 1.2 ulp | 786 mp/s, 2.4 ulp |
146
+ | `nk_kabsch_f32_skylake` | 97.5 mp/s, 0.7 ulp | 232 mp/s, 0.7 ulp | 332 mp/s, 0.9 ulp |
147
+ | `nk_umeyama_f32_skylake` | 92.5 mp/s, 0.2 ulp | 227 mp/s, 0.2 ulp | 325 mp/s, 0.3 ulp |
148
148
  | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
149
149
  | `nk_rmsd_bf16_haswell` | 511 mp/s, 0.3 ulp | 481 mp/s, 3.5 ulp | 497 mp/s, 12.8 ulp |
150
150
  | `nk_kabsch_bf16_haswell` | 52.4 mp/s, 0.7 ulp | 65.3 mp/s, 0.9 ulp | 74.8 mp/s, 1.3 ulp |
151
151
  | `nk_umeyama_bf16_haswell` | 51.5 mp/s, 0.2 ulp | 69.2 mp/s, 0.4 ulp | 74.6 mp/s, 0.8 ulp |
152
+ | `nk_rmsd_bf16_skylake` | 1,765 mp/s, 0.3 ulp | 1,945 mp/s, 0.5 ulp | 2,056 mp/s, 6.0 ulp |
153
+ | `nk_kabsch_bf16_skylake` | 132 mp/s, 0.7 ulp | 370 mp/s, 0.8 ulp | 689 mp/s, 0.9 ulp |
154
+ | `nk_umeyama_bf16_skylake` | 130 mp/s, 0.2 ulp | 366 mp/s, 0.3 ulp | 689 mp/s, 0.5 ulp |
152
155
  | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
153
156
  | `nk_rmsd_f16_haswell` | 415 mp/s, 0.3 ulp | 497 mp/s, 0.7 ulp | 458 mp/s, 2.5 ulp |
154
157
  | `nk_kabsch_f16_haswell` | 151 mp/s, 0.7 ulp | 222 mp/s, 0.9 ulp | 221 mp/s, 1.4 ulp |
155
158
  | `nk_umeyama_f16_haswell` | 186 mp/s, 0.2 ulp | 232 mp/s, 0.5 ulp | 222 mp/s, 0.9 ulp |
159
+ | `nk_rmsd_f16_skylake` | 1,813 mp/s, 0.3 ulp | 1,982 mp/s, 0.4 ulp | 2,049 mp/s, 1.8 ulp |
160
+ | `nk_kabsch_f16_skylake` | 367 mp/s, 0.7 ulp | 695 mp/s, 0.7 ulp | 903 mp/s, 0.9 ulp |
161
+ | `nk_umeyama_f16_skylake` | 341 mp/s, 0.2 ulp | 686 mp/s, 0.2 ulp | 882 mp/s, 0.4 ulp |
156
162
 
157
163
  #### WASM
158
164
 
@@ -176,52 +182,52 @@ Measured with Wasmtime v42 (Cranelift backend).
176
182
  | `nk_umeyama_f32_v128relaxed` | 18.3 mp/s, 0.4 ulp | 38.9 mp/s, 0.8 ulp | ? mp/s, 1.5 ulp |
177
183
 
178
184
 
179
- ### Apple M4
185
+ ### Apple M5
180
186
 
181
187
  #### Native
182
188
 
183
- | Kernel | 256 | 1024 | 4096 |
184
- | :----------------------- | -----------------------: | -----------------------: | -----------------------: |
185
- | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
186
- | `nk_rmsd_f64_serial` | 447 mp/s, 1.4 ulp | 410 mp/s, 2.6 ulp | 412 mp/s, 5.3 ulp |
187
- | `nk_kabsch_f64_serial` | 95.2 mp/s, 1.4 ulp | 169 mp/s, 2.6 ulp | 214 mp/s, 5.4 ulp |
188
- | `nk_umeyama_f64_serial` | 89.2 mp/s, 1.0 ulp | 157 mp/s, 1.9 ulp | 195 mp/s, 3.7 ulp |
189
- | `nk_rmsd_f64_neon` | 823 mp/s, 0.4 ulp | 761 mp/s, 0.7 ulp | 702 mp/s, 1.3 ulp |
190
- | `nk_kabsch_f64_neon` | 105 mp/s, 0.8 ulp | 213 mp/s, 1.3 ulp | 287 mp/s, 2.2 ulp |
191
- | `nk_umeyama_f64_neon` | 106 mp/s, 0.4 ulp | 214 mp/s, 0.8 ulp | 297 mp/s, 1.6 ulp |
192
- | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
193
- | `nk_rmsd_f32_serial` | 554 mp/s, 1.4 ulp | 566 mp/s, 2.6 ulp | 532 mp/s, 5.2 ulp |
194
- | `nk_kabsch_f32_serial` | 110 mp/s, 1.4 ulp | 214 mp/s, 2.7 ulp | 264 mp/s, 5.0 ulp |
195
- | `nk_umeyama_f32_serial` | 104 mp/s, 0.9 ulp | 197 mp/s, 1.8 ulp | 240 mp/s, 3.5 ulp |
196
- | `nk_rmsd_f32_neon` | 1,580 mp/s, 0.3 ulp | 1,560 mp/s, 0.4 ulp | 1,200 mp/s, 0.8 ulp |
197
- | `nk_kabsch_f32_neon` | 139 mp/s, 0.7 ulp | 336 mp/s, 0.9 ulp | 485 mp/s, 1.4 ulp |
198
- | `nk_umeyama_f32_neon` | 137 mp/s, 0.3 ulp | 325 mp/s, 0.4 ulp | 470 mp/s, 0.8 ulp |
199
- | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
200
- | `nk_rmsd_bf16_serial` | 1,740 mp/s, 0.5 ulp | 1,880 mp/s, 6.0 ulp | 1,860 mp/s, 10.0 ulp |
201
- | `nk_kabsch_bf16_serial` | 137 mp/s, 0.7 ulp | 335 mp/s, 0.9 ulp | 527 mp/s, 1.3 ulp |
202
- | `nk_umeyama_bf16_serial` | 135 mp/s, 0.2 ulp | 329 mp/s, 0.4 ulp | 510 mp/s, 0.8 ulp |
203
- | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
204
- | `nk_rmsd_f16_serial` | 1,840 mp/s, 0.4 ulp | 1,900 mp/s, 1.7 ulp | 1,860 mp/s, 4.6 ulp |
205
- | `nk_kabsch_f16_serial` | 140 mp/s, 0.9 ulp | 349 mp/s, 1.3 ulp | 547 mp/s, 2.4 ulp |
206
- | `nk_umeyama_f16_serial` | 135 mp/s, 0.4 ulp | 316 mp/s, 0.8 ulp | 474 mp/s, 1.5 ulp |
189
+ | Kernel | 256 | 1024 | 4096 |
190
+ | :-------------------------- | -----------------------: | -----------------------: | -----------------------: |
191
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
192
+ | `nk_rmsd_f64_serial` | 120 mp/s, 1.4 ulp | 118 mp/s, 2.6 ulp | 121 mp/s, 5.3 ulp |
193
+ | `nk_kabsch_f64_serial` | 40.4 mp/s, 1.4 ulp | 47.3 mp/s, 2.6 ulp | 50.2 mp/s, 5.4 ulp |
194
+ | `nk_umeyama_f64_serial` | 34.5 mp/s, 1.0 ulp | 39.2 mp/s, 1.9 ulp | 41.6 mp/s, 3.7 ulp |
195
+ | `nk_rmsd_f64_neon` | 1,418 mp/s, 0.4 ulp | 1,338 mp/s, 0.7 ulp | 1,419 mp/s, 1.3 ulp |
196
+ | `nk_kabsch_f64_neon` | 119 mp/s, 0.8 ulp | 222 mp/s, 1.3 ulp | 304 mp/s, 2.2 ulp |
197
+ | `nk_umeyama_f64_neon` | 115 mp/s, 0.4 ulp | 220 mp/s, 0.8 ulp | 296 mp/s, 1.6 ulp |
198
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
199
+ | `nk_rmsd_f32_serial` | 122 mp/s, 1.4 ulp | 123 mp/s, 2.6 ulp | 125 mp/s, 5.2 ulp |
200
+ | `nk_kabsch_f32_serial` | 39.4 mp/s, 1.4 ulp | 46.0 mp/s, 2.7 ulp | 49.9 mp/s, 5.0 ulp |
201
+ | `nk_umeyama_f32_serial` | 33.6 mp/s, 0.9 ulp | 38.8 mp/s, 1.8 ulp | 41.4 mp/s, 3.5 ulp |
202
+ | `nk_rmsd_f32_neon` | 1,337 mp/s, 0.3 ulp | 1,377 mp/s, 0.4 ulp | 1,261 mp/s, 0.8 ulp |
203
+ | `nk_kabsch_f32_neon` | 135 mp/s, 0.7 ulp | 288 mp/s, 0.9 ulp | 385 mp/s, 1.4 ulp |
204
+ | `nk_umeyama_f32_neon` | 130 mp/s, 0.3 ulp | 272 mp/s, 0.4 ulp | 367 mp/s, 0.8 ulp |
205
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
206
+ | `nk_rmsd_bf16_neonbfdot` | 2,342 mp/s, 0.5 ulp | 2,378 mp/s, 6.0 ulp | 2,416 mp/s, 10.0 ulp |
207
+ | `nk_kabsch_bf16_neonbfdot` | 180 mp/s, 0.7 ulp | 448 mp/s, 0.9 ulp | 726 mp/s, 1.3 ulp |
208
+ | `nk_umeyama_bf16_neonbfdot` | 176 mp/s, 0.2 ulp | 433 mp/s, 0.4 ulp | 705 mp/s, 0.8 ulp |
209
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
210
+ | `nk_rmsd_f16_neonhalf` | 2,315 mp/s, 0.4 ulp | 2,372 mp/s, 1.7 ulp | 2,423 mp/s, 4.6 ulp |
211
+ | `nk_kabsch_f16_neonhalf` | 178 mp/s, 0.9 ulp | 443 mp/s, 1.3 ulp | 711 mp/s, 2.4 ulp |
212
+ | `nk_umeyama_f16_neonhalf` | 175 mp/s, 0.4 ulp | 408 mp/s, 0.8 ulp | 620 mp/s, 1.5 ulp |
207
213
 
208
214
  #### WASM
209
215
 
210
- Measured with Wasmtime v42 (Cranelift backend).
216
+ Measured with Wasmtime v43 (Cranelift backend).
211
217
 
212
218
  | Kernel | 256 | 1024 | 4096 |
213
219
  | :--------------------------- | -----------------------: | -----------------------: | -----------------------: |
214
220
  | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
215
- | `nk_rmsd_f64_serial` | 219 mp/s, 2.6 ulp | 202 mp/s, 2.6 ulp | 255 mp/s, 2.6 ulp |
216
- | `nk_rmsd_f64_v128relaxed` | 434 mp/s, 0.8 ulp | 363 mp/s, 0.8 ulp | 586 mp/s, 0.8 ulp |
217
- | `nk_kabsch_f64_serial` | 42.8 mp/s, 2.7 ulp | 76.0 mp/s, 2.7 ulp | 110 mp/s, 2.7 ulp |
218
- | `nk_kabsch_f64_v128relaxed` | 55.2 mp/s, 2.2 ulp | 110 mp/s, 2.2 ulp | 202 mp/s, 2.2 ulp |
219
- | `nk_umeyama_f64_serial` | 36.1 mp/s, 1.8 ulp | 58.9 mp/s, 1.8 ulp | 98.7 mp/s, 1.8 ulp |
220
- | `nk_umeyama_f64_v128relaxed` | 52.4 mp/s, 1.5 ulp | 103 mp/s, 1.5 ulp | 183 mp/s, 1.5 ulp |
221
+ | `nk_rmsd_f64_serial` | 137 mp/s, 2.6 ulp | 134 mp/s, 2.6 ulp | 142 mp/s, 2.6 ulp |
222
+ | `nk_rmsd_f64_v128relaxed` | 1,377 mp/s, 0.8 ulp | 1,038 mp/s, 0.8 ulp | 1,566 mp/s, 0.8 ulp |
223
+ | `nk_kabsch_f64_serial` | 42.3 mp/s, 2.7 ulp | 50.4 mp/s, 2.7 ulp | 55.5 mp/s, 2.7 ulp |
224
+ | `nk_kabsch_f64_v128relaxed` | 121 mp/s, 2.2 ulp | 225 mp/s, 2.2 ulp | 345 mp/s, 2.2 ulp |
225
+ | `nk_umeyama_f64_serial` | 36.1 mp/s, 1.8 ulp | 41.3 mp/s, 1.8 ulp | 46.0 mp/s, 1.8 ulp |
226
+ | `nk_umeyama_f64_v128relaxed` | 112 mp/s, 1.5 ulp | 207 mp/s, 1.5 ulp | 293 mp/s, 1.5 ulp |
221
227
  | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
222
- | `nk_rmsd_f32_serial` | 218 mp/s, 2.7 ulp | 223 mp/s, 2.7 ulp | 271 mp/s, 2.7 ulp |
223
- | `nk_rmsd_f32_v128relaxed` | 626 mp/s, 0.5 ulp | 626 mp/s, 0.5 ulp | 687 mp/s, 0.5 ulp |
224
- | `nk_kabsch_f32_serial` | 45.5 mp/s, 2.6 ulp | 77.0 mp/s, 2.6 ulp | 112 mp/s, 2.6 ulp |
225
- | `nk_kabsch_f32_v128relaxed` | 68.6 mp/s, 1.3 ulp | 160 mp/s, 1.3 ulp | 273 mp/s, 1.3 ulp |
226
- | `nk_umeyama_f32_serial` | 38.7 mp/s, 1.8 ulp | 60.0 mp/s, 1.8 ulp | 80.5 mp/s, 1.8 ulp |
227
- | `nk_umeyama_f32_v128relaxed` | 66.9 mp/s, 0.8 ulp | 157 mp/s, 0.8 ulp | 291 mp/s, 0.8 ulp |
228
+ | `nk_rmsd_f32_serial` | 120 mp/s, 2.7 ulp | 120 mp/s, 2.7 ulp | 124 mp/s, 2.7 ulp |
229
+ | `nk_rmsd_f32_v128relaxed` | 1,025 mp/s, 0.5 ulp | 1,038 mp/s, 0.5 ulp | 1,093 mp/s, 0.5 ulp |
230
+ | `nk_kabsch_f32_serial` | 39.6 mp/s, 2.6 ulp | 47.6 mp/s, 2.6 ulp | 51.4 mp/s, 2.6 ulp |
231
+ | `nk_kabsch_f32_v128relaxed` | 125 mp/s, 1.3 ulp | 255 mp/s, 1.3 ulp | 366 mp/s, 1.3 ulp |
232
+ | `nk_umeyama_f32_serial` | 30.5 mp/s, 1.8 ulp | 35.0 mp/s, 1.8 ulp | 38.9 mp/s, 1.8 ulp |
233
+ | `nk_umeyama_f32_v128relaxed` | 118 mp/s, 0.8 ulp | 240 mp/s, 0.8 ulp | 338 mp/s, 0.8 ulp |