numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,496 @@
1
+ # Batched Dot Products in NumKong
2
+
3
+ NumKong implements batched GEMM computing C = A × Bᵀ (packed) and C = A × Aᵀ (symmetric). B is pre-packed once and reused across queries. This is the foundation for the spatials, sets, and maxsim modules.
4
+
5
+ Packed dot product computes the full cross-product matrix:
6
+
7
+ ```math
8
+ C_{ij} = \sum_{k} A_{ik} \cdot B_{jk}
9
+ ```
10
+
11
+ Symmetric dot product uses the same matrix for both operands:
12
+
13
+ ```math
14
+ C_{ij} = \sum_{k} A_{ik} \cdot A_{jk}
15
+ ```
16
+
17
+ Reformulating as Python pseudocode:
18
+
19
+ ```python
20
+ import numpy as np
21
+
22
+ def dots_packed(a: np.ndarray, b: np.ndarray) -> np.ndarray:
23
+ return a @ b.T
24
+
25
+ def dots_symmetric(a: np.ndarray) -> np.ndarray:
26
+ return a @ a.T
27
+ ```
28
+
29
+ ## Input & Output Types
30
+
31
+ | Input Type | Output Type | Description |
32
+ | ---------- | ----------- | ---------------------------------------------- |
33
+ | `f64` | `f64` | 64-bit IEEE 754 double precision |
34
+ | `f32` | `f32` | 32-bit IEEE 754 single precision |
35
+ | `f16` | `f32` | 16-bit IEEE 754 half precision, widened output |
36
+ | `bf16` | `f32` | 16-bit brain float, widened output |
37
+ | `e4m3` | `f32` | 8-bit Float8: 4 exponent, 3 mantissa bits |
38
+ | `e5m2` | `f32` | 8-bit Float8: 5 exponent, 2 mantissa bits |
39
+ | `e2m3` | `f32` | 8-bit MX format: 2 exponent, 3 mantissa bits |
40
+ | `e3m2` | `f32` | 8-bit MX format: 3 exponent, 2 mantissa bits |
41
+ | `i8` | `i32` | 8-bit signed integers |
42
+ | `u8` | `u32` | 8-bit unsigned integers |
43
+ | `i4` | `i32` | 4-bit signed integers, packed nibble pairs |
44
+ | `u4` | `u32` | 4-bit unsigned integers, packed nibble pairs |
45
+ | `u1` | `u32` | 1-bit binary packed octets, popcount of AND |
46
+
47
+ ## Optimizations
48
+
49
+ ### B Matrix Pre-Packing with Stride Breaking
50
+
51
+ `nk_dots_pack_f32_serial`, `nk_dots_pack_f32_haswell`, `nk_dots_pack_bf16_haswell`, `nk_dots_pack_i8_haswell` pre-pack the B matrix into a contiguous buffer optimized for streaming access during GEMM.
52
+ Power-of-2 stride detection — when `(stride_bytes & (stride_bytes - 1)) == 0` — adds `depth_simd_dimensions` padding to avoid cache associativity conflicts on set-associative caches.
53
+ Type conversion is amortized into the pack step: BFloat16 → Float32, Float16 → Float32, and Float8 → Float32 conversions happen once during packing instead of per-row during GEMM.
54
+ A 64-byte header stores metadata: column count, depth dimensions, and padded depth.
55
+ Row grouping (`group_size=16`) zero-pads partial groups at matrix edges for uniform SIMD processing.
56
+
57
+ ### Tiled Register Accumulation
58
+
59
+ `nk_dots_packed_f32_haswell`, `nk_dots_packed_f32_skylake`, `nk_dots_packed_f32_neon` use a 4×4 tile kernel with 16 accumulators to handle ~80% of the work.
60
+ A 1×8 tile kernel with 8 accumulators handles edge rows that don't fill a full 4-row tile.
61
+ No depth blocking is used — the kernel relies on hardware prefetch for streaming A/B access patterns.
62
+ Row loads are amortized across multiple dot products: each A row is loaded once and multiplied against 4 B columns per tile pass.
63
+
64
+ ### AMX 2D Tile Engine
65
+
66
+ The Sapphire Rapids AMX backends for `bf16`, mini-floats, `i8`, and `u8` use Intel AMX's 8 tile registers (TMM0–TMM7), each 1 KB (16 rows × 64 bytes).
67
+ Convention: TMM0–1 hold A tiles, TMM2–3 hold B tiles, TMM4–7 are C accumulators — giving a 2×2 output tile (32×32 Float32 results) per tile pass.
68
+ `TDPBF16PS tmm_c, tmm_a, tmm_b` performs a 16×16 outer product with 32 BFloat16 multiply-adds per cell (16×16×32 = 8,192 MACs per instruction).
69
+ Each A row contains 16 BFloat16 pairs interleaved as [a₀, a₁, a₀, a₁, ...] and B columns as [b₀, b₁, b₀, b₁, ...] — the hardware consumes two BFloat16 elements per slot, accumulating into Float32.
70
+ `TDPBSSD tmm_c, tmm_a, tmm_b` does the same for Int8: 64 bytes per row gives 16×16×64 = 16,384 Int8 MACs per instruction.
71
+ Int8 data is quad-interleaved: [a₀, a₁, a₂, a₃, a₀, a₁, a₂, a₃, ...] so the hardware can consume four Int8 elements per 32-bit slot.
72
+ Tile configuration via `LDTILECFG` sets row counts and column byte-widths per tile — allows undersized tiles at matrix edges without masking.
73
+ Morton Z-curve ordering for tile traversal improves cache reuse when both A and B exceed L2.
74
+ The tile engine eliminates the explicit M×N×K loop nesting and register file pressure of vector ISAs — the entire dot-product reduction happens inside the tile instruction.
75
+
76
+ ### SME Outer-Product Streaming
77
+
78
+ `nk_dots_packed_f32_smef64`, `nk_dots_packed_bf16_sme`, `nk_dots_packed_f64_smef64` use Arm's SME ZA tile array (up to 4 named tiles ZA0–ZA3 in 32-bit mode, each SVL×SVL elements).
79
+ `FMOPA za, pn/m, pm/m, zn.s, zm.s` computes a full SVL×SVL rank-1 update in one instruction — one row of A times one row of B, accumulated into ZA.
80
+ ZA0 time-shares between data staging and accumulation: A rows are loaded horizontally into ZA0 (`ld1w {za0h.s[ws]}, ...`), then read vertically (`svread_ver_za32_f32_m`) to produce transposed column vectors for B.
81
+ This avoids explicit transpose operations — the tile's 2D addressing provides free transposition.
82
+ ZA1–ZA3 serve as accumulators while ZA0 stages the next data.
83
+ A 3-column-tile fast path handles B column count ≤ 3×SVL using ZA1–ZA3 as three separate accumulator tiles, avoiding spill/reload cycles.
84
+ For wider B, the kernel falls back to multi-pass accumulation with ZA store/load between passes.
85
+ `BFMOPA` for BFloat16 uses the same outer-product pattern but with BFloat16 → Float32 widening — 2× the depth per instruction vs Float32 `FMOPA`.
86
+ `SMSTART`/`SMSTOP` streaming mode transitions cost ~50–100 cycles, amortized across the full M×N output.
87
+ Ozaki splitting for Float64 (`nk_dots_packed_f64_smef64`) splits each Float64 into 3 mantissa-masked Float32 slices, computes 6 FMOPAs (all cross-products of 3×2 slices) into 3 ZA accumulators, then reconstructs the Float64 result — achieving Float64 precision using Float32 tile hardware.
88
+
89
+ ### Compensated Integer GEMM
90
+
91
+ `nk_dots_packed_i8_icelake`, `nk_dots_packed_u8_icelake`, `nk_dots_packed_i8_haswell` work around the unsigned×signed operand requirement of integer dot-product instructions.
92
+ `VPDPBUSD` (Ice Lake+) computes UInt8×Int8 dot products accumulating directly to Int32 — but requires one unsigned and one signed operand.
93
+ For signed×signed (Int8×Int8), one operand is XOR'd with `0x80` to shift to unsigned range, introducing a bias of $128 \cdot \sum_k b_k$ per output element.
94
+ Rather than computing the bias correction per-element inside the inner loop (requiring extra registers for running sums), the B column sums $\sum_k b_k$ are pre-computed once during packing and stored in the packed buffer metadata.
95
+ The inner loop only needs the `VPDPBUSD` accumulator — the bias subtraction is a single post-loop correction: `result[i][j] -= 128 * b_column_sum[j]`.
96
+ This reduces per-accumulator state from 2 registers (dot + running sum) to 1 register (dot only), freeing registers for more accumulators in the 4×4 tile.
97
+ Haswell fallback uses `VPMADDUBSW` (UInt8×Int8→Int16) + `VPMADDWD` (Int16→Int32), a two-instruction chain with Int16 intermediate overflow risk — quantization ranges must be tighter ([-79, 79] vs [-127, 127]).
98
+
99
+ ### 4-Way Finalizer Amortization
100
+
101
+ All packed and symmetric kernels across the dots, spatials, and sets modules share a finalizer-based design.
102
+ The 4×4 tile accumulates 16 dot products in registers, then stores results 4-wide via `nk_b128_vec_t` — a union of `f32[4]`, `i32[4]`, `u32[4]` fitting a 128-bit register.
103
+ A finalizer function pointer processes 4 results simultaneously, amortizing horizontal reductions and type conversions:
104
+
105
+ ```
106
+ // 4-wide finalizer signature
107
+ void finalizer(nk_b128_vec_t dots, // 4 dot products
108
+ nk_f32_t query_norm, // precomputed query squared-norm
109
+ nk_b128_vec_t target_norms, // 4 target squared-norms
110
+ nk_b128_vec_t *results) // 4 output distances
111
+
112
+ // Angular: 4 divisions + 4 subtractions in one call
113
+ results->f32s[i] = 1 - dots.f32s[i] / sqrt(query_norm * target_norms.f32s[i])
114
+
115
+ // Euclidean: 4 sqrt(a² + b² - 2ab) in one call
116
+ results->f32s[i] = sqrt(query_norm + target_norms.f32s[i] - 2 * dots.f32s[i])
117
+ ```
118
+
119
+ The 4×4 tile emits 4 rows of 4 results each — the finalizer is called 4 times per tile, once per query row.
120
+ For the 1×8 edge tile, two finalizer calls handle 8 results.
121
+ This design decouples the GEMM loop from the distance metric: the same tiled accumulation code serves dots, spatials, and sets by swapping only the finalizer function pointer.
122
+
123
+ ## Performance
124
+
125
+ The following performance tables are produced by manually re-running the bundled `nk_test` and `nk_bench` tools to measure both accuracy and throughput at different input shapes.
126
+ The input size is controlled by `NK_MATRIX_HEIGHT`, `NK_MATRIX_WIDTH`, and `NK_MATRIX_DEPTH` environment variables, all set to the same value for products of two square matrices.
127
+ Columns show throughput for 256³, 1024³, and 4096³ matrix products.
128
+ The throughput is measured in GSO/s as Giga Scalar Operations per Second, with `ops = 2 · M · N · K` arithmetic complexity for an M × K by K × N product.
129
+ Accuracy is reported as mean ULP (units in last place) unless noted otherwise — the average number of representable floating-point values between the result and the exact answer.
130
+ Rows marked `🧩` use external BLAS or MKL baselines rather than NumKong kernels.
131
+ Each kernel runs for at least 20 seconds per configuration.
132
+ Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
133
+ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
134
+
135
+ ### Intel Sapphire Rapids
136
+
137
+ #### Native
138
+
139
+ | Kernel | 256³ | 1024³ | 4096³ |
140
+ | :----------------------------------- | -----------------------: | -----------------------: | -----------------------: |
141
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
142
+ | `dots_packed_f64_with_blas` 🧩 | 58.7 gso/s, 16 ulp | 73.1 gso/s, 58 ulp | 73.8 gso/s, 56.2 ulp |
143
+ | `dots_packed_f64_with_mkl` 🧩 | 59.9 gso/s, 16 ulp | 73.7 gso/s, 58 ulp | 73.3 gso/s, 56.2 ulp |
144
+ | `dots_symmetric_f64_with_blas` 🧩 | 50.8 gso/s, 13 ulp | 70.4 gso/s, 30 ulp | 74 gso/s, 50.8 ulp |
145
+ | `nk_dots_packed_f64_serial` | 0.393 gso/s, 2 ulp | 0.489 gso/s, 4.6 ulp | 0.488 gso/s, 5.9 ulp |
146
+ | `nk_dots_symmetric_f64_serial` | 0.346 gso/s, 2 ulp | 0.357 gso/s, 2.9 ulp | 0.574 gso/s, 3.9 ulp |
147
+ | `nk_dots_packed_f64_haswell` | 5.56 gso/s, 0 ulp | 5.97 gso/s, 0 ulp | 6.15 gso/s, 0 ulp |
148
+ | `nk_dots_symmetric_f64_haswell` | 5.1 gso/s, 0 ulp | 5.71 gso/s, 0 ulp | 11.5 gso/s, 0 ulp |
149
+ | `nk_dots_packed_f64_skylake` | 8.05 gso/s, 0 ulp | 8.69 gso/s, 0 ulp | 8.93 gso/s, 0 ulp |
150
+ | `nk_dots_symmetric_f64_skylake` | 7.52 gso/s, 0 ulp | 8.88 gso/s, 0 ulp | 17.6 gso/s, 0 ulp |
151
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
152
+ | `dots_packed_f32_with_blas` 🧩 | 113 gso/s, 18 ulp | 139 gso/s, 30 ulp | 147 gso/s, 267 ulp |
153
+ | `dots_symmetric_f32_with_blas` 🧩 | 94.5 gso/s, 23 ulp | 126 gso/s, 39 ulp | 146 gso/s, 260 ulp |
154
+ | `nk_dots_packed_f32_serial` | 9.89 gso/s, 5.3 ulp | 10.2 gso/s, 11.8 ulp | 10.1 gso/s, ? ulp |
155
+ | `nk_dots_symmetric_f32_serial` | 6.30 gso/s, 11.1 ulp | 6.57 gso/s, 13.4 ulp | 6.53 gso/s, ? ulp |
156
+ | `nk_dots_packed_f32_haswell` | 30.1 gso/s, 0 ulp | 31.6 gso/s, 0 ulp | 31.9 gso/s, 0 ulp |
157
+ | `nk_dots_symmetric_f32_haswell` | 21.4 gso/s, 0 ulp | 26.2 gso/s, 0 ulp | 53.3 gso/s, 0 ulp |
158
+ | `nk_dots_packed_f32_skylake` | 35 gso/s, 0 ulp | 38.6 gso/s, 0 ulp | 39.5 gso/s, 0 ulp |
159
+ | `nk_dots_symmetric_f32_skylake` | 26.6 gso/s, 0 ulp | 30.5 gso/s, 0 ulp | 62 gso/s, 0 ulp |
160
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
161
+ | `dots_packed_bf16_with_mkl` 🧩 | 190 gso/s, 0 ulp | 531 gso/s, 0.7 ulp | 865 gso/s, 5.8 ulp |
162
+ | `nk_dots_packed_bf16_serial` | 0.842 gso/s, 0 ulp | 0.824 gso/s, 0.5 ulp | 0.825 gso/s, 5.4 ulp |
163
+ | `nk_dots_symmetric_bf16_serial` | 0.808 gso/s, 0 ulp | 0.759 gso/s, 0.9 ulp | 1.74 gso/s, 5.4 ulp |
164
+ | `nk_dots_packed_bf16_haswell` | 57.4 gso/s, 0 ulp | 66.5 gso/s, 0.4 ulp | 67.1 gso/s, 4.5 ulp |
165
+ | `nk_dots_symmetric_bf16_haswell` | 39.5 gso/s, 0 ulp | 50.8 gso/s, 0.3 ulp | 111 gso/s, 4.2 ulp |
166
+ | `nk_dots_packed_bf16_skylake` | 73.8 gso/s, 0 ulp | 90.1 gso/s, 0.3 ulp | 90 gso/s, 3.7 ulp |
167
+ | `nk_dots_symmetric_bf16_skylake` | 52.7 gso/s, 0 ulp | 58.5 gso/s, 0.3 ulp | 127 gso/s, 3.2 ulp |
168
+ | `nk_dots_packed_bf16_genoa` | 64.1 gso/s, 0 ulp | 85.3 gso/s, 0.3 ulp | 90.3 gso/s, 3.5 ulp |
169
+ | `nk_dots_symmetric_bf16_genoa` | 58.1 gso/s, 0 ulp | 61.3 gso/s, 0.5 ulp | 133 gso/s, 3.5 ulp |
170
+ | `nk_dots_packed_bf16_sapphireamx` | 391 gso/s, 0 ulp | 531 gso/s, 0.7 ulp | 604 gso/s, 5.8 ulp |
171
+ | `nk_dots_symmetric_bf16_sapphireamx` | 81.6 gso/s, 0 ulp | 120 gso/s, 0.5 ulp | 124 gso/s, 5.8 ulp |
172
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
173
+ | `dots_packed_f16_with_mkl` 🧩 | 123 gso/s, 17 ulp | 138 gso/s, 31 ulp | 138 gso/s, 39.5 ulp |
174
+ | `nk_dots_packed_f16_serial` | 4.44 gso/s, 14 ulp | 4.42 gso/s, 40 ulp | 4.40 gso/s, 326 ulp |
175
+ | `nk_dots_symmetric_f16_serial` | 3.66 gso/s, 8.9 ulp | 3.44 gso/s, 25 ulp | 5.06 gso/s, 55.6 ulp |
176
+ | `nk_dots_packed_f16_haswell` | 63.4 gso/s, 12 ulp | 72.4 gso/s, 22 ulp | 71.8 gso/s, 374 ulp |
177
+ | `nk_dots_symmetric_f16_haswell` | 39.9 gso/s, 7.7 ulp | 55.7 gso/s, 32 ulp | 127 gso/s, 486 ulp |
178
+ | `nk_dots_packed_f16_skylake` | 74.3 gso/s, 7.3 ulp | 98.7 gso/s, 21 ulp | 85.4 gso/s, 138 ulp |
179
+ | `nk_dots_symmetric_f16_skylake` | 53 gso/s, 5.9 ulp | 59.3 gso/s, 25 ulp | 133 gso/s, 32 ulp |
180
+ | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
181
+ | `nk_dots_packed_e5m2_serial` | 2.51 gso/s, 0 ulp | 2.57 gso/s, 0 ulp | 4.82 gso/s, 0 ulp |
182
+ | `nk_dots_symmetric_e5m2_serial` | 2.47 gso/s, 0 ulp | 2.54 gso/s, 0 ulp | 5.06 gso/s, 0 ulp |
183
+ | `nk_dots_packed_e5m2_haswell` | 28.7 gso/s, 0 ulp | 31.6 gso/s, 0 ulp | 30.6 gso/s, 0 ulp |
184
+ | `nk_dots_symmetric_e5m2_haswell` | 15.1 gso/s, 0 ulp | 15.4 gso/s, 0 ulp | 32.1 gso/s, 0 ulp |
185
+ | `nk_dots_packed_e5m2_skylake` | 33.7 gso/s, 0 ulp | 37.3 gso/s, 0 ulp | 39.3 gso/s, 0 ulp |
186
+ | `nk_dots_symmetric_e5m2_skylake` | 20.7 gso/s, 0 ulp | 20.8 gso/s, 0 ulp | 42.8 gso/s, 0 ulp |
187
+ | `nk_dots_packed_e5m2_genoa` | 43.4 gso/s, 0 ulp | 49.9 gso/s, 0 ulp | 50.8 gso/s, 0 ulp |
188
+ | `nk_dots_symmetric_e5m2_genoa` | 31.6 gso/s, 0 ulp | 33.2 gso/s, 0 ulp | 65.4 gso/s, 0 ulp |
189
+ | `nk_dots_packed_e5m2_sapphireamx` | 255 gso/s, 0 ulp | 380 gso/s, 0 ulp | 433 gso/s, 0 ulp |
190
+ | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
191
+ | `nk_dots_packed_e4m3_serial` | 0.326 gso/s, 0 ulp | 0.334 gso/s, 0 ulp | 0.318 gso/s, 0 ulp |
192
+ | `nk_dots_symmetric_e4m3_serial` | 0.307 gso/s, 0 ulp | 0.315 gso/s, 0 ulp | 0.618 gso/s, 0 ulp |
193
+ | `nk_dots_packed_e4m3_haswell` | 19.4 gso/s, 0 ulp | 22.7 gso/s, 0 ulp | 21.9 gso/s, 0 ulp |
194
+ | `nk_dots_symmetric_e4m3_haswell` | 11.3 gso/s, 0 ulp | 11.9 gso/s, 0 ulp | 24.4 gso/s, 0 ulp |
195
+ | `nk_dots_packed_e4m3_skylake` | 30.8 gso/s, 0 ulp | 32.8 gso/s, 0 ulp | 31.2 gso/s, 0 ulp |
196
+ | `nk_dots_symmetric_e4m3_skylake` | 16.5 gso/s, 0 ulp | 17.4 gso/s, 0 ulp | 34.1 gso/s, 0 ulp |
197
+ | `nk_dots_packed_e4m3_genoa` | 43.8 gso/s, 0 ulp | 47.7 gso/s, 0 ulp | 49.3 gso/s, 0 ulp |
198
+ | `nk_dots_symmetric_e4m3_genoa` | 29.4 gso/s, 0 ulp | 33.3 gso/s, 0 ulp | 67.7 gso/s, 0 ulp |
199
+ | `nk_dots_packed_e4m3_sapphireamx` | 255 gso/s, 0 ulp | 401 gso/s, 0 ulp | 433 gso/s, 0 ulp |
200
+ | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
201
+ | `nk_dots_packed_e3m2_serial` | 2.34 gso/s, 0 ulp | 2.53 gso/s, 0 ulp | 2.80 gso/s, 0 ulp |
202
+ | `nk_dots_symmetric_e3m2_serial` | 1.88 gso/s, 0 ulp | 2.04 gso/s, 0 ulp | 3.96 gso/s, 0 ulp |
203
+ | `nk_dots_packed_e3m2_haswell` | 30.8 gso/s, 0 ulp | 32.6 gso/s, 0 ulp | 32.3 gso/s, 0 ulp |
204
+ | `nk_dots_symmetric_e3m2_haswell` | 28.5 gso/s, 0 ulp | 32.4 gso/s, 0 ulp | 67.1 gso/s, 0 ulp |
205
+ | `nk_dots_packed_e3m2_skylake` | 38.9 gso/s, 0 ulp | 42.4 gso/s, 0 ulp | 43.3 gso/s, 0 ulp |
206
+ | `nk_dots_symmetric_e3m2_skylake` | 37.2 gso/s, 0 ulp | 43.6 gso/s, 0 ulp | 89.4 gso/s, 0 ulp |
207
+ | `nk_dots_packed_e3m2_genoa` | 50.2 gso/s, 0 ulp | 57.8 gso/s, 0 ulp | 60.7 gso/s, 0 ulp |
208
+ | `nk_dots_symmetric_e3m2_genoa` | 38.7 gso/s, 0 ulp | 41.6 gso/s, 0 ulp | 87.9 gso/s, 0 ulp |
209
+ | `nk_dots_packed_e3m2_sapphireamx` | 255 gso/s, 0 ulp | 448 gso/s, 0 ulp | 433 gso/s, 0 ulp |
210
+ | `nk_dots_symmetric_e3m2_sapphireamx` | 65.1 gso/s, 0 ulp | 101 gso/s, 0 ulp | 91 gso/s, 0 ulp |
211
+ | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
212
+ | `nk_dots_packed_e2m3_serial` | 2.33 gso/s, 0 ulp | 2.47 gso/s, 0 ulp | 4.82 gso/s, 0 ulp |
213
+ | `nk_dots_symmetric_e2m3_serial` | 1.94 gso/s, 0 ulp | 2.02 gso/s, 0 ulp | 4.05 gso/s, 0 ulp |
214
+ | `nk_dots_packed_e2m3_haswell` | 57.3 gso/s, 0 ulp | 61.5 gso/s, 0 ulp | 65.9 gso/s, 0 ulp |
215
+ | `nk_dots_symmetric_e2m3_haswell` | 50.1 gso/s, 0 ulp | 62 gso/s, 0 ulp | 132 gso/s, 0 ulp |
216
+ | `nk_dots_packed_e2m3_skylake` | 69.5 gso/s, 0 ulp | 79.5 gso/s, 0 ulp | 89.5 gso/s, 0 ulp |
217
+ | `nk_dots_symmetric_e2m3_skylake` | 64.2 gso/s, 0 ulp | 81 gso/s, 0 ulp | 168 gso/s, 0 ulp |
218
+ | `nk_dots_packed_e2m3_genoa` | 49.6 gso/s, 0 ulp | 59 gso/s, 0 ulp | 59.7 gso/s, 0 ulp |
219
+ | `nk_dots_symmetric_e2m3_genoa` | 39.9 gso/s, 0 ulp | 43 gso/s, 0 ulp | 88.8 gso/s, 0 ulp |
220
+ | `nk_dots_packed_e2m3_sapphireamx` | 422 gso/s, 0 ulp | 1,090 gso/s, 0 ulp | 1,060 gso/s, 0 ulp |
221
+ | `nk_dots_symmetric_e2m3_sapphireamx` | 93.6 gso/s, 0 ulp | 211 gso/s, 0 ulp | 163 gso/s, 0 ulp |
222
+ | `nk_dots_packed_e2m3_alder` | 71.8 gso/s, 0 ulp | 80.8 gso/s, 0 ulp | 89.5 gso/s, 0 ulp |
223
+ | `nk_dots_symmetric_e2m3_alder` | 59.3 gso/s, 0 ulp | 73.4 gso/s, 0 ulp | 78.6 gso/s, 0 ulp |
224
+ | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
225
+ | `dots_packed_i8u8_with_mkl` 🧩 | 250 gso/s | 627 gso/s | 1,670 gso/s |
226
+ | `nk_dots_packed_i8_serial` | 6.44 gso/s | 6.62 gso/s | 7.44 gso/s |
227
+ | `nk_dots_symmetric_i8_serial` | 2.93 gso/s | 2.99 gso/s | 5.83 gso/s |
228
+ | `nk_dots_packed_i8_haswell` | 87.7 gso/s | 104 gso/s | 108 gso/s |
229
+ | `nk_dots_symmetric_i8_haswell` | 64 gso/s | 80.9 gso/s | 173 gso/s |
230
+ | `nk_dots_packed_i8_icelake` | 191 gso/s | 326 gso/s | 410 gso/s |
231
+ | `nk_dots_symmetric_i8_icelake` | 79.2 gso/s | 303 gso/s | 760 gso/s |
232
+ | `nk_dots_packed_i8_sapphireamx` | 547 gso/s | 1,610 gso/s | 1,300 gso/s |
233
+ | `nk_dots_symmetric_i8_sapphireamx` | 112 gso/s | 266 gso/s | 221 gso/s |
234
+ | `nk_dots_packed_i8_alder` | 180 gso/s | 229 gso/s | 270 gso/s |
235
+ | `nk_dots_symmetric_i8_alder` | 108 gso/s | 218 gso/s | 263 gso/s |
236
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
237
+ | `nk_dots_packed_u8_serial` | 7.45 gso/s | 7.79 gso/s | 7.88 gso/s |
238
+ | `nk_dots_symmetric_u8_serial` | 2.81 gso/s | 2.91 gso/s | 5.35 gso/s |
239
+ | `nk_dots_packed_u8_haswell` | 88 gso/s | 102 gso/s | 107 gso/s |
240
+ | `nk_dots_symmetric_u8_haswell` | 64.3 gso/s | 79.8 gso/s | 181 gso/s |
241
+ | `nk_dots_packed_u8_icelake` | 194 gso/s | 329 gso/s | 402 gso/s |
242
+ | `nk_dots_symmetric_u8_icelake` | 83.9 gso/s | 300 gso/s | 755 gso/s |
243
+ | `nk_dots_packed_u8_sapphireamx` | 550 gso/s | 1,680 gso/s | 1,330 gso/s |
244
+ | `nk_dots_symmetric_u8_sapphireamx` | 113 gso/s | 270 gso/s | 223 gso/s |
245
+ | `nk_dots_packed_u8_alder` | 181 gso/s | 230 gso/s | 266 gso/s |
246
+ | `nk_dots_symmetric_u8_alder` | 108 gso/s | 216 gso/s | 257 gso/s |
247
+ | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
248
+ | `nk_dots_packed_i4_serial` | 2.43 gso/s | 2.43 gso/s | 2.24 gso/s |
249
+ | `nk_dots_symmetric_i4_serial` | 2.26 gso/s | 2.13 gso/s | 4.44 gso/s |
250
+ | `nk_dots_packed_i4_icelake` | 135 gso/s | 211 gso/s | 254 gso/s |
251
+ | `nk_dots_symmetric_i4_icelake` | 78.7 gso/s | 252 gso/s | 581 gso/s |
252
+ | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
253
+ | `nk_dots_packed_u4_serial` | 3.27 gso/s | 3.37 gso/s | 3.33 gso/s |
254
+ | `nk_dots_symmetric_u4_serial` | 3.02 gso/s | 3.06 gso/s | 6.13 gso/s |
255
+ | `nk_dots_packed_u4_icelake` | 152 gso/s | 302 gso/s | 387 gso/s |
256
+ | `nk_dots_symmetric_u4_icelake` | 97.3 gso/s | 311 gso/s | 697 gso/s |
257
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
258
+ | `nk_dots_packed_u1_haswell` | 225 gso/s | 261 gso/s | 344 gso/s |
259
+ | `nk_dots_symmetric_u1_haswell` | 122 gso/s | 277 gso/s | 756 gso/s |
260
+ | `nk_dots_packed_u1_icelake` | 196 gso/s | 750 gso/s | 1,390 gso/s |
261
+ | `nk_dots_symmetric_u1_icelake` | 171 gso/s | 661 gso/s | 2,500 gso/s |
262
+
263
+ #### WASM
264
+
265
+ Measured with Wasmtime v42 (Cranelift backend).
266
+
267
+ | Kernel | 256³ | 1024³ | 4096³ |
268
+ | :----------------------------------- | -----------------------: | -----------------------: | -----------------------: |
269
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
270
+ | `nk_dots_packed_f64_serial` | 0.947 gso/s, 3.4 ulp | 0.969 gso/s, 2.4 ulp | 0.969 gso/s, 0 ulp |
271
+ | `nk_dots_symmetric_f64_serial` | 0.957 gso/s, 3.7 ulp | 1.11 gso/s, 2.5 ulp | 1.16 gso/s, 0 ulp |
272
+ | `nk_dots_packed_f64_v128relaxed` | 2.73 gso/s, 23.6 ulp | 2.79 gso/s, 32.5 ulp | 2.81 gso/s, 3.9 ulp |
273
+ | `nk_dots_symmetric_f64_v128relaxed` | 2.01 gso/s, 21.6 ulp | 2.55 gso/s, 41.2 ulp | 2.77 gso/s, 2.9 ulp |
274
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
275
+ | `nk_dots_packed_f32_serial` | 4.27 gso/s, 14.6 ulp | 4.35 gso/s, 28.6 ulp | 4.47 gso/s, 25.3 ulp |
276
+ | `nk_dots_symmetric_f32_serial` | 3.13 gso/s, 11.5 ulp | 5.09 gso/s, 34.8 ulp | 5.78 gso/s, 44.7 ulp |
277
+ | `nk_dots_packed_f32_v128relaxed` | 10.4 gso/s, 12.9 ulp | 10.6 gso/s, 26.5 ulp | 10.9 gso/s, 39.7 ulp |
278
+ | `nk_dots_symmetric_f32_v128relaxed` | 3.73 gso/s, 10.3 ulp | 6.27 gso/s, 28.6 ulp | 7.43 gso/s, 76.2 ulp |
279
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
280
+ | `nk_dots_packed_bf16_serial` | 4.33 gso/s, 0 ulp | 4.46 gso/s, 0.4 ulp | 4.45 gso/s, 9.5 ulp |
281
+ | `nk_dots_symmetric_bf16_serial` | 3.76 gso/s, 0 ulp | 6.36 gso/s, 0.5 ulp | 7.43 gso/s, 4.9 ulp |
282
+ | `nk_dots_packed_bf16_v128relaxed` | 23.2 gso/s, 0 ulp | 24.5 gso/s, 0.4 ulp | 24.9 gso/s, 6.8 ulp |
283
+ | `nk_dots_symmetric_bf16_v128relaxed` | 4.92 gso/s, 0 ulp | 10.5 gso/s, 0.5 ulp | 13.7 gso/s, 4.9 ulp |
284
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
285
+ | `nk_dots_packed_f16_serial` | 4.33 gso/s, 26 ulp | 4.46 gso/s, 26 ulp | 4.45 gso/s, 26 ulp |
286
+ | `nk_dots_symmetric_f16_serial` | 3.76 gso/s, 28 ulp | 6.36 gso/s, 28 ulp | 7.43 gso/s, 28 ulp |
287
+ | `nk_dots_packed_f16_v128relaxed` | 7.39 gso/s, 27 ulp | 7.36 gso/s, 27 ulp | 7.45 gso/s, 27 ulp |
288
+ | `nk_dots_symmetric_f16_v128relaxed` | 3.70 gso/s, 28 ulp | 3.83 gso/s, 28 ulp | 3.87 gso/s, 28 ulp |
289
+ | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
290
+ | `nk_dots_packed_e5m2_serial` | 2.63 gso/s, 0 ulp | 2.69 gso/s, 0 ulp | 2.70 gso/s, 0 ulp |
291
+ | `nk_dots_symmetric_e5m2_serial` | 1.62 gso/s, 0 ulp | 2.04 gso/s, 0 ulp | 2.16 gso/s, 0 ulp |
292
+ | `nk_dots_packed_e5m2_v128relaxed` | 6.25 gso/s, 0 ulp | 6.50 gso/s, 0 ulp | 6.55 gso/s, 0 ulp |
293
+ | `nk_dots_symmetric_e5m2_v128relaxed` | 3.37 gso/s, 0 ulp | 5.23 gso/s, 0 ulp | 6.06 gso/s, 0 ulp |
294
+ | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
295
+ | `nk_dots_packed_e4m3_serial` | 0.348 gso/s, 0 ulp | 0.345 gso/s, 0 ulp | 0.345 gso/s, 0 ulp |
296
+ | `nk_dots_symmetric_e4m3_serial` | 0.321 gso/s, 0 ulp | 0.340 gso/s, 0 ulp | 0.345 gso/s, 0 ulp |
297
+ | `nk_dots_packed_e4m3_v128relaxed` | 4.80 gso/s, 0 ulp | 4.92 gso/s, 0 ulp | 4.96 gso/s, 0 ulp |
298
+ | `nk_dots_symmetric_e4m3_v128relaxed` | 2.85 gso/s, 0 ulp | 4.17 gso/s, 0 ulp | 4.62 gso/s, 0 ulp |
299
+ | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
300
+ | `nk_dots_packed_e2m3_serial` | 2.63 gso/s, 0 ulp | 2.69 gso/s, 0 ulp | 2.71 gso/s, 0 ulp |
301
+ | `nk_dots_symmetric_e2m3_serial` | 1.62 gso/s, 0 ulp | 2.06 gso/s, 0 ulp | 2.14 gso/s, 0 ulp |
302
+ | `nk_dots_packed_e2m3_v128relaxed` | 17.2 gso/s, 0 ulp | 18.2 gso/s, 0 ulp | 18.7 gso/s, 0 ulp |
303
+ | `nk_dots_symmetric_e2m3_v128relaxed` | 5.35 gso/s, 0 ulp | 11.6 gso/s, 0 ulp | 16.3 gso/s, 0 ulp |
304
+ | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
305
+ | `nk_dots_packed_i8_serial` | 4.40 gso/s | 4.54 gso/s | 4.73 gso/s |
306
+ | `nk_dots_symmetric_i8_serial` | 2.74 gso/s | 3.89 gso/s | 4.29 gso/s |
307
+ | `nk_dots_packed_i8_v128relaxed` | 36.5 gso/s | 38.5 gso/s | 41.1 gso/s |
308
+ | `nk_dots_symmetric_i8_v128relaxed` | 29.2 gso/s | 36.3 gso/s | 39.2 gso/s |
309
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
310
+ | `nk_dots_packed_u8_serial` | 4.94 gso/s | 5.14 gso/s | 4.88 gso/s |
311
+ | `nk_dots_symmetric_u8_serial` | 2.74 gso/s | 3.94 gso/s | 4.40 gso/s |
312
+ | `nk_dots_packed_u8_v128relaxed` | 35.2 gso/s | 37.7 gso/s | 40.5 gso/s |
313
+ | `nk_dots_symmetric_u8_v128relaxed` | 21.0 gso/s | 26.6 gso/s | 28.6 gso/s |
314
+ | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
315
+ | `nk_dots_packed_i4_serial` | 6.34 gso/s | 6.40 gso/s | 6.59 gso/s |
316
+ | `nk_dots_symmetric_i4_serial` | 2.70 gso/s | 3.76 gso/s | 4.13 gso/s |
317
+ | `nk_dots_packed_i4_v128relaxed` | 9.81 gso/s | 10.3 gso/s | 10.4 gso/s |
318
+ | `nk_dots_symmetric_i4_v128relaxed` | 4.95 gso/s | 15.6 gso/s | 32.8 gso/s |
319
+ | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
320
+ | `nk_dots_packed_u4_serial` | 5.61 gso/s | 5.76 gso/s | 5.79 gso/s |
321
+ | `nk_dots_symmetric_u4_serial` | 3.01 gso/s | 4.34 gso/s | 4.94 gso/s |
322
+ | `nk_dots_packed_u4_v128relaxed` | 58.6 gso/s | 71.0 gso/s | 76.5 gso/s |
323
+ | `nk_dots_symmetric_u4_v128relaxed` | 6.97 gso/s | 21.9 gso/s | 46.7 gso/s |
324
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
325
+ | `nk_dots_packed_u1_serial` | 96.2 gso/s | 143 gso/s | 151 gso/s |
326
+ | `nk_dots_packed_u1_v128relaxed` | 166 gso/s | 280 gso/s | 294 gso/s |
327
+ | `nk_dots_symmetric_u1_serial` | 7.42 gso/s | 27.9 gso/s | 87.3 gso/s |
328
+ | `nk_dots_symmetric_u1_v128relaxed` | 7.35 gso/s | 27.5 gso/s | 81.9 gso/s |
329
+
330
+ ### Apple M4
331
+
332
+ #### Native
333
+
334
+ | Kernel | 256³ | 1024³ | 4096³ |
335
+ | :--------------------------------- | -----------------------: | -----------------------: | -----------------------: |
336
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
337
+ | `nk_dots_packed_f64_serial` | 1.82 gso/s, 3 ulp | 1.81 gso/s, 5 ulp | 1.82 gso/s, 6 ulp |
338
+ | `nk_dots_symmetric_f64_serial` | 1.40 gso/s, 0 ulp | 1.42 gso/s, 0 ulp | 1.42 gso/s, 0 ulp |
339
+ | `nk_dots_packed_f64_neon` | 5.62 gso/s, 0 ulp | 5.48 gso/s, 0 ulp | 5.21 gso/s, 0 ulp |
340
+ | `nk_dots_symmetric_f64_neon` | 4.46 gso/s, 0 ulp | 4.94 gso/s, 0 ulp | 5.71 gso/s, 0 ulp |
341
+ | `nk_dots_packed_f64_smef64` | 13.9 gso/s, 1.5 ulp | 12.1 gso/s, 1.1 ulp | 12.9 gso/s, 0.9 ulp |
342
+ | `nk_dots_symmetric_f64_smef64` | 5.18 gso/s, 1.5 ulp | 5.06 gso/s, 1.2 ulp | 4.46 gso/s, 1.1 ulp |
343
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
344
+ | `nk_dots_packed_f32_serial` | 10.4 gso/s, 19 ulp | 10.6 gso/s, 30 ulp | 11.2 gso/s, 725 ulp |
345
+ | `nk_dots_symmetric_f32_serial` | 8.34 gso/s, 3.1 ulp | 8.64 gso/s, 12.8 ulp | 8.96 gso/s, 39.9 ulp |
346
+ | `nk_dots_packed_f32_neon` | 40.7 gso/s, 0 ulp | 40.1 gso/s, 0 ulp | 41.5 gso/s, 0 ulp |
347
+ | `nk_dots_symmetric_f32_neon` | 10.0 gso/s, 4.6 ulp | 10.2 gso/s, 17.7 ulp | 10.1 gso/s, 59 ulp |
348
+ | `nk_dots_packed_f32_smef64` | 42.8 gso/s, 0 ulp | 57.8 gso/s, 15 ulp | 50.7 gso/s, 0 ulp |
349
+ | `nk_dots_symmetric_f32_smef64` | 15.0 gso/s, 4.3 ulp | 15.4 gso/s, 19.0 ulp | 10.2 gso/s, 0 ulp |
350
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
351
+ | `nk_dots_packed_bf16_serial` | 17.0 gso/s, 0.1 ulp | 17.6 gso/s, 0.5 ulp | 17.2 gso/s, 5 ulp |
352
+ | `nk_dots_symmetric_bf16_serial` | 13.5 gso/s, 0.01 ulp | 13.4 gso/s, 0.7 ulp | 16.6 gso/s, 115 ulp |
353
+ | `nk_dots_packed_bf16_neonbfdot` | 57.7 gso/s, 0 ulp | 58.1 gso/s, 0.5 ulp | 58.8 gso/s, 7.2 ulp |
354
+ | `nk_dots_symmetric_bf16_neonbfdot` | 38.9 gso/s, 0 ulp | 39.1 gso/s, 0.5 ulp | 37.7 gso/s, ? ulp |
355
+ | `nk_dots_packed_bf16_sme` | 437 gso/s, 0 ulp | 571 gso/s, 4.2 ulp | 507 gso/s, 3.8 ulp |
356
+ | `nk_dots_symmetric_bf16_sme` | 106 gso/s, 0.07 ulp | 94.2 gso/s, 1.2 ulp | 90.0 gso/s, 1.8 ulp |
357
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
358
+ | `nk_dots_packed_f16_serial` | 13.1 gso/s, 204 ulp | 13.4 gso/s, 36 ulp | 14.1 gso/s, 326 ulp |
359
+ | `nk_dots_symmetric_f16_serial` | 21.7 gso/s, 13 ulp | 18.0 gso/s, 24.6 ulp | 26.2 gso/s, 506 ulp |
360
+ | `nk_dots_packed_f16_neonhalf` | 76.6 gso/s, 16.8 ulp | 75.5 gso/s, 25.5 ulp | 82.3 gso/s, 618 ulp |
361
+ | `nk_dots_symmetric_f16_neonhalf` | 20.0 gso/s, 12.1 ulp | 20.4 gso/s, 25.0 ulp | 20.7 gso/s, 506 ulp |
362
+ | `nk_dots_packed_f16_neonfhm` | 111 gso/s, 16.7 ulp | 110 gso/s, 25.5 ulp | 108 gso/s, 618 ulp |
363
+ | `nk_dots_symmetric_f16_neonfhm` | 35.3 gso/s, 12.1 ulp | 36.4 gso/s, 25.0 ulp | 36.7 gso/s, 506 ulp |
364
+ | `nk_dots_packed_f16_sme` | 461 gso/s, 14.8 ulp | 484 gso/s, 28.2 ulp | 476 gso/s, 28.2 ulp |
365
+ | `nk_dots_symmetric_f16_sme` | 98.6 gso/s, 12.1 ulp | 95.2 gso/s, 23.8 ulp | 88.4 gso/s, 24.4 ulp |
366
+ | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
367
+ | `nk_dots_packed_e5m2_serial` | 12.1 gso/s, 0 ulp | 11.8 gso/s, 0 ulp | 13.7 gso/s, 0 ulp |
368
+ | `nk_dots_symmetric_e5m2_serial` | 7.88 gso/s, 0 ulp | 8.23 gso/s, 0 ulp | 8.31 gso/s, 0 ulp |
369
+ | `nk_dots_packed_e5m2_neonfhm` | 95.2 gso/s, 0 ulp | 98.3 gso/s, 0 ulp | 99.3 gso/s, 0 ulp |
370
+ | `nk_dots_symmetric_e5m2_neonfhm` | 61.0 gso/s, 0 ulp | 67.7 gso/s, 0 ulp | 76.3 gso/s, 0 ulp |
371
+ | `nk_dots_packed_e5m2_sme` | 327 gso/s, 0 ulp | 1,120 gso/s, 0 ulp | 552 gso/s, 0 ulp |
372
+ | `nk_dots_symmetric_e5m2_sme` | 70.4 gso/s, 0 ulp | 66.3 gso/s, 0 ulp | 120 gso/s, 0 ulp |
373
+ | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
374
+ | `nk_dots_packed_e4m3_serial` | 1.19 gso/s, 0 ulp | 1.20 gso/s, 0 ulp | 1.20 gso/s, 0 ulp |
375
+ | `nk_dots_symmetric_e4m3_serial` | 1.23 gso/s, 0 ulp | 1.24 gso/s, 0 ulp | 1.24 gso/s, 0.0001 ulp |
376
+ | `nk_dots_packed_e4m3_neonfhm` | 28.7 gso/s, 0 ulp | 29.6 gso/s, 0 ulp | 26.7 gso/s, 0 ulp |
377
+ | `nk_dots_symmetric_e4m3_neonfhm` | 36.1 gso/s, 0 ulp | 37.4 gso/s, 0 ulp | 37.6 gso/s, 0 ulp |
378
+ | `nk_dots_packed_e4m3_sme` | 168 gso/s, 0 ulp | 219 gso/s, 0 ulp | 181 gso/s, 0 ulp |
379
+ | `nk_dots_symmetric_e4m3_sme` | 25.6 gso/s, 0 ulp | 23.2 gso/s, 0 ulp | 35.1 gso/s, 0 ulp |
380
+ | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
381
+ | `nk_dots_packed_e3m2_serial` | 12.0 gso/s, 0 ulp | 11.7 gso/s, 0 ulp | 11.4 gso/s, 0 ulp |
382
+ | `nk_dots_symmetric_e3m2_serial` | 7.56 gso/s, 0 ulp | 7.75 gso/s, 0 ulp | 7.76 gso/s, 0 ulp |
383
+ | `nk_dots_packed_e3m2_sme` | 15.4 gso/s, 0 ulp | 11.9 gso/s, 0 ulp | 12.3 gso/s, 0 ulp |
384
+ | `nk_dots_symmetric_e3m2_sme` | 1.59 gso/s, 0 ulp | 1.67 gso/s, 0 ulp | 1.79 gso/s, 0 ulp |
385
+ | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
386
+ | `nk_dots_packed_e2m3_serial` | 13.3 gso/s, 0 ulp | 11.5 gso/s, 0 ulp | 14.6 gso/s, 0 ulp |
387
+ | `nk_dots_symmetric_e2m3_serial` | 7.85 gso/s, 0 ulp | 8.23 gso/s, 0 ulp | 8.30 gso/s, 0 ulp |
388
+ | `nk_dots_packed_e2m3_sme` | 518 gso/s, 0 ulp | 740 gso/s, 0 ulp | 1,017 gso/s, 0 ulp |
389
+ | `nk_dots_symmetric_e2m3_sme` | 65.1 gso/s, 0 ulp | 79.6 gso/s, 0 ulp | 98.4 gso/s, 0 ulp |
390
+ | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
391
+ | `nk_dots_packed_i8_serial` | 17.0 gso/s | 17.3 gso/s | 17.3 gso/s |
392
+ | `nk_dots_symmetric_i8_serial` | 13.9 gso/s | 14.1 gso/s | 14.3 gso/s |
393
+ | `nk_dots_packed_i8_neonsdot` | 301 gso/s | 365 gso/s | 389 gso/s |
394
+ | `nk_dots_symmetric_i8_neonsdot` | 72.8 gso/s | 79.7 gso/s | 79.9 gso/s |
395
+ | `nk_dots_packed_i8_sme` | 854 gso/s | 904 gso/s | 1,206 gso/s |
396
+ | `nk_dots_symmetric_i8_sme` | 156 gso/s | 195 gso/s | 154 gso/s |
397
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
398
+ | `nk_dots_packed_u8_serial` | 15.4 gso/s | 16.0 gso/s | 16.0 gso/s |
399
+ | `nk_dots_symmetric_u8_serial` | 15.8 gso/s | 16.2 gso/s | 16.2 gso/s |
400
+ | `nk_dots_packed_u8_neonsdot` | 329 gso/s | 401 gso/s | 387 gso/s |
401
+ | `nk_dots_symmetric_u8_neonsdot` | 72.5 gso/s | 79.8 gso/s | 80.4 gso/s |
402
+ | `nk_dots_packed_u8_sme` | 907 gso/s | 970 gso/s | 1,224 gso/s |
403
+ | `nk_dots_symmetric_u8_sme` | 167 gso/s | 204 gso/s | 163 gso/s |
404
+ | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
405
+ | `nk_dots_packed_i4_serial` | 17.3 gso/s | 17.6 gso/s | 18.9 gso/s |
406
+ | `nk_dots_symmetric_i4_serial` | 15.1 gso/s | 15.3 gso/s | 15.3 gso/s |
407
+ | `nk_dots_packed_i4_neonsdot` | 255 gso/s | 278 gso/s | 283 gso/s |
408
+ | `nk_dots_symmetric_i4_neonsdot` | 117 gso/s | 144 gso/s | 149 gso/s |
409
+ | `nk_dots_packed_i4_sme` | 1,028 gso/s | 960 gso/s | 1,105 gso/s |
410
+ | `nk_dots_symmetric_i4_sme` | 310 gso/s | 249 gso/s | 393 gso/s |
411
+ | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
412
+ | `nk_dots_packed_u4_serial` | 19.1 gso/s | 19.7 gso/s | 19.4 gso/s |
413
+ | `nk_dots_symmetric_u4_serial` | 16.1 gso/s | 16.4 gso/s | 16.1 gso/s |
414
+ | `nk_dots_packed_u4_neonsdot` | 290 gso/s | 320 gso/s | 328 gso/s |
415
+ | `nk_dots_symmetric_u4_neonsdot` | 125 gso/s | 144 gso/s | 149 gso/s |
416
+ | `nk_dots_packed_u4_sme` | 1,230 gso/s | 1,013 gso/s | 1,246 gso/s |
417
+ | `nk_dots_symmetric_u4_sme` | 340 gso/s | 285 gso/s | 401 gso/s |
418
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
419
+ | `nk_dots_packed_u1_serial` | 342 gso/s | 429 gso/s | 447 gso/s |
420
+ | `nk_dots_symmetric_u1_serial` | 226 gso/s | 342 gso/s | 357 gso/s |
421
+ | `nk_dots_packed_u1_neon` | 795 gso/s | 931 gso/s | 983 gso/s |
422
+ | `nk_dots_symmetric_u1_neon` | 290 gso/s | 483 gso/s | 561 gso/s |
423
+ | `nk_dots_packed_u1_smebi32` | 998 gso/s | 4,251 gso/s | 7,369 gso/s |
424
+ | `nk_dots_symmetric_u1_smebi32` | 53.7 gso/s | 278 gso/s | 548 gso/s |
425
+
426
+ #### WASM
427
+
428
+ Measured with Wasmtime v42 (Cranelift backend). Cells shown as `?` in the table below mark values for which no measurement is available.
429
+
430
+ | Kernel | 256³ | 1024³ | 4096³ |
431
+ | :----------------------------------- | -----------------------: | -----------------------: | -----------------------: |
432
+ | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
433
+ | `nk_dots_packed_f64_serial` | 2.11 gso/s, 3 ulp | 4.67 gso/s, 5 ulp | 5.37 gso/s, 2.2 ulp |
434
+ | `nk_dots_symmetric_f64_serial` | 1.89 gso/s, 4 ulp | 3.21 gso/s, 3 ulp | 5.62 gso/s, 2.4 ulp |
435
+ | `nk_dots_packed_f64_v128relaxed` | 34.9 gso/s, 32.4 ulp | 32.5 gso/s, 32.4 ulp | 38.4 gso/s, 32.4 ulp |
436
+ | `nk_dots_symmetric_f64_v128relaxed` | 10.2 gso/s, 37.6 ulp | 10.4 gso/s, 37.6 ulp | 10.9 gso/s, 37.6 ulp |
437
+ | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
438
+ | `nk_dots_packed_f32_serial` | 9.09 gso/s, 19 ulp | 17.2 gso/s, 30 ulp | 26.3 gso/s, 41.7 ulp |
439
+ | `nk_dots_symmetric_f32_serial` | 6.90 gso/s, 20 ulp | 18.2 gso/s, 29 ulp | 18.2 gso/s, 58.8 ulp |
440
+ | `nk_dots_packed_f32_v128relaxed` | 61.5 gso/s, 44.1 ulp | 65.7 gso/s, 44.1 ulp | 68.0 gso/s, 44.1 ulp |
441
+ | `nk_dots_symmetric_f32_v128relaxed` | 19.5 gso/s, 48.2 ulp | 20.3 gso/s, 48.2 ulp | 20.4 gso/s, 48.2 ulp |
442
+ | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
443
+ | `nk_dots_packed_bf16_serial` | 7.36 gso/s, 0.1 ulp | 17.8 gso/s, 0.5 ulp | 21.2 gso/s, 1.3 ulp |
444
+ | `nk_dots_symmetric_bf16_serial` | 8.24 gso/s, 0 ulp | 26.9 gso/s, 0.6 ulp | 26.9 gso/s, 1.1 ulp |
445
+ | `nk_dots_packed_bf16_v128relaxed` | 52.7 gso/s, 1.4 ulp | 55.1 gso/s, 1.4 ulp | 59.0 gso/s, 1.4 ulp |
446
+ | `nk_dots_symmetric_bf16_v128relaxed` | 16.8 gso/s, 1.3 ulp | 16.3 gso/s, 1.3 ulp | 18.0 gso/s, 1.3 ulp |
447
+ | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
448
+ | `nk_dots_packed_f16_serial` | 0.615 gso/s, 204 ulp | 1.42 gso/s, 36 ulp | 1.67 gso/s, 25.9 ulp |
449
+ | `nk_dots_symmetric_f16_serial` | 0.528 gso/s, 13 ulp | 1.38 gso/s, 29 ulp | 1.46 gso/s, 27.9 ulp |
450
+ | `nk_dots_packed_f16_v128relaxed` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
451
+ | `nk_dots_symmetric_f16_v128relaxed` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
452
+ | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
453
+ | `nk_dots_packed_e5m2_serial` | 1.96 gso/s, 0 ulp | 4.57 gso/s, 0 ulp | 5.72 gso/s, 0 ulp |
454
+ | `nk_dots_symmetric_e5m2_serial` | 2.78 gso/s, 0 ulp | 7.59 gso/s, 0 ulp | 8.02 gso/s, 0 ulp |
455
+ | `nk_dots_packed_e5m2_v128relaxed` | 11.5 gso/s, ? ulp | 11.6 gso/s, ? ulp | 11.6 gso/s, ? ulp |
456
+ | `nk_dots_symmetric_e5m2_v128relaxed` | 11.7 gso/s, 0 ulp | 12.2 gso/s, 0 ulp | 12.3 gso/s, 0 ulp |
457
+ | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
458
+ | `nk_dots_packed_e4m3_serial` | 0.340 gso/s, 0 ulp | 0.717 gso/s, 0 ulp | 0.864 gso/s, 0 ulp |
459
+ | `nk_dots_symmetric_e4m3_serial` | 0.331 gso/s, 0 ulp | 0.822 gso/s, 0 ulp | 0.874 gso/s, 0 ulp |
460
+ | `nk_dots_packed_e4m3_v128relaxed` | 9.44 gso/s, ? ulp | 9.57 gso/s, ? ulp | 9.58 gso/s, ? ulp |
461
+ | `nk_dots_symmetric_e4m3_v128relaxed` | 9.46 gso/s, 0 ulp | 10.0 gso/s, 0 ulp | 10.2 gso/s, 0 ulp |
462
+ | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
463
+ | `nk_dots_packed_e3m2_serial` | 1.90 gso/s, 0 ulp | 4.38 gso/s, 0 ulp | 5.66 gso/s, 0 ulp |
464
+ | `nk_dots_symmetric_e3m2_serial` | 2.72 gso/s, 0 ulp | 7.33 gso/s, 0 ulp | 7.70 gso/s, 0 ulp |
465
+ | `nk_dots_packed_e3m2_v128relaxed` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
466
+ | `nk_dots_symmetric_e3m2_v128relaxed` | ? gso/s, ? ulp | ? gso/s, ? ulp | ? gso/s, ? ulp |
467
+ | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
468
+ | `nk_dots_packed_e2m3_serial` | 1.93 gso/s, 0 ulp | 4.42 gso/s, 0 ulp | 5.66 gso/s, 0 ulp |
469
+ | `nk_dots_symmetric_e2m3_serial` | 2.71 gso/s, 0 ulp | 7.31 gso/s, 0 ulp | 7.70 gso/s, 0 ulp |
470
+ | `nk_dots_packed_e2m3_v128relaxed` | 34.8 gso/s, 0 ulp | 35.0 gso/s, 0 ulp | 38.6 gso/s, 0 ulp |
471
+ | `nk_dots_symmetric_e2m3_v128relaxed` | 32.8 gso/s, 0 ulp | 35.3 gso/s, 0 ulp | 38.2 gso/s, 0 ulp |
472
+ | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
473
+ | `nk_dots_packed_i8_serial` | 4.60 gso/s | 11.1 gso/s | 13.8 gso/s |
474
+ | `nk_dots_symmetric_i8_serial` | 6.57 gso/s | 17.2 gso/s | 18.4 gso/s |
475
+ | `nk_dots_packed_i8_v128relaxed` | 47.1 gso/s | 48.8 gso/s | 52.7 gso/s |
476
+ | `nk_dots_symmetric_i8_v128relaxed` | 43.1 gso/s | 41.8 gso/s | 52.7 gso/s |
477
+ | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
478
+ | `nk_dots_packed_u8_serial` | 4.67 gso/s | 11.4 gso/s | 14.1 gso/s |
479
+ | `nk_dots_symmetric_u8_serial` | 7.18 gso/s | 17.0 gso/s | 18.6 gso/s |
480
+ | `nk_dots_packed_u8_v128relaxed` | 24.5 gso/s | 25.3 gso/s | 25.6 gso/s |
481
+ | `nk_dots_symmetric_u8_v128relaxed` | 20.3 gso/s | 23.5 gso/s | 24.4 gso/s |
482
+ | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
483
+ | `nk_dots_packed_i4_serial` | 7.06 gso/s | 18.5 gso/s | 19.7 gso/s |
484
+ | `nk_dots_symmetric_i4_serial` | 5.43 gso/s | 12.5 gso/s | 13.1 gso/s |
485
+ | `nk_dots_packed_i4_v128relaxed` | 26.2 gso/s | 24.3 gso/s | 22.8 gso/s |
486
+ | `nk_dots_symmetric_i4_v128relaxed` | 54.2 gso/s | 67.9 gso/s | 73.4 gso/s |
487
+ | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
488
+ | `nk_dots_packed_u4_serial` | 5.17 gso/s | 14.0 gso/s | 15.0 gso/s |
489
+ | `nk_dots_symmetric_u4_serial` | 5.26 gso/s | 11.5 gso/s | 12.1 gso/s |
490
+ | `nk_dots_packed_u4_v128relaxed` | 78.0 gso/s | 82.7 gso/s | 84.0 gso/s |
491
+ | `nk_dots_symmetric_u4_v128relaxed` | 66.9 gso/s | 81.4 gso/s | 85.3 gso/s |
492
+ | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
493
+ | `nk_dots_packed_u1_serial` | 209 gso/s | 250 gso/s | 270 gso/s |
494
+ | `nk_dots_symmetric_u1_serial` | 165 gso/s | 314 gso/s | 395 gso/s |
495
+ | `nk_dots_packed_u1_v128relaxed` | 603 gso/s | 736 gso/s | 872 gso/s |
496
+ | `nk_dots_symmetric_u1_v128relaxed` | 182 gso/s | 359 gso/s | 497 gso/s |