numkong 7.0.0 → 7.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +239 -122
  2. package/binding.gyp +25 -491
  3. package/c/dispatch_bf16.c +59 -1
  4. package/c/dispatch_e2m3.c +41 -8
  5. package/c/dispatch_e3m2.c +49 -8
  6. package/c/dispatch_e4m3.c +51 -9
  7. package/c/dispatch_e5m2.c +45 -1
  8. package/c/dispatch_f16.c +79 -26
  9. package/c/dispatch_f16c.c +5 -5
  10. package/c/dispatch_f32.c +56 -0
  11. package/c/dispatch_f64.c +52 -0
  12. package/c/dispatch_i4.c +3 -0
  13. package/c/dispatch_i8.c +62 -3
  14. package/c/dispatch_other.c +18 -0
  15. package/c/dispatch_u1.c +54 -9
  16. package/c/dispatch_u4.c +3 -0
  17. package/c/dispatch_u8.c +64 -3
  18. package/c/numkong.c +3 -0
  19. package/include/README.md +79 -9
  20. package/include/numkong/attention/sapphireamx.h +278 -276
  21. package/include/numkong/attention/sme.h +983 -977
  22. package/include/numkong/attention.h +1 -1
  23. package/include/numkong/capabilities.h +289 -94
  24. package/include/numkong/cast/README.md +40 -40
  25. package/include/numkong/cast/diamond.h +64 -0
  26. package/include/numkong/cast/haswell.h +42 -194
  27. package/include/numkong/cast/icelake.h +42 -37
  28. package/include/numkong/cast/loongsonasx.h +252 -0
  29. package/include/numkong/cast/neon.h +216 -249
  30. package/include/numkong/cast/powervsx.h +449 -0
  31. package/include/numkong/cast/rvv.h +223 -274
  32. package/include/numkong/cast/sapphire.h +18 -18
  33. package/include/numkong/cast/serial.h +1018 -944
  34. package/include/numkong/cast/skylake.h +82 -23
  35. package/include/numkong/cast/v128relaxed.h +462 -105
  36. package/include/numkong/cast.h +24 -0
  37. package/include/numkong/cast.hpp +44 -0
  38. package/include/numkong/curved/README.md +17 -17
  39. package/include/numkong/curved/neon.h +131 -7
  40. package/include/numkong/curved/neonbfdot.h +6 -7
  41. package/include/numkong/curved/rvv.h +26 -26
  42. package/include/numkong/curved/smef64.h +186 -182
  43. package/include/numkong/curved.h +14 -18
  44. package/include/numkong/dot/README.md +154 -137
  45. package/include/numkong/dot/alder.h +43 -43
  46. package/include/numkong/dot/diamond.h +158 -0
  47. package/include/numkong/dot/genoa.h +4 -30
  48. package/include/numkong/dot/haswell.h +215 -180
  49. package/include/numkong/dot/icelake.h +190 -76
  50. package/include/numkong/dot/loongsonasx.h +671 -0
  51. package/include/numkong/dot/neon.h +124 -73
  52. package/include/numkong/dot/neonbfdot.h +11 -12
  53. package/include/numkong/dot/neonfhm.h +44 -46
  54. package/include/numkong/dot/neonfp8.h +323 -0
  55. package/include/numkong/dot/neonsdot.h +190 -76
  56. package/include/numkong/dot/powervsx.h +752 -0
  57. package/include/numkong/dot/rvv.h +92 -84
  58. package/include/numkong/dot/rvvbf16.h +12 -12
  59. package/include/numkong/dot/rvvhalf.h +12 -12
  60. package/include/numkong/dot/sapphire.h +4 -4
  61. package/include/numkong/dot/serial.h +66 -30
  62. package/include/numkong/dot/sierra.h +31 -31
  63. package/include/numkong/dot/skylake.h +142 -110
  64. package/include/numkong/dot/sve.h +217 -177
  65. package/include/numkong/dot/svebfdot.h +10 -10
  66. package/include/numkong/dot/svehalf.h +85 -41
  67. package/include/numkong/dot/svesdot.h +89 -0
  68. package/include/numkong/dot/v128relaxed.h +124 -89
  69. package/include/numkong/dot.h +114 -48
  70. package/include/numkong/dots/README.md +203 -203
  71. package/include/numkong/dots/alder.h +12 -9
  72. package/include/numkong/dots/diamond.h +86 -0
  73. package/include/numkong/dots/genoa.h +10 -4
  74. package/include/numkong/dots/haswell.h +63 -48
  75. package/include/numkong/dots/icelake.h +27 -18
  76. package/include/numkong/dots/loongsonasx.h +176 -0
  77. package/include/numkong/dots/neon.h +14 -11
  78. package/include/numkong/dots/neonbfdot.h +4 -3
  79. package/include/numkong/dots/neonfhm.h +11 -9
  80. package/include/numkong/dots/neonfp8.h +99 -0
  81. package/include/numkong/dots/neonsdot.h +48 -12
  82. package/include/numkong/dots/powervsx.h +194 -0
  83. package/include/numkong/dots/rvv.h +451 -344
  84. package/include/numkong/dots/sapphireamx.h +1028 -984
  85. package/include/numkong/dots/serial.h +213 -197
  86. package/include/numkong/dots/sierra.h +10 -7
  87. package/include/numkong/dots/skylake.h +47 -36
  88. package/include/numkong/dots/sme.h +2001 -2364
  89. package/include/numkong/dots/smebi32.h +175 -162
  90. package/include/numkong/dots/smef64.h +328 -323
  91. package/include/numkong/dots/v128relaxed.h +64 -41
  92. package/include/numkong/dots.h +573 -293
  93. package/include/numkong/dots.hpp +45 -43
  94. package/include/numkong/each/README.md +133 -137
  95. package/include/numkong/each/haswell.h +6 -6
  96. package/include/numkong/each/icelake.h +7 -7
  97. package/include/numkong/each/neon.h +76 -42
  98. package/include/numkong/each/neonbfdot.h +11 -12
  99. package/include/numkong/each/neonhalf.h +24 -116
  100. package/include/numkong/each/rvv.h +28 -28
  101. package/include/numkong/each/sapphire.h +27 -161
  102. package/include/numkong/each/serial.h +6 -6
  103. package/include/numkong/each/skylake.h +7 -7
  104. package/include/numkong/each/v128relaxed.h +562 -0
  105. package/include/numkong/each.h +148 -62
  106. package/include/numkong/each.hpp +2 -2
  107. package/include/numkong/geospatial/README.md +18 -18
  108. package/include/numkong/geospatial/haswell.h +365 -325
  109. package/include/numkong/geospatial/neon.h +350 -306
  110. package/include/numkong/geospatial/rvv.h +4 -4
  111. package/include/numkong/geospatial/skylake.h +376 -340
  112. package/include/numkong/geospatial/v128relaxed.h +366 -327
  113. package/include/numkong/geospatial.h +17 -17
  114. package/include/numkong/matrix.hpp +4 -4
  115. package/include/numkong/maxsim/README.md +14 -14
  116. package/include/numkong/maxsim/alder.h +6 -6
  117. package/include/numkong/maxsim/genoa.h +4 -4
  118. package/include/numkong/maxsim/haswell.h +6 -6
  119. package/include/numkong/maxsim/icelake.h +18 -18
  120. package/include/numkong/maxsim/neonsdot.h +21 -21
  121. package/include/numkong/maxsim/sapphireamx.h +14 -14
  122. package/include/numkong/maxsim/serial.h +6 -6
  123. package/include/numkong/maxsim/sme.h +221 -196
  124. package/include/numkong/maxsim/v128relaxed.h +6 -6
  125. package/include/numkong/mesh/README.md +62 -56
  126. package/include/numkong/mesh/haswell.h +339 -464
  127. package/include/numkong/mesh/neon.h +1100 -519
  128. package/include/numkong/mesh/neonbfdot.h +36 -68
  129. package/include/numkong/mesh/rvv.h +530 -435
  130. package/include/numkong/mesh/serial.h +75 -91
  131. package/include/numkong/mesh/skylake.h +1627 -302
  132. package/include/numkong/mesh/v128relaxed.h +443 -330
  133. package/include/numkong/mesh.h +63 -49
  134. package/include/numkong/mesh.hpp +4 -4
  135. package/include/numkong/numkong.h +3 -3
  136. package/include/numkong/numkong.hpp +1 -0
  137. package/include/numkong/probability/README.md +23 -19
  138. package/include/numkong/probability/neon.h +82 -52
  139. package/include/numkong/probability/rvv.h +28 -23
  140. package/include/numkong/probability/serial.h +51 -39
  141. package/include/numkong/probability.h +20 -23
  142. package/include/numkong/random.h +1 -1
  143. package/include/numkong/reduce/README.md +143 -138
  144. package/include/numkong/reduce/alder.h +81 -77
  145. package/include/numkong/reduce/haswell.h +222 -220
  146. package/include/numkong/reduce/neon.h +629 -519
  147. package/include/numkong/reduce/neonbfdot.h +7 -218
  148. package/include/numkong/reduce/neonfhm.h +9 -381
  149. package/include/numkong/reduce/neonsdot.h +9 -9
  150. package/include/numkong/reduce/rvv.h +928 -802
  151. package/include/numkong/reduce/serial.h +23 -27
  152. package/include/numkong/reduce/sierra.h +20 -20
  153. package/include/numkong/reduce/skylake.h +326 -324
  154. package/include/numkong/reduce/v128relaxed.h +52 -52
  155. package/include/numkong/reduce.h +4 -23
  156. package/include/numkong/reduce.hpp +156 -11
  157. package/include/numkong/scalar/README.md +6 -6
  158. package/include/numkong/scalar/haswell.h +26 -17
  159. package/include/numkong/scalar/loongsonasx.h +74 -0
  160. package/include/numkong/scalar/neon.h +9 -9
  161. package/include/numkong/scalar/powervsx.h +96 -0
  162. package/include/numkong/scalar/rvv.h +2 -2
  163. package/include/numkong/scalar/sapphire.h +21 -10
  164. package/include/numkong/scalar/serial.h +21 -21
  165. package/include/numkong/scalar.h +13 -0
  166. package/include/numkong/set/README.md +28 -28
  167. package/include/numkong/set/haswell.h +12 -12
  168. package/include/numkong/set/icelake.h +14 -14
  169. package/include/numkong/set/loongsonasx.h +181 -0
  170. package/include/numkong/set/neon.h +17 -18
  171. package/include/numkong/set/powervsx.h +326 -0
  172. package/include/numkong/set/rvv.h +4 -4
  173. package/include/numkong/set/serial.h +6 -6
  174. package/include/numkong/set/sve.h +60 -59
  175. package/include/numkong/set/v128relaxed.h +6 -6
  176. package/include/numkong/set.h +21 -7
  177. package/include/numkong/sets/README.md +26 -26
  178. package/include/numkong/sets/loongsonasx.h +52 -0
  179. package/include/numkong/sets/powervsx.h +65 -0
  180. package/include/numkong/sets/smebi32.h +395 -364
  181. package/include/numkong/sets.h +83 -40
  182. package/include/numkong/sparse/README.md +4 -4
  183. package/include/numkong/sparse/icelake.h +101 -101
  184. package/include/numkong/sparse/serial.h +1 -1
  185. package/include/numkong/sparse/sve2.h +137 -141
  186. package/include/numkong/sparse/turin.h +12 -12
  187. package/include/numkong/sparse.h +10 -10
  188. package/include/numkong/spatial/README.md +230 -226
  189. package/include/numkong/spatial/alder.h +113 -116
  190. package/include/numkong/spatial/diamond.h +240 -0
  191. package/include/numkong/spatial/genoa.h +0 -68
  192. package/include/numkong/spatial/haswell.h +74 -55
  193. package/include/numkong/spatial/icelake.h +539 -58
  194. package/include/numkong/spatial/loongsonasx.h +483 -0
  195. package/include/numkong/spatial/neon.h +125 -52
  196. package/include/numkong/spatial/neonbfdot.h +8 -9
  197. package/include/numkong/spatial/neonfp8.h +258 -0
  198. package/include/numkong/spatial/neonsdot.h +180 -12
  199. package/include/numkong/spatial/powervsx.h +738 -0
  200. package/include/numkong/spatial/rvv.h +146 -139
  201. package/include/numkong/spatial/rvvbf16.h +17 -12
  202. package/include/numkong/spatial/rvvhalf.h +13 -10
  203. package/include/numkong/spatial/serial.h +13 -12
  204. package/include/numkong/spatial/sierra.h +232 -39
  205. package/include/numkong/spatial/skylake.h +73 -74
  206. package/include/numkong/spatial/sve.h +93 -72
  207. package/include/numkong/spatial/svebfdot.h +29 -29
  208. package/include/numkong/spatial/svehalf.h +52 -26
  209. package/include/numkong/spatial/svesdot.h +142 -0
  210. package/include/numkong/spatial/v128relaxed.h +293 -41
  211. package/include/numkong/spatial.h +338 -82
  212. package/include/numkong/spatials/README.md +194 -194
  213. package/include/numkong/spatials/diamond.h +82 -0
  214. package/include/numkong/spatials/haswell.h +2 -2
  215. package/include/numkong/spatials/loongsonasx.h +153 -0
  216. package/include/numkong/spatials/neonfp8.h +111 -0
  217. package/include/numkong/spatials/neonsdot.h +34 -0
  218. package/include/numkong/spatials/powervsx.h +153 -0
  219. package/include/numkong/spatials/rvv.h +259 -243
  220. package/include/numkong/spatials/sapphireamx.h +173 -173
  221. package/include/numkong/spatials/serial.h +2 -2
  222. package/include/numkong/spatials/skylake.h +2 -2
  223. package/include/numkong/spatials/sme.h +590 -605
  224. package/include/numkong/spatials/smef64.h +139 -130
  225. package/include/numkong/spatials/v128relaxed.h +2 -2
  226. package/include/numkong/spatials.h +820 -500
  227. package/include/numkong/spatials.hpp +49 -48
  228. package/include/numkong/tensor.hpp +406 -17
  229. package/include/numkong/trigonometry/README.md +19 -19
  230. package/include/numkong/trigonometry/haswell.h +402 -401
  231. package/include/numkong/trigonometry/neon.h +386 -387
  232. package/include/numkong/trigonometry/rvv.h +52 -51
  233. package/include/numkong/trigonometry/serial.h +13 -13
  234. package/include/numkong/trigonometry/skylake.h +373 -369
  235. package/include/numkong/trigonometry/v128relaxed.h +375 -374
  236. package/include/numkong/trigonometry.h +13 -13
  237. package/include/numkong/trigonometry.hpp +2 -2
  238. package/include/numkong/types.h +287 -49
  239. package/include/numkong/types.hpp +436 -12
  240. package/include/numkong/vector.hpp +82 -14
  241. package/javascript/dist/cjs/numkong-wasm.js +6 -12
  242. package/javascript/dist/cjs/numkong.d.ts +7 -1
  243. package/javascript/dist/cjs/numkong.js +37 -11
  244. package/javascript/dist/cjs/types.d.ts +9 -0
  245. package/javascript/dist/cjs/types.js +96 -0
  246. package/javascript/dist/esm/numkong-browser.d.ts +14 -0
  247. package/javascript/dist/esm/numkong-browser.js +23 -0
  248. package/javascript/dist/esm/numkong-wasm.js +6 -12
  249. package/javascript/dist/esm/numkong.d.ts +7 -1
  250. package/javascript/dist/esm/numkong.js +37 -11
  251. package/javascript/dist/esm/types.d.ts +9 -0
  252. package/javascript/dist/esm/types.js +96 -0
  253. package/javascript/node-gyp-build.d.ts +4 -1
  254. package/javascript/numkong-browser.ts +40 -0
  255. package/javascript/numkong-wasm.ts +7 -13
  256. package/javascript/numkong.c +5 -26
  257. package/javascript/numkong.ts +36 -11
  258. package/javascript/tsconfig-base.json +1 -0
  259. package/javascript/tsconfig-cjs.json +6 -1
  260. package/javascript/types.ts +110 -0
  261. package/numkong.gypi +101 -0
  262. package/package.json +34 -13
  263. package/probes/arm_neon.c +8 -0
  264. package/probes/arm_neon_bfdot.c +9 -0
  265. package/probes/arm_neon_fhm.c +9 -0
  266. package/probes/arm_neon_half.c +8 -0
  267. package/probes/arm_neon_sdot.c +9 -0
  268. package/probes/arm_neonfp8.c +9 -0
  269. package/probes/arm_sme.c +16 -0
  270. package/probes/arm_sme2.c +16 -0
  271. package/probes/arm_sme2p1.c +16 -0
  272. package/probes/arm_sme_bf16.c +16 -0
  273. package/probes/arm_sme_bi32.c +16 -0
  274. package/probes/arm_sme_f64.c +16 -0
  275. package/probes/arm_sme_fa64.c +14 -0
  276. package/probes/arm_sme_half.c +16 -0
  277. package/probes/arm_sme_lut2.c +15 -0
  278. package/probes/arm_sve.c +18 -0
  279. package/probes/arm_sve2.c +20 -0
  280. package/probes/arm_sve2p1.c +18 -0
  281. package/probes/arm_sve_bfdot.c +20 -0
  282. package/probes/arm_sve_half.c +18 -0
  283. package/probes/arm_sve_sdot.c +21 -0
  284. package/probes/loongarch_lasx.c +12 -0
  285. package/probes/power_vsx.c +12 -0
  286. package/probes/probe.js +127 -0
  287. package/probes/riscv_rvv.c +14 -0
  288. package/probes/riscv_rvv_bb.c +15 -0
  289. package/probes/riscv_rvv_bf16.c +17 -0
  290. package/probes/riscv_rvv_half.c +14 -0
  291. package/probes/wasm_v128relaxed.c +11 -0
  292. package/probes/x86_alder.c +17 -0
  293. package/probes/x86_diamond.c +17 -0
  294. package/probes/x86_genoa.c +17 -0
  295. package/probes/x86_graniteamx.c +19 -0
  296. package/probes/x86_haswell.c +11 -0
  297. package/probes/x86_icelake.c +17 -0
  298. package/probes/x86_sapphire.c +16 -0
  299. package/probes/x86_sapphireamx.c +18 -0
  300. package/probes/x86_sierra.c +17 -0
  301. package/probes/x86_skylake.c +15 -0
  302. package/probes/x86_turin.c +17 -0
  303. package/wasm/numkong-emscripten.js +2 -0
  304. package/wasm/numkong.d.ts +14 -0
  305. package/wasm/numkong.js +1124 -0
  306. package/wasm/numkong.wasm +0 -0
  307. package/include/numkong/curved/neonhalf.h +0 -212
  308. package/include/numkong/dot/neonhalf.h +0 -198
  309. package/include/numkong/dots/neonhalf.h +0 -57
  310. package/include/numkong/mesh/neonhalf.h +0 -616
  311. package/include/numkong/reduce/neonhalf.h +0 -157
  312. package/include/numkong/spatial/neonhalf.h +0 -118
  313. package/include/numkong/spatial/sapphire.h +0 -343
  314. package/include/numkong/spatials/neonhalf.h +0 -58
  315. package/javascript/README.md +0 -246
@@ -40,7 +40,7 @@
40
40
  *
41
41
  * @section references References
42
42
  *
43
- * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
43
+ * - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
44
44
  * - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
45
45
  * - Neumaier, A. (1974). "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher Summen"
46
46
  * - Ogita, T., Rump, S.M., Oishi, S. (2005). "Accurate Sum and Dot Product"
@@ -173,19 +173,16 @@ NK_PUBLIC void nk_bilinear_f32c_neon(nk_f32c_t const *a, nk_f32c_t const *b, nk_
173
173
  /** @copydoc nk_mahalanobis_f32 */
174
174
  NK_PUBLIC void nk_mahalanobis_f32_neon(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t n,
175
175
  nk_f64_t *result);
176
- #endif // NK_TARGET_NEON
177
-
178
- #if NK_TARGET_NEONHALF
179
176
  /** @copydoc nk_bilinear_f16 */
180
- NK_PUBLIC void nk_bilinear_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
181
- nk_f32_t *result);
177
+ NK_PUBLIC void nk_bilinear_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
178
+ nk_f32_t *result);
182
179
  /** @copydoc nk_bilinear_f16c */
183
- NK_PUBLIC void nk_bilinear_f16c_neonhalf(nk_f16c_t const *a, nk_f16c_t const *b, nk_f16c_t const *c, nk_size_t n,
184
- nk_f32c_t *results);
180
+ NK_PUBLIC void nk_bilinear_f16c_neon(nk_f16c_t const *a, nk_f16c_t const *b, nk_f16c_t const *c, nk_size_t n,
181
+ nk_f32c_t *results);
185
182
  /** @copydoc nk_mahalanobis_f16 */
186
- NK_PUBLIC void nk_mahalanobis_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
187
- nk_f32_t *result);
188
- #endif // NK_TARGET_NEONHALF
183
+ NK_PUBLIC void nk_mahalanobis_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n,
184
+ nk_f32_t *result);
185
+ #endif // NK_TARGET_NEON
189
186
 
190
187
  #if NK_TARGET_NEONBFDOT
191
188
  /** @copydoc nk_bilinear_bf16 */
@@ -337,7 +334,6 @@ NK_INTERNAL nk_dtype_t nk_mahalanobis_output_dtype(nk_dtype_t dtype) {
337
334
 
338
335
  #include "numkong/curved/serial.h"
339
336
  #include "numkong/curved/neon.h"
340
- #include "numkong/curved/neonhalf.h"
341
337
  #include "numkong/curved/neonbfdot.h"
342
338
  #include "numkong/curved/smef64.h"
343
339
  #include "numkong/curved/haswell.h"
@@ -382,8 +378,8 @@ NK_PUBLIC void nk_bilinear_f32(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t co
382
378
  NK_PUBLIC void nk_bilinear_f16(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t const *c, nk_size_t n, nk_f32_t *result) {
383
379
  #if NK_TARGET_HASWELL
384
380
  nk_bilinear_f16_haswell(a, b, c, n, result);
385
- #elif NK_TARGET_NEONHALF
386
- nk_bilinear_f16_neonhalf(a, b, c, n, result);
381
+ #elif NK_TARGET_NEON
382
+ nk_bilinear_f16_neon(a, b, c, n, result);
387
383
  #elif NK_TARGET_RVV
388
384
  nk_bilinear_f16_rvv(a, b, c, n, result);
389
385
  #else
@@ -432,8 +428,8 @@ NK_PUBLIC void nk_bilinear_f32c(nk_f32c_t const *a, nk_f32c_t const *b, nk_f32c_
432
428
 
433
429
  NK_PUBLIC void nk_bilinear_f16c(nk_f16c_t const *a, nk_f16c_t const *b, nk_f16c_t const *c, nk_size_t n,
434
430
  nk_f32c_t *results) {
435
- #if NK_TARGET_NEONHALF
436
- nk_bilinear_f16c_neonhalf(a, b, c, n, results);
431
+ #if NK_TARGET_NEON
432
+ nk_bilinear_f16c_neon(a, b, c, n, results);
437
433
  #else
438
434
  nk_bilinear_f16c_serial(a, b, c, n, results);
439
435
  #endif
@@ -484,8 +480,8 @@ NK_PUBLIC void nk_mahalanobis_f16(nk_f16_t const *a, nk_f16_t const *b, nk_f16_t
484
480
  nk_f32_t *result) {
485
481
  #if NK_TARGET_HASWELL
486
482
  nk_mahalanobis_f16_haswell(a, b, c, n, result);
487
- #elif NK_TARGET_NEONHALF
488
- nk_mahalanobis_f16_neonhalf(a, b, c, n, result);
483
+ #elif NK_TARGET_NEON
484
+ nk_mahalanobis_f16_neon(a, b, c, n, result);
489
485
  #elif NK_TARGET_RVV
490
486
  nk_mahalanobis_f16_rvv(a, b, c, n, result);
491
487
  #else
@@ -1,24 +1,24 @@
1
1
  # Vector-Vector Dot Products in NumKong
2
2
 
3
- NumKong implements dot products for every numeric type supported by the library, as the most important building block of higher-level functionality for vectors and higher rank tensors.
3
+ NumKong implements dot products for every numeric type supported by the library, as a core building block of higher-level functionality for vectors and higher rank tensors.
4
4
 
5
5
  Dot product for real numbers and integers is defined as:
6
6
 
7
- ```math
7
+ $$
8
8
  \text{dot}(a, b) = \sum_{i=0}^{n-1} a_i \cdot b_i
9
- ```
9
+ $$
10
10
 
11
11
  For complex numbers, the dot product expands via the distributive property of complex multiplication:
12
12
 
13
- ```math
13
+ $$
14
14
  \text{dot}(a, b) = \sum_{i=0}^{n-1} (a_{i,re} \cdot b_{i,re} - a_{i,im} \cdot b_{i,im}) + j \sum_{i=0}^{n-1} (a_{i,re} \cdot b_{i,im} + a_{i,im} \cdot b_{i,re})
15
- ```
15
+ $$
16
16
 
17
17
  The conjugate dot product negates the imaginary part of $b$:
18
18
 
19
- ```math
19
+ $$
20
20
  \text{vdot}(a, b) = \sum_{i=0}^{n-1} a_i \cdot \bar{b_i} = \sum_{i=0}^{n-1} (a_{i,re} \cdot b_{i,re} + a_{i,im} \cdot b_{i,im}) + j \sum_{i=0}^{n-1} (a_{i,im} \cdot b_{i,re} - a_{i,re} \cdot b_{i,im})
21
- ```
21
+ $$
22
22
 
23
23
  Where $\bar{b_i}$ is the complex conjugate of $b_i$.
24
24
  Reformulating as Python pseudocode for interleaved real/imaginary scalar arrays:
@@ -97,9 +97,19 @@ For Int8 × Int8, one operand is XORed with `0x80` to shift to unsigned, and the
97
97
  For Int8 × Int8, the sign bit of $b$ is cleared to produce a 7-bit value, and a windowed correction $-128 \cdot \sum_{b_i < 0} a_i$ is accumulated in Int16 and flushed every 127 iterations to prevent overflow.
98
98
  For UInt8 × UInt8, $b$ is XORed with `0x80` to shift into signed range, same as Ice Lake, with the correction $128 \cdot \sum a_i$ computed via pairwise widening adds.
99
99
 
100
+ ### Octave Decomposition for E4M3 via VNNI
101
+
102
+ `nk_dot_e4m3_icelake` splits the 4-bit E4M3 exponent into 2 "octave" bits (top) and 2 "remainder" bits (bottom).
103
+ The bottom 5 bits (2 remainder + 3 mantissa) map via `VPERMB` to u8 integers in [0, 120] — identical structure to the E2M3 $\times 16$ LUT.
104
+ A subnormal fixup replaces LUT entries for magnitude < 8 with $2 \times \text{mantissa}$ via a second masked `VPERMB`, avoiding `VPADDB` on the VPDPBUSD execution ports.
105
+ Sign is computed via `VPTERNLOGD` with immediate 0x14, fusing $(a \oplus b) \wedge \lnot \text{0x7F}$ in one instruction.
106
+ The 4 octave bins per operand produce $4 \times 4 = 16$ `VPDPBUSD` cross-products accumulated into 7 registers grouped by octave sum $k = o_a + o_b \in [0, 6]$.
107
+ Each accumulator is scaled by $2^{4k-20}$ — an exact power of two, introducing no rounding.
108
+ This processes 64 E4M3 bytes per iteration in u8, doubling the element density of the BF16 upcast path.
109
+
100
110
  ### Widening Fusion Through BFloat16 on x86
101
111
 
102
- `nk_dot_e4m3_genoa`, `nk_dot_e5m2_genoa` convert FP8 values to BF16, then accumulate via `VDPBF16PS` — repurposing Genoa's BF16 dot-product hardware for types it was never designed for.
112
+ `nk_dot_e5m2_genoa` converts FP8 values to BF16, then accumulates via `VDPBF16PS`, reusing Genoa's BF16 dot-product instruction for FP8 types.
103
113
  Each `VDPBF16PS` fuses two BF16 multiply-adds per 32-bit lane at 6-cycle throughput.
104
114
  `nk_dot_bf16c_genoa` uses the same instruction for complex BF16, preparing operands with `VPSHUFB` for lane swapping and `VPXORD` with `0x80000000` for sign flips before feeding into `VDPBF16PS`.
105
115
 
@@ -170,66 +180,68 @@ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run
170
180
  | `nk_vdot_f16c_haswell` | 24.0 gb/s, 11.1 ulp | 20.0 gb/s, 17.4 ulp | 17.1 gb/s, 29.2 ulp |
171
181
  | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
172
182
  | `dot_f64_with_blas` 🧩 | 27.8 gb/s, 6.9 ulp | 30.1 gb/s, 9.3 ulp | 15.7 gb/s, 20 ulp |
173
- | `nk_dot_f64_serial` | 5.41 gb/s, 2.2 ulp | 6.31 gb/s, 2.0 ulp | 6.77 gb/s, 3.3 ulp |
174
- | `nk_dot_f64_haswell` | 21.9 gb/s, 0 ulp | 26.1 gb/s, 0 ulp | 12.4 gb/s, 0 ulp |
175
- | `nk_dot_f64_skylake` | 23.9 gb/s, 0 ulp | 27.0 gb/s, 0 ulp | 16.4 gb/s, 0 ulp |
183
+ | `nk_dot_f64_serial` | 4.28 gb/s, 2.2 ulp | 4.39 gb/s, 2.0 ulp | 4.42 gb/s, 3.3 ulp |
184
+ | `nk_dot_f64_haswell` | 24.2 gb/s, 0 ulp | 25.7 gb/s, 0 ulp | 18.3 gb/s, 0 ulp |
185
+ | `nk_dot_f64_skylake` | 29.0 gb/s, 0 ulp | 28.6 gb/s, 0 ulp | 24.9 gb/s, 0 ulp |
176
186
  | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
177
187
  | `dot_f32_with_blas` 🧩 | 47.8 gb/s, 14 ulp | 30.7 gb/s, 14 ulp | 29.7 gb/s, 15 ulp |
178
- | `nk_dot_f32_serial` | 11.3 gb/s, 0 ulp | 11.7 gb/s, 0 ulp | 10.7 gb/s, 0 ulp |
179
- | `nk_dot_f32_haswell` | 28.0 gb/s, 0 ulp | 23.6 gb/s, 0 ulp | 21.3 gb/s, 0 ulp |
180
- | `nk_dot_f32_skylake` | 36.3 gb/s, 0 ulp | 29.2 gb/s, 0 ulp | 23.7 gb/s, 0 ulp |
188
+ | `nk_dot_f32_serial` | 11.0 gb/s, 0 ulp | 11.2 gb/s, 0 ulp | 11.5 gb/s, 0 ulp |
189
+ | `nk_dot_f32_haswell` | 30.5 gb/s, 0 ulp | 23.9 gb/s, 0 ulp | 24.4 gb/s, 0 ulp |
190
+ | `nk_dot_f32_skylake` | 44.2 gb/s, 0 ulp | 29.8 gb/s, 0 ulp | 30.0 gb/s, 0 ulp |
181
191
  | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
182
- | `nk_dot_bf16_serial` | 0.655 gb/s, 0 ulp | 0.644 gb/s, 0.6 ulp | 0.651 gb/s, 5.2 ulp |
183
- | `nk_dot_bf16_haswell` | 30.1 gb/s, 0 ulp | 20.3 gb/s, 0.2 ulp | 19.4 gb/s, 41.3 ulp |
184
- | `nk_dot_bf16_skylake` | 53.6 gb/s, 0 ulp | 30.4 gb/s, 0.2 ulp | 29.6 gb/s, 21.8 ulp |
185
- | `nk_dot_bf16_genoa` | 88.1 gb/s, 0 ulp | 31.6 gb/s, 0.2 ulp | 31.1 gb/s, 2.2 ulp |
192
+ | `nk_dot_bf16_serial` | 0.633 gb/s, 0 ulp | 0.630 gb/s, 0.5 ulp | 0.638 gb/s, 5.4 ulp |
193
+ | `nk_dot_bf16_haswell` | 39.3 gb/s, 0 ulp | 25.5 gb/s, 0.2 ulp | 20.2 gb/s, 25.3 ulp |
194
+ | `nk_dot_bf16_skylake` | 62.7 gb/s, 0 ulp | 30.2 gb/s, 0.2 ulp | 29.5 gb/s, 2.3 ulp |
195
+ | `nk_dot_bf16_genoa` | 88.8 gb/s, 0 ulp | 29.7 gb/s, 0.2 ulp | 31.2 gb/s, 2.2 ulp |
186
196
  | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
187
- | `nk_dot_f16_serial` | 1.38 gb/s, 11.5 ulp | 1.37 gb/s, 33.7 ulp | 1.32 gb/s, 59.7 ulp |
188
- | `nk_dot_f16_haswell` | 30.6 gb/s, 7.0 ulp | 23.3 gb/s, 14.0 ulp | 20.0 gb/s, 29.8 ulp |
189
- | `nk_dot_f16_skylake` | 54.4 gb/s, 6.2 ulp | 31.4 gb/s, 8.6 ulp | 30.0 gb/s, 22.8 ulp |
197
+ | `nk_dot_f16_serial` | 1.31 gb/s, 11.5 ulp | 1.32 gb/s, 33.7 ulp | 1.30 gb/s, 59.7 ulp |
198
+ | `nk_dot_f16_haswell` | 31.3 gb/s, 7.0 ulp | 22.8 gb/s, 14.0 ulp | 19.8 gb/s, 29.8 ulp |
199
+ | `nk_dot_f16_skylake` | 54.9 gb/s, 6.2 ulp | 31.7 gb/s, 8.6 ulp | 30.9 gb/s, 22.8 ulp |
190
200
  | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
191
- | `nk_dot_e5m2_serial` | 1.99 gb/s, 0 ulp | 2.11 gb/s, 0 ulp | 2.13 gb/s, 0 ulp |
192
- | `nk_dot_e5m2_haswell` | 4.90 gb/s, 0 ulp | 4.87 gb/s, 0 ulp | 4.56 gb/s, 0 ulp |
193
- | `nk_dot_e5m2_skylake` | 6.34 gb/s, 0 ulp | 6.45 gb/s, 0 ulp | 6.17 gb/s, 0 ulp |
194
- | `nk_dot_e5m2_genoa` | 12.6 gb/s, 0 ulp | 12.7 gb/s, 0 ulp | 12.8 gb/s, 0 ulp |
201
+ | `nk_dot_e5m2_serial` | 1.90 gb/s, 0 ulp | 1.07 gb/s, 0 ulp | 1.08 gb/s, 0 ulp |
202
+ | `nk_dot_e5m2_haswell` | 4.92 gb/s, 0 ulp | 4.95 gb/s, 0 ulp | 4.80 gb/s, 0 ulp |
203
+ | `nk_dot_e5m2_skylake` | 6.20 gb/s, 0 ulp | 6.36 gb/s, 0 ulp | 6.25 gb/s, 0 ulp |
204
+ | `nk_dot_e5m2_genoa` | 12.1 gb/s, 0 ulp | 12.6 gb/s, 0 ulp | 12.6 gb/s, 0 ulp |
195
205
  | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
196
- | `nk_dot_e4m3_serial` | 0.797 gb/s, 0 ulp | 0.801 gb/s, 0 ulp | 0.816 gb/s, 0 ulp |
197
- | `nk_dot_e4m3_haswell` | 3.25 gb/s, 0 ulp | 3.25 gb/s, 0 ulp | 2.53 gb/s, 0 ulp |
198
- | `nk_dot_e4m3_skylake` | 4.99 gb/s, 0 ulp | 5.03 gb/s, 0 ulp | 4.94 gb/s, 0 ulp |
199
- | `nk_dot_e4m3_genoa` | 12.4 gb/s, 0 ulp | 13.0 gb/s, 0 ulp | 12.7 gb/s, 0 ulp |
206
+ | `nk_dot_e4m3_serial` | 0.762 gb/s, 0 ulp | 0.424 gb/s, 0 ulp | 0.420 gb/s, 0 ulp |
207
+ | `nk_dot_e4m3_haswell` | 3.78 gb/s, 0 ulp | 3.77 gb/s, 0 ulp | 3.75 gb/s, 0 ulp |
208
+ | `nk_dot_e4m3_skylake` | 5.10 gb/s, 0 ulp | 5.16 gb/s, 0 ulp | 5.21 gb/s, 0 ulp |
209
+ | `nk_dot_e4m3_icelake` | 13.2 gb/s, 0 ulp | 14.9 gb/s, 0 ulp | 14.7 gb/s, 0 ulp |
200
210
  | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
201
- | `nk_dot_e3m2_serial` | 2.02 gb/s, 0 ulp | 2.08 gb/s, 0 ulp | 2.14 gb/s, 0 ulp |
202
- | `nk_dot_e3m2_haswell` | 11.9 gb/s, 0 ulp | 12.0 gb/s, 0 ulp | 11.5 gb/s, 0 ulp |
203
- | `nk_dot_e3m2_icelake` | 22.6 gb/s, 0 ulp | 24.1 gb/s, 0 ulp | 22.5 gb/s, 0 ulp |
211
+ | `nk_dot_e3m2_serial` | 1.47 gb/s, 0 ulp | 1.05 gb/s, 0 ulp | 1.04 gb/s, 0 ulp |
212
+ | `nk_dot_e3m2_haswell` | 12.0 gb/s, 0 ulp | 12.2 gb/s, 0 ulp | 12.2 gb/s, 0 ulp |
213
+ | `nk_dot_e3m2_skylake` | 21.6 gb/s, 0 ulp | 23.1 gb/s, 0 ulp | 23.2 gb/s, 0 ulp |
214
+ | `nk_dot_e3m2_icelake` | 23.1 gb/s, 0 ulp | 24.3 gb/s, 0 ulp | 23.9 gb/s, 0 ulp |
204
215
  | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
205
- | `nk_dot_e2m3_serial` | 2.07 gb/s, 0 ulp | 2.05 gb/s, 0 ulp | 2.14 gb/s, 0 ulp |
206
- | `nk_dot_e2m3_haswell` | 20.0 gb/s, 0 ulp | 19.5 gb/s, 0 ulp | 18.7 gb/s, 0 ulp |
207
- | `nk_dot_e2m3_icelake` | 56.9 gb/s, 0 ulp | 43.1 gb/s, 0 ulp | 30.1 gb/s, 0 ulp |
208
- | `nk_dot_e2m3_alder` | 29.8 gb/s, 0 ulp | 30.2 gb/s, 0 ulp | 25.6 gb/s, 0 ulp |
216
+ | `nk_dot_e2m3_serial` | 1.87 gb/s, 0 ulp | 1.25 gb/s, 0 ulp | 1.96 gb/s, 0 ulp |
217
+ | `nk_dot_e2m3_haswell` | 20.5 gb/s, 0 ulp | 20.4 gb/s, 0 ulp | 19.3 gb/s, 0 ulp |
218
+ | `nk_dot_e2m3_skylake` | 35.7 gb/s, 0 ulp | 33.2 gb/s, 0 ulp | 30.7 gb/s, 0 ulp |
219
+ | `nk_dot_e2m3_icelake` | 58.0 gb/s, 0 ulp | 46.0 gb/s, 0 ulp | 31.5 gb/s, 0 ulp |
220
+ | `nk_dot_e2m3_alder` | 29.9 gb/s, 0 ulp | 30.8 gb/s, 0 ulp | 29.1 gb/s, 0 ulp |
209
221
  | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
210
- | `nk_dot_i8_serial` | 17.4 gb/s | 17.2 gb/s | 16.0 gb/s |
211
- | `nk_dot_i8_haswell` | 33.4 gb/s | 23.5 gb/s | 24.9 gb/s |
212
- | `nk_dot_i8_skylake` | 53.6 gb/s | 39.9 gb/s | 29.7 gb/s |
213
- | `nk_dot_i8_icelake` | 63.3 gb/s | 49.5 gb/s | 30.4 gb/s |
214
- | `nk_dot_i8_alder` | 43.8 gb/s | 43.0 gb/s | 30.4 gb/s |
222
+ | `nk_dot_i8_serial` | 16.9 gb/s | 16.8 gb/s | 15.6 gb/s |
223
+ | `nk_dot_i8_haswell` | 43.2 gb/s | 35.8 gb/s | 29.1 gb/s |
224
+ | `nk_dot_i8_skylake` | 52.9 gb/s | 36.5 gb/s | 28.5 gb/s |
225
+ | `nk_dot_i8_icelake` | 64.0 gb/s | 46.2 gb/s | 26.8 gb/s |
226
+ | `nk_dot_i8_alder` | 42.8 gb/s | 40.4 gb/s | 31.1 gb/s |
215
227
  | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
216
- | `nk_dot_u8_serial` | 17.4 gb/s | 17.1 gb/s | 16.3 gb/s |
217
- | `nk_dot_u8_haswell` | 32.2 gb/s | 37.5 gb/s | 28.3 gb/s |
218
- | `nk_dot_u8_skylake` | 54.6 gb/s | 41.0 gb/s | 28.4 gb/s |
219
- | `nk_dot_u8_icelake` | 74.4 gb/s | 48.4 gb/s | 30.3 gb/s |
220
- | `nk_dot_u8_alder` | 54.2 gb/s | 43.9 gb/s | 32.3 gb/s |
228
+ | `nk_dot_u8_serial` | 16.9 gb/s | 16.5 gb/s | 15.8 gb/s |
229
+ | `nk_dot_u8_haswell` | 47.7 gb/s | 37.7 gb/s | 29.1 gb/s |
230
+ | `nk_dot_u8_skylake` | 48.7 gb/s | 32.6 gb/s | 27.5 gb/s |
231
+ | `nk_dot_u8_icelake` | 68.4 gb/s | 46.9 gb/s | 30.2 gb/s |
232
+ | `nk_dot_u8_alder` | 42.1 gb/s | 41.8 gb/s | 31.6 gb/s |
221
233
  | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
222
234
  | `nk_dot_i4_serial` | 9.37 gb/s | 11.8 gb/s | 11.8 gb/s |
223
- | `nk_dot_i4_haswell` | 8.39 gb/s | 8.47 gb/s | 8.30 gb/s |
224
- | `nk_dot_i4_icelake` | 24.9 gb/s | 35.9 gb/s | 25.8 gb/s |
235
+ | `nk_dot_i4_haswell` | 8.22 gb/s | 8.53 gb/s | 8.23 gb/s |
236
+ | `nk_dot_i4_icelake` | 24.3 gb/s | 36.3 gb/s | 25.5 gb/s |
225
237
  | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
226
238
  | `nk_dot_u4_serial` | 10.6 gb/s | 12.0 gb/s | 11.9 gb/s |
227
- | `nk_dot_u4_haswell` | 15.2 gb/s | 16.0 gb/s | 14.4 gb/s |
228
- | `nk_dot_u4_icelake` | 49.6 gb/s | 58.3 gb/s | 29.6 gb/s |
239
+ | `nk_dot_u4_haswell` | 15.0 gb/s | 16.4 gb/s | 14.3 gb/s |
240
+ | `nk_dot_u4_icelake` | 48.1 gb/s | 64.4 gb/s | 30.9 gb/s |
229
241
  | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
230
242
  | `nk_dot_u1_serial` | 3.92 gb/s | 5.04 gb/s | 4.97 gb/s |
231
- | `nk_dot_u1_haswell` | 14.2 gb/s | 46.7 gb/s | 70.9 gb/s |
232
- | `nk_dot_u1_icelake` | 21.1 gb/s | 70.9 gb/s | 109 gb/s |
243
+ | `nk_dot_u1_haswell` | 14.7 gb/s | 43.2 gb/s | 69.4 gb/s |
244
+ | `nk_dot_u1_icelake` | 17.9 gb/s | 68.8 gb/s | 110 gb/s |
233
245
 
234
246
  #### WASM
235
247
 
@@ -293,133 +305,138 @@ Measured with Wasmtime v42 (Cranelift backend).
293
305
  | `nk_dot_u1_serial` | 1.95 gb/s | 1.53 gb/s | 0.09 gb/s |
294
306
  | `nk_dot_u1_v128relaxed` | 0.548 gb/s | 1.88 gb/s | 0.13 gb/s |
295
307
 
296
- ### Apple M4
308
+ ### Apple M5
297
309
 
298
310
  #### Native
299
311
 
300
312
  | Kernel | 256 | 1024 | 4096 |
301
313
  | :------------------------ | -----------------------: | -----------------------: | -----------------------: |
302
314
  | __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
303
- | `nk_dot_f64c_serial` | 12.5 gb/s, 5 ulp | 12.4 gb/s, 3 ulp | 11.9 gb/s, 9.7 ulp |
304
- | `nk_vdot_f64c_serial` | 12.4 gb/s, 4.2 ulp | 12.3 gb/s, 3.3 ulp | 11.7 gb/s, 3.3 ulp |
305
- | `nk_dot_f64c_neon` | 8.06 gb/s, 0 ulp | 8.05 gb/s, 0 ulp | 7.85 gb/s, 0 ulp |
306
- | `nk_vdot_f64c_neon` | 7.79 gb/s, 0 ulp | 8.05 gb/s, 0 ulp | 7.88 gb/s, 0 ulp |
315
+ | `nk_dot_f64c_serial` | 8.02 gb/s, 5 ulp | 7.30 gb/s, 3 ulp | 7.25 gb/s, 9.7 ulp |
316
+ | `nk_vdot_f64c_serial` | 8.29 gb/s, 4.2 ulp | 7.53 gb/s, 3.3 ulp | 7.38 gb/s, 3.3 ulp |
317
+ | `nk_dot_f64c_neon` | 23.7 gb/s, 0 ulp | 21.6 gb/s, 0 ulp | 21.3 gb/s, 0 ulp |
318
+ | `nk_vdot_f64c_neon` | 23.6 gb/s, 0 ulp | 21.8 gb/s, 0 ulp | 20.9 gb/s, 0 ulp |
307
319
  | __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
308
- | `nk_dot_f32c_serial` | 12.9 gb/s, 0 ulp | 12.4 gb/s, 0 ulp | 11.9 gb/s, 0 ulp |
309
- | `nk_vdot_f32c_serial` | 12.8 gb/s, 0 ulp | 12.4 gb/s, 0 ulp | 12.1 gb/s, 0 ulp |
310
- | `nk_dot_f32c_neon` | 7.97 gb/s, 0 ulp | 7.26 gb/s, 0 ulp | 7.00 gb/s, 0 ulp |
311
- | `nk_vdot_f32c_neon` | 8.29 gb/s, 0 ulp | 7.58 gb/s, 0 ulp | 7.36 gb/s, 0 ulp |
320
+ | `nk_dot_f32c_serial` | 27.8 gb/s, 0 ulp | 24.6 gb/s, 0 ulp | 23.2 gb/s, 0 ulp |
321
+ | `nk_vdot_f32c_serial` | 27.2 gb/s, 0 ulp | 24.0 gb/s, 0 ulp | 22.6 gb/s, 0 ulp |
322
+ | `nk_dot_f32c_neon` | 22.8 gb/s, 0 ulp | 18.2 gb/s, 0 ulp | 16.9 gb/s, 0 ulp |
323
+ | `nk_vdot_f32c_neon` | 22.7 gb/s, 0 ulp | 17.5 gb/s, 0 ulp | 16.7 gb/s, 0 ulp |
312
324
  | __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
313
- | `nk_dot_bf16c_serial` | 7.47 gb/s, 0.2 ulp | 6.33 gb/s, 2.8 ulp | 6.08 gb/s, 15.8 ulp |
314
- | `nk_vdot_bf16c_serial` | 7.47 gb/s, 0.2 ulp | 6.43 gb/s, 2.6 ulp | 6.08 gb/s, 11.4 ulp |
315
- | `nk_dot_bf16c_neonbfdot` | 12.9 gb/s, 0.1 ulp | 9.31 gb/s, 2 ulp | 8.56 gb/s, 8.8 ulp |
316
- | `nk_vdot_bf16c_neonbfdot` | 12.9 gb/s, 0.1 ulp | 9.29 gb/s, 1.8 ulp | 8.56 gb/s, 8.8 ulp |
325
+ | `nk_dot_bf16c_serial` | 15.6 gb/s, 0.2 ulp | 12.5 gb/s, 2.8 ulp | 12.5 gb/s, 15.8 ulp |
326
+ | `nk_vdot_bf16c_serial` | 15.9 gb/s, 0.2 ulp | 12.9 gb/s, 2.6 ulp | 11.7 gb/s, 11.4 ulp |
327
+ | `nk_dot_bf16c_neonbfdot` | 26.3 gb/s, 0.1 ulp | 18.5 gb/s, 2 ulp | 17.6 gb/s, 8.8 ulp |
328
+ | `nk_vdot_bf16c_neonbfdot` | 26.5 gb/s, 0.1 ulp | 18.2 gb/s, 1.8 ulp | 17.3 gb/s, 8.8 ulp |
317
329
  | __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
318
- | `nk_dot_f16c_serial` | 7.53 gb/s, 20.8 ulp | 6.34 gb/s, 64.1 ulp | 6.07 gb/s, 73.1 ulp |
319
- | `nk_vdot_f16c_serial` | 7.53 gb/s, 24.8 ulp | 6.34 gb/s, 31.9 ulp | 6.07 gb/s, 137 ulp |
320
- | `nk_dot_f16c_neonhalf` | 9.94 gb/s, 3.0 ulp | 7.94 gb/s, 6.5 ulp | 7.60 gb/s, 20.5 ulp |
321
- | `nk_vdot_f16c_neonhalf` | 9.85 gb/s, 34.9 ulp | 7.79 gb/s, 40.7 ulp | 7.57 gb/s, 73.1 ulp |
322
- | `nk_dot_f16c_neonfhm` | 9.39 gb/s, 3.0 ulp | 7.46 gb/s, 6.5 ulp | 7.19 gb/s, 20.5 ulp |
323
- | `nk_vdot_f16c_neonfhm` | 9.75 gb/s, 31.4 ulp | 7.50 gb/s, 38.6 ulp | 7.29 gb/s, 67.6 ulp |
330
+ | `nk_dot_f16c_serial` | 15.8 gb/s, 20.8 ulp | 13.0 gb/s, 64.1 ulp | 12.3 gb/s, 73.1 ulp |
331
+ | `nk_vdot_f16c_serial` | 15.8 gb/s, 24.8 ulp | 13.0 gb/s, 31.9 ulp | 12.3 gb/s, 137 ulp |
332
+ | `nk_dot_f16c_neonhalf` | 26.1 gb/s, 3.0 ulp | 18.4 gb/s, 6.5 ulp | 16.8 gb/s, 20.5 ulp |
333
+ | `nk_vdot_f16c_neonhalf` | 26.1 gb/s, 34.9 ulp | 18.5 gb/s, 40.7 ulp | 17.0 gb/s, 73.1 ulp |
334
+ | `nk_dot_f16c_neonfhm` | 25.3 gb/s, 3.0 ulp | 17.1 gb/s, 6.5 ulp | 15.9 gb/s, 20.5 ulp |
335
+ | `nk_vdot_f16c_neonfhm` | 25.0 gb/s, 31.4 ulp | 17.0 gb/s, 38.6 ulp | 15.8 gb/s, 67.6 ulp |
324
336
  | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
325
- | `nk_dot_f64_serial` | 7.79 gb/s, 2.4 ulp | 7.79 gb/s, 175 ulp | 7.74 gb/s, 2.7 ulp |
326
- | `nk_dot_f64_neon` | 14.8 gb/s, 0 ulp | 15.1 gb/s, 0 ulp | 14.7 gb/s, 0 ulp |
337
+ | `nk_dot_f64_serial` | 8.11 gb/s, 2.4 ulp | 8.13 gb/s, 175 ulp | 8.09 gb/s, 2.7 ulp |
338
+ | `nk_dot_f64_neon` | 44.2 gb/s, 0 ulp | 42.3 gb/s, 0 ulp | 38.4 gb/s, 0 ulp |
327
339
  | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
328
- | `nk_dot_f32_serial` | 11.0 gb/s, 0 ulp | 7.77 gb/s, 0 ulp | 7.18 gb/s, 0 ulp |
329
- | `nk_dot_f32_neon` | 9.23 gb/s, 0 ulp | 7.30 gb/s, 0 ulp | 6.96 gb/s, 0 ulp |
340
+ | `nk_dot_f32_serial` | 23.3 gb/s, 0 ulp | 15.8 gb/s, 0 ulp | 14.6 gb/s, 0 ulp |
341
+ | `nk_dot_f32_neon` | 46.4 gb/s, 0 ulp | 38.0 gb/s, 0 ulp | 34.8 gb/s, 0 ulp |
330
342
  | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
331
- | `nk_dot_bf16_serial` | 5.76 gb/s, 0 ulp | 4.10 gb/s, 0.9 ulp | 3.62 gb/s, 6 ulp |
332
- | `nk_dot_bf16_neonbfdot` | 35.1 gb/s, 0 ulp | 28.6 gb/s, 0.6 ulp | 22.9 gb/s, 4.5 ulp |
343
+ | `nk_dot_bf16_serial` | 12.4 gb/s, 0 ulp | 8.59 gb/s, 0.9 ulp | 7.36 gb/s, 6 ulp |
344
+ | `nk_dot_bf16_neon` | 39.0 gb/s, 3.7 ulp | 27.2 gb/s, 3.7 ulp | 19.9 gb/s, 3.7 ulp |
345
+ | `nk_dot_bf16_neonbfdot` | 70.8 gb/s, 0 ulp | 60.8 gb/s, 0.6 ulp | 47.8 gb/s, 4.5 ulp |
333
346
  | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
334
- | `nk_dot_f16_serial` | 5.66 gb/s, 19 ulp | 3.99 gb/s, 31.1 ulp | 3.51 gb/s, 57.8 ulp |
335
- | `nk_dot_f16_neonhalf` | 12.1 gb/s, 19.4 ulp | 9.18 gb/s, 21.5 ulp | 7.72 gb/s, 36.3 ulp |
336
- | `nk_dot_f16_neonfhm` | 16.3 gb/s, 14.9 ulp | 10.7 gb/s, 26.7 ulp | 7.95 gb/s, 39.9 ulp |
347
+ | `nk_dot_f16_serial` | 12.0 gb/s, 19 ulp | 8.33 gb/s, 31.1 ulp | 7.11 gb/s, 57.8 ulp |
348
+ | `nk_dot_f16_neon` | 35.7 gb/s, 33.4 ulp | 25.8 gb/s, 37.4 ulp | 21.3 gb/s, 23.1 ulp |
349
+ | `nk_dot_f16_neonfhm` | 48.7 gb/s, 14.9 ulp | 27.5 gb/s, 26.7 ulp | 18.8 gb/s, 39.9 ulp |
337
350
  | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
338
- | `nk_dot_e5m2_serial` | 1.87 gb/s, 0 ulp | 1.76 gb/s, 0 ulp | 1.74 gb/s, 0 ulp |
339
- | `nk_dot_e5m2_neon` | 7.15 gb/s, 0 ulp | 4.95 gb/s, 0 ulp | 4.23 gb/s, 0 ulp |
351
+ | `nk_dot_e5m2_serial` | 3.80 gb/s, 0 ulp | 3.41 gb/s, 0 ulp | 3.41 gb/s, 0 ulp |
352
+ | `nk_dot_e5m2_neon` | 19.0 gb/s, 0 ulp | 13.2 gb/s, 0 ulp | 10.5 gb/s, 0 ulp |
353
+ | `nk_dot_e5m2_neonfhm` | 25.6 gb/s, 0 ulp | 15.3 gb/s, 0 ulp | 9.55 gb/s, 0 ulp |
354
+ | `nk_dot_e5m2_neonbfdot` | 3.65 gb/s, 0 ulp | 3.82 gb/s, 0 ulp | 3.68 gb/s, 0 ulp |
340
355
  | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
341
- | `nk_dot_e4m3_serial` | 0.874 gb/s, 0 ulp | 0.873 gb/s, 0 ulp | 0.871 gb/s, 0 ulp |
342
- | `nk_dot_e4m3_neon` | 1.62 gb/s, 0 ulp | 1.62 gb/s, 0 ulp | 1.60 gb/s, 0 ulp |
356
+ | `nk_dot_e4m3_serial` | 1.74 gb/s, 0 ulp | 1.72 gb/s, 0 ulp | 1.71 gb/s, 0 ulp |
357
+ | `nk_dot_e4m3_neon` | 4.44 gb/s, 0 ulp | 4.51 gb/s, 0 ulp | 4.57 gb/s, 0 ulp |
358
+ | `nk_dot_e4m3_neonfhm` | 10.1 gb/s, 0 ulp | 8.51 gb/s, 0 ulp | 7.96 gb/s, 0 ulp |
359
+ | `nk_dot_e4m3_neonbfdot` | 3.59 gb/s, 0 ulp | 3.68 gb/s, 0 ulp | 3.64 gb/s, 0 ulp |
343
360
  | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
344
- | `nk_dot_e3m2_serial` | 1.24 gb/s, 0 ulp | 1.15 gb/s, 0 ulp | 1.13 gb/s, 0 ulp |
345
- | `nk_dot_e3m2_neonsdot` | 7.69 gb/s, 0 ulp | 7.64 gb/s, 0 ulp | 7.70 gb/s, 0 ulp |
361
+ | `nk_dot_e3m2_serial` | 2.51 gb/s, 0 ulp | 2.33 gb/s, 0 ulp | 2.24 gb/s, 0 ulp |
362
+ | `nk_dot_e3m2_neonsdot` | 20.5 gb/s, 0 ulp | 20.7 gb/s, 0 ulp | 20.1 gb/s, 0 ulp |
346
363
  | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
347
- | `nk_dot_e2m3_serial` | 1.23 gb/s, 0 ulp | 1.15 gb/s, 0 ulp | 1.13 gb/s, 0 ulp |
348
- | `nk_dot_e2m3_neonsdot` | 16.6 gb/s, 0 ulp | 16.8 gb/s, 0 ulp | 16.4 gb/s, 0 ulp |
364
+ | `nk_dot_e2m3_serial` | 2.54 gb/s, 0 ulp | 2.27 gb/s, 0 ulp | 2.29 gb/s, 0 ulp |
365
+ | `nk_dot_e2m3_neonsdot` | 47.3 gb/s, 0 ulp | 47.5 gb/s, 0 ulp | 43.4 gb/s, 0 ulp |
349
366
  | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
350
- | `nk_dot_i8_serial` | 43.2 gb/s | 48.5 gb/s | 48.0 gb/s |
351
- | `nk_dot_i8_neonsdot` | 29.8 gb/s | 29.4 gb/s | 22.9 gb/s |
367
+ | `nk_dot_i8_serial` | 115 gb/s | 102 gb/s | 92.3 gb/s |
368
+ | `nk_dot_i8_neonsdot` | 92.8 gb/s | 87.4 gb/s | 59.9 gb/s |
352
369
  | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
353
- | `nk_dot_u8_serial` | 44.1 gb/s | 45.7 gb/s | 50.0 gb/s |
354
- | `nk_dot_u8_neonsdot` | 30.6 gb/s | 28.4 gb/s | 22.3 gb/s |
370
+ | `nk_dot_u8_serial` | 110 gb/s | 99.2 gb/s | 94.9 gb/s |
371
+ | `nk_dot_u8_neonsdot` | 92.5 gb/s | 86.6 gb/s | 59.5 gb/s |
355
372
  | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
356
- | `nk_dot_i4_serial` | 11.9 gb/s | 12.4 gb/s | 12.6 gb/s |
357
- | `nk_dot_i4_neonsdot` | 19.5 gb/s | 15.5 gb/s | 11.3 gb/s |
373
+ | `nk_dot_i4_serial` | 23.0 gb/s | 24.4 gb/s | 24.2 gb/s |
374
+ | `nk_dot_i4_neonsdot` | 58.2 gb/s | 44.7 gb/s | 30.4 gb/s |
358
375
  | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
359
- | `nk_dot_u4_serial` | 12.9 gb/s | 13.7 gb/s | 13.9 gb/s |
360
- | `nk_dot_u4_neonsdot` | 21.7 gb/s | 16.1 gb/s | 11.4 gb/s |
376
+ | `nk_dot_u4_serial` | 25.3 gb/s | 27.2 gb/s | 26.9 gb/s |
377
+ | `nk_dot_u4_neonsdot` | 67.3 gb/s | 47.4 gb/s | 29.4 gb/s |
361
378
  | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
362
- | `nk_dot_u1_serial` | 3.32 gb/s | 3.56 gb/s | 3.58 gb/s |
363
- | `nk_dot_u1_neon` | 11.2 gb/s | 21.8 gb/s | 27.4 gb/s |
379
+ | `nk_dot_u1_serial` | 7.01 gb/s | 7.63 gb/s | 7.19 gb/s |
380
+ | `nk_dot_u1_neon` | 33.3 gb/s | 64.6 gb/s | 88.0 gb/s |
364
381
 
365
382
  #### WASM
366
383
 
367
- Measured with Wasmtime v42 (Cranelift backend).
384
+ Measured with Wasmtime v43 (Cranelift backend).
368
385
 
369
386
  | Kernel | 256 | 1024 | 4096 |
370
387
  | :------------------------- | -----------------------: | -----------------------: | -----------------------: |
371
388
  | __f64c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
372
- | `nk_dot_f64c_serial` | 27.7 gb/s, 3.8 ulp | 24.1 gb/s, 3.9 ulp | 26.9 gb/s, 3.2 ulp |
373
- | `nk_vdot_f64c_serial` | 5.87 gb/s, 3.8 ulp | 5.47 gb/s, 3.4 ulp | 5.66 gb/s, 15.1 ulp |
374
- | `nk_dot_f64c_v128relaxed` | 45.0 gb/s, 26 ulp | 34.0 gb/s, 42 ulp | 35.9 gb/s, 88 ulp |
375
- | `nk_vdot_f64c_v128relaxed` | 22.0 gb/s, 22.8 ulp | 19.0 gb/s, 37.3 ulp | 17.6 gb/s, 43.6 ulp |
389
+ | `nk_dot_f64c_serial` | 4.17 gb/s, 3.8 ulp | 6.03 gb/s, 3.9 ulp | 6.37 gb/s, 3.2 ulp |
390
+ | `nk_vdot_f64c_serial` | 6.00 gb/s, 3.8 ulp | 6.55 gb/s, 3.4 ulp | 6.83 gb/s, 15.1 ulp |
391
+ | `nk_dot_f64c_v128relaxed` | 46.7 gb/s, 26 ulp | 38.2 gb/s, 42 ulp | 40.5 gb/s, 88 ulp |
392
+ | `nk_vdot_f64c_v128relaxed` | 46.1 gb/s, 22.8 ulp | 39.8 gb/s, 37.3 ulp | 39.9 gb/s, 43.6 ulp |
376
393
  | __f32c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
377
- | `nk_dot_f32c_serial` | 21.3 gb/s, 0 ulp | 19.8 gb/s, 0 ulp | 20.4 gb/s, 0 ulp |
378
- | `nk_vdot_f32c_serial` | 11.1 gb/s, 0 ulp | 10.7 gb/s, 0 ulp | 10.7 gb/s, 0 ulp |
379
- | `nk_dot_f32c_v128relaxed` | 21.4 gb/s, 0 ulp | 17.2 gb/s, 0 ulp | 18.0 gb/s, 0 ulp |
380
- | `nk_vdot_f32c_v128relaxed` | 10.4 gb/s, 0 ulp | 9.47 gb/s, 0 ulp | 8.71 gb/s, 0 ulp |
394
+ | `nk_dot_f32c_serial` | 20.9 gb/s, 0 ulp | 21.3 gb/s, 0 ulp | 22.5 gb/s, 0 ulp |
395
+ | `nk_vdot_f32c_serial` | 19.9 gb/s, 0 ulp | 21.5 gb/s, 0 ulp | 22.4 gb/s, 0 ulp |
396
+ | `nk_dot_f32c_v128relaxed` | 22.4 gb/s, 0 ulp | 20.3 gb/s, 0 ulp | 19.8 gb/s, 0 ulp |
397
+ | `nk_vdot_f32c_v128relaxed` | 21.9 gb/s, 0 ulp | 20.3 gb/s, 0 ulp | 19.9 gb/s, 0 ulp |
381
398
  | __bf16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
382
- | `nk_dot_bf16c_serial` | 11.1 gb/s, 0.1 ulp | 11.2 gb/s, 2.5 ulp | 11.2 gb/s, 10 ulp |
383
- | `nk_vdot_bf16c_serial` | 5.79 gb/s, 0.2 ulp | 5.68 gb/s, 2.1 ulp | 5.80 gb/s, 11.4 ulp |
399
+ | `nk_dot_bf16c_serial` | 10.3 gb/s, 0.1 ulp | 11.6 gb/s, 2.5 ulp | 11.6 gb/s, 10 ulp |
400
+ | `nk_vdot_bf16c_serial` | 10.7 gb/s, 0.2 ulp | 11.7 gb/s, 2.1 ulp | 11.6 gb/s, 11.4 ulp |
384
401
  | __f16c__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
385
- | `nk_dot_f16c_serial` | 3.52 gb/s, 13 ulp | 3.48 gb/s, 20 ulp | 3.49 gb/s, 90 ulp |
386
- | `nk_vdot_f16c_serial` | 1.84 gb/s, 13.9 ulp | 1.79 gb/s, 35.5 ulp | 1.85 gb/s, 42.4 ulp |
402
+ | `nk_dot_f16c_serial` | 3.76 gb/s, 13 ulp | 3.83 gb/s, 20 ulp | 3.86 gb/s, 90 ulp |
403
+ | `nk_vdot_f16c_serial` | 3.81 gb/s, 13.9 ulp | 3.89 gb/s, 35.5 ulp | 3.85 gb/s, 42.4 ulp |
387
404
  | __f64__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
388
- | `nk_dot_f64_serial` | 22.2 gb/s, 2.4 ulp | 19.3 gb/s, 2.6 ulp | 21.1 gb/s, 2.2 ulp |
389
- | `nk_dot_f64_v128relaxed` | 39.6 gb/s, 2.6 ulp | 41.0 gb/s, 3.2 ulp | 32.9 gb/s, 2.6 ulp |
405
+ | `nk_dot_f64_serial` | 7.26 gb/s, 2.4 ulp | 7.45 gb/s, 2.6 ulp | 7.96 gb/s, 2.2 ulp |
406
+ | `nk_dot_f64_v128relaxed` | 38.7 gb/s, 2.6 ulp | 42.0 gb/s, 3.2 ulp | 43.9 gb/s, 2.6 ulp |
390
407
  | __f32__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
391
- | `nk_dot_f32_serial` | 17.9 gb/s, 16 ulp | 12.6 gb/s, 69 ulp | 12.7 gb/s, 104 ulp |
392
- | `nk_dot_f32_v128relaxed` | 19.5 gb/s, 0 ulp | 17.5 gb/s, 0 ulp | 17.2 gb/s, 0 ulp |
408
+ | `nk_dot_f32_serial` | 19.0 gb/s, 16 ulp | 14.6 gb/s, 69 ulp | 14.0 gb/s, 104 ulp |
409
+ | `nk_dot_f32_v128relaxed` | 20.4 gb/s, 0 ulp | 18.9 gb/s, 0 ulp | 18.7 gb/s, 0 ulp |
393
410
  | __bf16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
394
- | `nk_dot_bf16_serial` | 8.70 gb/s, 0 ulp | 6.36 gb/s, 0.6 ulp | 6.57 gb/s, 5.9 ulp |
395
- | `nk_dot_bf16_v128relaxed` | 9.07 gb/s, 0 ulp | 7.98 gb/s, 0.4 ulp | 8.24 gb/s, 3.7 ulp |
411
+ | `nk_dot_bf16_serial` | 9.53 gb/s, 0 ulp | 7.42 gb/s, 0.6 ulp | 7.20 gb/s, 5.9 ulp |
412
+ | `nk_dot_bf16_v128relaxed` | 41.9 gb/s, 0 ulp | 28.3 gb/s, 0.4 ulp | 21.5 gb/s, 3.7 ulp |
396
413
  | __f16__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
397
- | `nk_dot_f16_serial` | 3.15 gb/s, 16 ulp | 2.74 gb/s, 26 ulp | 3.14 gb/s, 53 ulp |
398
- | `nk_dot_f16_v128relaxed` | 4.78 gb/s, 9.0 ulp | 4.70 gb/s, 23 ulp | 4.92 gb/s, 39 ulp |
414
+ | `nk_dot_f16_serial` | 3.31 gb/s, 16 ulp | 3.63 gb/s, 26 ulp | 3.66 gb/s, 53 ulp |
415
+ | `nk_dot_f16_v128relaxed` | 11.4 gb/s, 9.0 ulp | 11.2 gb/s, 23 ulp | 12.0 gb/s, 39 ulp |
399
416
  | __e5m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
400
- | `nk_dot_e5m2_serial` | 2.90 gb/s, 0 ulp | 2.51 gb/s, 0 ulp | 2.88 gb/s, 0 ulp |
401
- | `nk_dot_e5m2_v128relaxed` | 3.05 gb/s, 0 ulp | 2.65 gb/s, 0 ulp | 2.99 gb/s, 0 ulp |
417
+ | `nk_dot_e5m2_serial` | 3.02 gb/s, 0 ulp | 2.95 gb/s, 0 ulp | 3.16 gb/s, 0 ulp |
418
+ | `nk_dot_e5m2_v128relaxed` | 3.47 gb/s, 0 ulp | 3.45 gb/s, 0 ulp | 3.48 gb/s, 0 ulp |
402
419
  | __e4m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
403
- | `nk_dot_e4m3_serial` | 0.903 gb/s, 0 ulp | 0.776 gb/s, 0 ulp | 0.874 gb/s, 0 ulp |
404
- | `nk_dot_e4m3_v128relaxed` | 2.42 gb/s, 0 ulp | 2.12 gb/s, 0 ulp | 2.36 gb/s, 0 ulp |
420
+ | `nk_dot_e4m3_serial` | 0.978 gb/s, 0 ulp | 0.893 gb/s, 0 ulp | 0.936 gb/s, 0 ulp |
421
+ | `nk_dot_e4m3_v128relaxed` | 2.78 gb/s, 0 ulp | 2.75 gb/s, 0 ulp | 2.78 gb/s, 0 ulp |
405
422
  | __e3m2__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
406
- | `nk_dot_e3m2_serial` | 2.90 gb/s, 0 ulp | 2.53 gb/s, 0 ulp | 2.88 gb/s, 0 ulp |
407
- | `nk_dot_e3m2_v128relaxed` | 11.8 gb/s, 0 ulp | 10.5 gb/s, 0 ulp | 11.7 gb/s, 0 ulp |
423
+ | `nk_dot_e3m2_serial` | 3.13 gb/s, 0 ulp | 2.95 gb/s, 0 ulp | 3.16 gb/s, 0 ulp |
424
+ | `nk_dot_e3m2_v128relaxed` | 12.1 gb/s, 0 ulp | 11.9 gb/s, 0 ulp | 12.6 gb/s, 0 ulp |
408
425
  | __e2m3__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
409
- | `nk_dot_e2m3_serial` | 2.90 gb/s, 0 ulp | 2.53 gb/s, 0 ulp | 2.88 gb/s, 0 ulp |
410
- | `nk_dot_e2m3_v128relaxed` | 20.0 gb/s, 0 ulp | 20.0 gb/s, 0 ulp | 20.0 gb/s, 0 ulp |
426
+ | `nk_dot_e2m3_serial` | 2.99 gb/s, 0 ulp | 3.00 gb/s, 0 ulp | 3.17 gb/s, 0 ulp |
427
+ | `nk_dot_e2m3_v128relaxed` | 20.4 gb/s, 0 ulp | 20.6 gb/s, 0 ulp | 21.7 gb/s, 0 ulp |
411
428
  | __i8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
412
- | `nk_dot_i8_serial` | 21.7 gb/s | 16.8 gb/s | 16.3 gb/s |
413
- | `nk_dot_i8_v128relaxed` | 42.0 gb/s | 47.7 gb/s | 46.3 gb/s |
429
+ | `nk_dot_i8_serial` | 22.2 gb/s | 19.6 gb/s | 17.8 gb/s |
430
+ | `nk_dot_i8_v128relaxed` | 42.0 gb/s | 49.0 gb/s | 49.7 gb/s |
414
431
  | __u8__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
415
- | `nk_dot_u8_serial` | 21.4 gb/s | 16.8 gb/s | 16.3 gb/s |
416
- | `nk_dot_u8_v128relaxed` | 43.8 gb/s | 51.2 gb/s | 49.2 gb/s |
432
+ | `nk_dot_u8_serial` | 23.0 gb/s | 19.9 gb/s | 17.8 gb/s |
433
+ | `nk_dot_u8_v128relaxed` | 29.3 gb/s | 33.0 gb/s | 35.1 gb/s |
417
434
  | __i4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
418
- | `nk_dot_i4_serial` | 0.984 gb/s | 0.824 gb/s | 0.957 gb/s |
419
- | `nk_dot_i4_v128relaxed` | 14.8 gb/s | 15.0 gb/s | 17.9 gb/s |
435
+ | `nk_dot_i4_serial` | 0.990 gb/s | 0.923 gb/s | 0.985 gb/s |
436
+ | `nk_dot_i4_v128relaxed` | 15.2 gb/s | 17.9 gb/s | 19.2 gb/s |
420
437
  | __u4__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
421
- | `nk_dot_u4_serial` | 0.988 gb/s | 0.957 gb/s | 0.959 gb/s |
422
- | `nk_dot_u4_v128relaxed` | 30.5 gb/s | 27.5 gb/s | 31.4 gb/s |
438
+ | `nk_dot_u4_serial` | 0.992 gb/s | 0.933 gb/s | 0.988 gb/s |
439
+ | `nk_dot_u4_v128relaxed` | 30.2 gb/s | 32.1 gb/s | 33.8 gb/s |
423
440
  | __u1__ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░░░░░░░░░░ |
424
- | `nk_dot_u1_serial` | 4.85 gb/s | 5.38 gb/s | 5.67 gb/s |
425
- | `nk_dot_u1_v128relaxed` | 21.8 gb/s | 29.8 gb/s | 52.0 gb/s |
441
+ | `nk_dot_u1_serial` | 5.26 gb/s | 5.80 gb/s | 6.48 gb/s |
442
+ | `nk_dot_u1_v128relaxed` | 21.2 gb/s | 47.4 gb/s | 67.3 gb/s |