numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294)
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,262 @@
1
+ # Type Conversions in NumKong
2
+
3
+ NumKong implements bidirectional type conversions between all supported numeric formats through Float32 as a hub type.
4
+ Conversions cover IEEE 754 floats (Float16, Float32, Float64), brain float (BFloat16), mini-float formats (Float8 e4m3 and e5m2, MX e2m3 and e3m2), and integers (Int8–Int64, UInt8–UInt64, packed i4x2/u4x2).
5
+ Narrowing conversions use round-to-nearest-even (RNE); widening conversions are exact wherever the target format has sufficient range and precision.
6
+
7
+ BFloat16 relates to Float32 by truncation with rounding:
8
+
9
+ ```math
10
+ \text{bf16} \approx \text{f32} \gg 16
11
+ ```
12
+
13
+ Ties are broken with RNE: a value exactly halfway between two representable results rounds to the one whose least significant retained bit is zero.
14
+
15
+ Float16 range and precision:
16
+
17
+ ```math
18
+ \text{f16} \in [-65504, 65504], \quad \text{min positive normal} = 2^{-14}
19
+ ```
20
+
21
+ Reformulating as Python pseudocode:
22
+
23
+ ```python
24
+ import numpy as np
25
+
26
+ def cast(a: np.ndarray, target_dtype: np.dtype) -> np.ndarray:
27
+ return a.astype(target_dtype)
28
+ ```
29
+
30
+ ## Input & Output Types
31
+
32
+ Float-to-float conversions:
33
+
34
+ | Input Type | Output Type | Description |
35
+ | ---------- | ----------- | ------------------------------------------ |
36
+ | `f64` | `f32` | 64-bit to 32-bit, narrowing with RNE |
37
+ | `f32` | `f64` | 32-bit to 64-bit, exact widening |
38
+ | `f32` | `f16` | 32-bit to 16-bit half precision |
39
+ | `f16` | `f32` | 16-bit half to 32-bit, exact widening |
40
+ | `f32` | `bf16` | 32-bit to brain float, truncation with RNE |
41
+ | `bf16` | `f32` | Brain float to 32-bit, exact widening |
42
+
43
+ Float-to-Float8 conversions:
44
+
45
+ | Input Type | Output Type | Description |
46
+ | ---------- | ----------- | --------------------------------------------- |
47
+ | `f32` | `e4m3` | 32-bit to Float8: 4 exponent, 3 mantissa bits |
48
+ | `e4m3` | `f32` | Float8 to 32-bit, exact via lookup table |
49
+ | `f32` | `e5m2` | 32-bit to Float8: 5 exponent, 2 mantissa bits |
50
+ | `e5m2` | `f32` | Float8 to 32-bit, exact via lookup table |
51
+ | `f32` | `e2m3` | 32-bit to MX: 2 exponent, 3 mantissa bits |
52
+ | `e2m3` | `f32` | MX to 32-bit, exact via lookup table |
53
+ | `f32` | `e3m2` | 32-bit to MX: 3 exponent, 2 mantissa bits |
54
+ | `e3m2` | `f32` | MX to 32-bit, exact via lookup table |
55
+
56
+ Float-to-integer conversions:
57
+
58
+ | Input Type | Output Type | Description |
59
+ | ---------- | ----------- | ----------------------------------- |
60
+ | `f32` | `i8` | Clamped to [-128, 127], rounded |
61
+ | `f32` | `u8` | Clamped to [0, 255], rounded |
62
+ | `f32` | `i16` | Clamped to [-32768, 32767], rounded |
63
+ | `f32` | `u16` | Clamped to [0, 65535], rounded |
64
+ | `f64` | `i32` | Clamped to Int32 range, rounded |
65
+ | `f64` | `u32` | Clamped to UInt32 range, rounded |
66
+ | `f64` | `i64` | Clamped to Int64 range, rounded |
67
+ | `f64` | `u64` | Clamped to UInt64 range, rounded |
68
+
69
+ Packed sub-byte conversions:
70
+
71
+ | Input Type | Output Type | Description |
72
+ | ---------- | ----------- | ------------------------------------------------ |
73
+ | `i4x2` | `i8` | Signed 4-bit pair to two signed 8-bit values |
74
+ | `u4x2` | `u8` | Unsigned 4-bit pair to two unsigned 8-bit values |
75
+
76
+ ## Optimizations
77
+
78
+ ### Lookup Tables for Mini-Floats
79
+
80
+ `nk_e4m3_to_f32_serial`, `nk_e5m2_to_f32_serial`, `nk_e2m3_to_f32_serial`, `nk_e3m2_to_f32_serial` use 256-entry precomputed lookup tables — each 8-bit input indexes directly into a Float32 result array.
81
+ The reverse direction (`nk_f32_to_e4m3_serial`) uses clamping + rounding: clamp to format range, multiply by scale, round-to-nearest, cast to UInt8.
82
+ SIMD backends (`nk_cast_haswell`, `nk_cast_skylake`) use `VPGATHERDD` to perform 8 or 16 simultaneous table lookups from the same 256-entry table.
83
+ AVX-512 gathers on Skylake achieve a throughput of ~3 cycles per 16-element lookup, versus ~8 cycles on Haswell for 8-element gathers.
84
+
85
+ ### BFloat16 as Truncated Float32
86
+
87
+ `nk_bf16_to_f32_serial` widens by left-shifting the 16 input bits into the upper half of a 32-bit word, zero-filling the low 16 bits — exact, no rounding error, single-cycle on all platforms.
88
+ `nk_f32_to_bf16_serial` right-shifts with round-to-nearest-even: adds a rounding bias of `0x7FFF + ((bits >> 16) & 1)` before truncating, matching the IEEE 754 RNE tie-breaking rule.
89
+ NEON backend uses `vreinterpretq_u16_u8` + `vzip` for zero-extension; Haswell uses `VPSLLD` / `VPSRLD` shifts.
90
+
91
+ ### F16C Hardware Conversion
92
+
93
+ `nk_f16_to_f32_haswell`, `nk_f32_to_f16_haswell` use the F16C extension instructions `VCVTPH2PS` / `VCVTPS2PH` — single-instruction conversion of 8 elements with correct denormal handling, NaN propagation, and RNE rounding.
94
+ The serial fallback (`nk_f16_to_f32_serial`) must handle denormals via explicit exponent/mantissa extraction and conditional re-normalization — ~15 integer ops per element vs 1 instruction with F16C.
95
+ AVX-512 (`nk_cast_skylake`) doubles throughput to 16 elements per instruction.
96
+
97
+ ## Performance
98
+
99
+ The following performance tables were produced by manually running the included internal `nk_bench` tools to measure throughput at different input sizes.
100
+ The input size is controlled by the `NK_DENSE_DIMENSIONS` environment variable, which was set to 256, 1024, and 4096 elements.
101
+ The throughput is measured in GB/s (written `gb/s` in the tables) as the number of bytes read and written per second, with ↓ for downcasts and ↑ for upcasts.
102
+ Each kernel runs for at least 5 seconds per configuration.
103
+ Benchmark threads are pinned to specific cores; on machines with heterogeneous core types (e.g., Apple P/E cores), only the fastest cores are used.
104
+ Workloads that significantly degrade CPU frequencies (Intel AMX, Apple SME) run in separate passes to avoid affecting throughput measurements of other kernels.
105
+
106
+ ### Intel Sapphire Rapids
107
+
108
+ #### Native
109
+
110
+ | Kernel | ↓ 256 | ↓ 1K | ↓ 4K | ↑ 256 | ↑ 1K | ↑ 4K |
111
+ | :----------------- | -----------: | -----------: | -----------: | -----------: | -----------: | -----------: |
112
+ | __f32 ↔ bf16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
113
+ | `nk_cast_serial` | 0.542 gb/s | 0.521 gb/s | 0.553 gb/s | 1.10 gb/s | 1.12 gb/s | 1.17 gb/s |
114
+ | `nk_cast_haswell` | 40.8 gb/s | 52.4 gb/s | 55.1 gb/s | 27.7 gb/s | 43.2 gb/s | 46.3 gb/s |
115
+ | `nk_cast_skylake` | 23.6 gb/s | 44.8 gb/s | 46.8 gb/s | 37.6 gb/s | 60.1 gb/s | 61.3 gb/s |
116
+ | `nk_cast_icelake` | 21.4 gb/s | 26.0 gb/s | 27.2 gb/s | 32.6 gb/s | 39.4 gb/s | 44.3 gb/s |
117
+ | `nk_cast_sapphire` | 21.5 gb/s | 21.1 gb/s | 49.5 gb/s | 39.2 gb/s | 38.3 gb/s | 56.3 gb/s |
118
+ | __f32 ↔ f16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
119
+ | `nk_cast_serial` | 6.28 gb/s | 6.36 gb/s | 6.20 gb/s | 2.93 gb/s | 2.95 gb/s | 2.89 gb/s |
120
+ | `nk_cast_haswell` | 50.2 gb/s | 106 gb/s | 105 gb/s | 31.7 gb/s | 60.2 gb/s | 66.1 gb/s |
121
+ | `nk_cast_skylake` | 38.0 gb/s | 56.6 gb/s | 39.4 gb/s | 39.7 gb/s | 58.3 gb/s | 43.7 gb/s |
122
+ | `nk_cast_icelake` | 51.8 gb/s | 60.2 gb/s | 54.3 gb/s | 52.2 gb/s | 57.7 gb/s | 60.6 gb/s |
123
+ | `nk_cast_sapphire` | 31.8 gb/s | 33.8 gb/s | 38.8 gb/s | 35.0 gb/s | 33.6 gb/s | 51.5 gb/s |
124
+ | __f32 ↔ e5m2__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
125
+ | `nk_cast_serial` | 0.785 gb/s | 0.725 gb/s | 0.569 gb/s | 2.62 gb/s | 2.57 gb/s | 2.69 gb/s |
126
+ | `nk_cast_haswell` | 7.93 gb/s | 8.39 gb/s | 5.44 gb/s | 12.6 gb/s | 17.9 gb/s | 10.6 gb/s |
127
+ | `nk_cast_skylake` | 10.3 gb/s | 10.8 gb/s | 10.0 gb/s | 27.2 gb/s | 28.6 gb/s | 28.0 gb/s |
128
+ | `nk_cast_icelake` | 5.07 gb/s | 4.96 gb/s | 6.08 gb/s | 14.9 gb/s | 13.7 gb/s | 14.5 gb/s |
129
+ | `nk_cast_sapphire` | 7.81 gb/s | 5.25 gb/s | 10.7 gb/s | 24.7 gb/s | 15.2 gb/s | 25.0 gb/s |
130
+ | __f32 ↔ e4m3__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
131
+ | `nk_cast_serial` | 0.653 gb/s | 0.623 gb/s | 0.445 gb/s | 1.51 gb/s | 1.43 gb/s | 1.44 gb/s |
132
+ | `nk_cast_haswell` | 6.74 gb/s | 7.35 gb/s | 6.68 gb/s | 10.4 gb/s | 12.1 gb/s | 7.47 gb/s |
133
+ | `nk_cast_skylake` | 7.70 gb/s | 9.83 gb/s | 9.79 gb/s | 17.3 gb/s | 23.2 gb/s | 22.2 gb/s |
134
+ | `nk_cast_icelake` | 8.51 gb/s | 9.01 gb/s | 9.43 gb/s | 17.8 gb/s | 20.5 gb/s | 21.4 gb/s |
135
+ | `nk_cast_sapphire` | 4.98 gb/s | 4.90 gb/s | 8.56 gb/s | 15.7 gb/s | 11.0 gb/s | 17.1 gb/s |
136
+ | __f32 ↔ e3m2__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
137
+ | `nk_cast_serial` | 0.863 gb/s | 1.44 gb/s | 1.21 gb/s | 2.46 gb/s | 4.20 gb/s | 4.14 gb/s |
138
+ | `nk_cast_haswell` | 4.70 gb/s | 5.04 gb/s | 5.00 gb/s | 7.47 gb/s | 7.82 gb/s | 8.03 gb/s |
139
+ | `nk_cast_skylake` | 6.34 gb/s | 6.37 gb/s | 6.46 gb/s | 14.7 gb/s | 17.6 gb/s | 17.1 gb/s |
140
+ | `nk_cast_icelake` | 5.34 gb/s | 5.10 gb/s | 6.36 gb/s | 13.3 gb/s | 14.2 gb/s | 21.3 gb/s |
141
+ | `nk_cast_sapphire` | 8.78 gb/s | 9.93 gb/s | 7.02 gb/s | 23.0 gb/s | 18.5 gb/s | 20.8 gb/s |
142
+ | __f32 ↔ e2m3__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
143
+ | `nk_cast_serial` | 0.941 gb/s | 1.39 gb/s | 0.688 gb/s | 2.68 gb/s | 4.79 gb/s | 2.70 gb/s |
144
+ | `nk_cast_haswell` | 4.76 gb/s | 4.51 gb/s | 5.00 gb/s | 8.26 gb/s | 8.92 gb/s | 9.02 gb/s |
145
+ | `nk_cast_skylake` | 6.55 gb/s | 6.54 gb/s | 6.42 gb/s | 13.4 gb/s | 15.9 gb/s | 16.1 gb/s |
146
+ | `nk_cast_icelake` | 5.03 gb/s | 6.41 gb/s | 6.44 gb/s | 12.4 gb/s | 14.8 gb/s | 16.2 gb/s |
147
+ | `nk_cast_sapphire` | 9.95 gb/s | 8.90 gb/s | 9.17 gb/s | 19.7 gb/s | 24.1 gb/s | 16.8 gb/s |
148
+ | __f32 ↔ i16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
149
+ | `nk_cast_serial` | 1.99 gb/s | 2.02 gb/s | 2.04 gb/s | 4.59 gb/s | 4.63 gb/s | 4.68 gb/s |
150
+ | `nk_cast_haswell` | 46.4 gb/s | 51.8 gb/s | 53.0 gb/s | 19.8 gb/s | 21.0 gb/s | 21.9 gb/s |
151
+ | `nk_cast_skylake` | 31.0 gb/s | 34.2 gb/s | 36.7 gb/s | 48.7 gb/s | 58.5 gb/s | 61.1 gb/s |
152
+ | __f32 ↔ u16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
153
+ | `nk_cast_serial` | 3.19 gb/s | 3.13 gb/s | 3.14 gb/s | 4.60 gb/s | 4.82 gb/s | 4.75 gb/s |
154
+ | `nk_cast_haswell` | 36.4 gb/s | 43.6 gb/s | 48.4 gb/s | 19.1 gb/s | 20.6 gb/s | 21.2 gb/s |
155
+ | `nk_cast_skylake` | 32.0 gb/s | 36.1 gb/s | 37.3 gb/s | 48.4 gb/s | 55.0 gb/s | 59.5 gb/s |
156
+ | __f32 ↔ i8__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
157
+ | `nk_cast_serial` | 3.22 gb/s | 3.62 gb/s | 3.40 gb/s | 5.41 gb/s | 5.65 gb/s | 5.73 gb/s |
158
+ | `nk_cast_haswell` | 21.6 gb/s | 25.5 gb/s | 27.5 gb/s | 12.8 gb/s | 13.6 gb/s | 14.0 gb/s |
159
+ | `nk_cast_skylake` | 13.0 gb/s | 13.2 gb/s | 13.9 gb/s | 22.1 gb/s | 23.4 gb/s | 22.9 gb/s |
160
+ | `nk_cast_icelake` | 14.2 gb/s | 16.4 gb/s | 21.5 gb/s | 25.4 gb/s | 29.4 gb/s | 34.8 gb/s |
161
+ | `nk_cast_sapphire` | 26.0 gb/s | 27.3 gb/s | 19.5 gb/s | 33.1 gb/s | 48.9 gb/s | 49.4 gb/s |
162
+ | __f32 ↔ u8__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
163
+ | `nk_cast_serial` | 4.44 gb/s | 4.58 gb/s | 5.84 gb/s | 7.45 gb/s | 7.20 gb/s | 4.24 gb/s |
164
+ | `nk_cast_haswell` | 41.2 gb/s | 42.2 gb/s | 41.4 gb/s | 17.9 gb/s | 19.2 gb/s | 20.8 gb/s |
165
+ | `nk_cast_skylake` | 27.8 gb/s | 31.1 gb/s | 33.4 gb/s | 39.8 gb/s | 48.7 gb/s | 51.5 gb/s |
166
+ | __f64 ↔ f32__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
167
+ | `nk_cast_serial` | 11.6 gb/s | 12.2 gb/s | 12.3 gb/s | 12.1 gb/s | 12.9 gb/s | 13.2 gb/s |
168
+ | `nk_cast_skylake` | 52.1 gb/s | 59.4 gb/s | 53.8 gb/s | 54.4 gb/s | 65.9 gb/s | 60.6 gb/s |
169
+ | __f64 ↔ i64__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
170
+ | `nk_cast_serial` | 5.30 gb/s | 5.21 gb/s | 5.21 gb/s | 15.4 gb/s | 16.1 gb/s | 14.0 gb/s |
171
+ | `nk_cast_skylake` | 8.73 gb/s | 9.81 gb/s | 9.03 gb/s | 25.3 gb/s | 26.8 gb/s | 20.3 gb/s |
172
+ | __f64 ↔ u64__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
173
+ | `nk_cast_serial` | 9.17 gb/s | 8.55 gb/s | 8.57 gb/s | 16.3 gb/s | 15.1 gb/s | 15.0 gb/s |
174
+ | `nk_cast_skylake` | 13.8 gb/s | 14.5 gb/s | 15.4 gb/s | 25.5 gb/s | 28.1 gb/s | 19.6 gb/s |
175
+ | __f64 ↔ i32__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
176
+ | `nk_cast_serial` | 3.71 gb/s | 3.97 gb/s | 3.71 gb/s | 11.6 gb/s | 12.3 gb/s | 12.6 gb/s |
177
+ | `nk_cast_skylake` | 38.7 gb/s | 48.1 gb/s | 45.9 gb/s | 54.1 gb/s | 64.2 gb/s | 60.8 gb/s |
178
+ | __f64 ↔ u32__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
179
+ | `nk_cast_serial` | 6.37 gb/s | 6.16 gb/s | 6.08 gb/s | 10.9 gb/s | 11.9 gb/s | 10.3 gb/s |
180
+ | `nk_cast_skylake` | 46.6 gb/s | 48.9 gb/s | 49.5 gb/s | 50.2 gb/s | 60.5 gb/s | 62.3 gb/s |
181
+
182
+ #### WASM
183
+
184
+ Measured with Wasmtime v42 (Cranelift backend).
185
+
186
+ | Kernel | ↓ 256 | ↓ 1K | ↓ 4K | ↑ 256 | ↑ 1K | ↑ 4K |
187
+ | :--------------- | -----------: | -----------: | --------------: | -----------: | -----------: | --------------: |
188
+ | __f32 ↔ bf16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░ |
189
+ | `nk_cast_serial` | ? gb/s | ? gb/s | 1.63 gb/s | ? gb/s | ? gb/s | 2.21 gb/s |
190
+ | __f32 ↔ f16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░ |
191
+ | `nk_cast_serial` | ? gb/s | ? gb/s | 0.436 gb/s | ? gb/s | ? gb/s | 1.19 gb/s |
192
+ | __f32 ↔ e5m2__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░ |
193
+ | `nk_cast_serial` | ? gb/s | ? gb/s | 0.294 gb/s | ? gb/s | ? gb/s | 1.45 gb/s |
194
+ | __f32 ↔ e4m3__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░░░░ |
195
+ | `nk_cast_serial` | ? gb/s | ? gb/s | 0.239 gb/s | ? gb/s | ? gb/s | 0.746 gb/s |
196
+
197
+ ### Apple M4
198
+
199
+ #### Native
200
+
201
+ | Kernel | ↓ 256 | ↓ 1K | ↓ 4K | ↑ 256 | ↑ 1K | ↑ 4K |
202
+ | :--------------- | -----------: | -----------: | -----------: | -----------: | -----------: | -----------: |
203
+ | __f32 ↔ bf16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
204
+ | `nk_cast_serial` | 10.2 gb/s | 10.6 gb/s | 10.7 gb/s | 8.06 gb/s | 8.34 gb/s | 8.32 gb/s |
205
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
206
+ | __f32 ↔ f16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
207
+ | `nk_cast_serial` | 10.9 gb/s | 11.3 gb/s | 11.4 gb/s | 8.40 gb/s | 8.62 gb/s | 8.70 gb/s |
208
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
209
+ | __f32 ↔ e5m2__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
210
+ | `nk_cast_serial` | 1.65 gb/s | 1.52 gb/s | 1.36 gb/s | 5.96 gb/s | 6.08 gb/s | 6.11 gb/s |
211
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
212
+ | __f32 ↔ e4m3__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
213
+ | `nk_cast_serial` | 1.49 gb/s | 1.36 gb/s | 1.24 gb/s | 4.96 gb/s | 5.05 gb/s | 4.81 gb/s |
214
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
215
+ | __f32 ↔ e3m2__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
216
+ | `nk_cast_serial` | 2.17 gb/s | 2.13 gb/s | 1.97 gb/s | 5.90 gb/s | 6.02 gb/s | 6.07 gb/s |
217
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
218
+ | __f32 ↔ e2m3__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
219
+ | `nk_cast_serial` | 2.54 gb/s | 2.45 gb/s | 2.23 gb/s | 5.88 gb/s | 6.11 gb/s | 6.10 gb/s |
220
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
221
+ | __f32 ↔ i16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
222
+ | `nk_cast_serial` | 6.13 gb/s | 5.99 gb/s | 6.10 gb/s | 8.29 gb/s | 8.53 gb/s | 8.58 gb/s |
223
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
224
+ | __f32 ↔ u16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
225
+ | `nk_cast_serial` | 5.36 gb/s | 5.01 gb/s | 4.49 gb/s | 8.43 gb/s | 8.64 gb/s | 8.76 gb/s |
226
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
227
+ | __f32 ↔ i8__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
228
+ | `nk_cast_serial` | 4.83 gb/s | 4.89 gb/s | 5.09 gb/s | 6.67 gb/s | 6.92 gb/s | 7.08 gb/s |
229
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
230
+ | __f32 ↔ u8__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
231
+ | `nk_cast_serial` | 4.31 gb/s | 4.10 gb/s | 3.62 gb/s | 7.03 gb/s | 7.21 gb/s | 7.28 gb/s |
232
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
233
+ | __f64 ↔ f32__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
234
+ | `nk_cast_serial` | 17.3 gb/s | 17.8 gb/s | 18.1 gb/s | 17.9 gb/s | 18.5 gb/s | 18.5 gb/s |
235
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
236
+ | __f64 ↔ i64__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
237
+ | `nk_cast_serial` | 16.8 gb/s | 17.2 gb/s | 17.0 gb/s | 23.9 gb/s | 24.7 gb/s | 24.8 gb/s |
238
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
239
+ | __f64 ↔ u64__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
240
+ | `nk_cast_serial` | 13.5 gb/s | 12.8 gb/s | 11.3 gb/s | 24.4 gb/s | 25.0 gb/s | 25.1 gb/s |
241
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
242
+ | __f64 ↔ i32__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
243
+ | `nk_cast_serial` | 12.1 gb/s | 12.4 gb/s | 12.6 gb/s | 18.2 gb/s | 18.9 gb/s | 19.2 gb/s |
244
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
245
+ | __f64 ↔ u32__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
246
+ | `nk_cast_serial` | 10.9 gb/s | 10.6 gb/s | 9.58 gb/s | 17.6 gb/s | 18.0 gb/s | 18.1 gb/s |
247
+ | `nk_cast_neon` | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s | ? gb/s |
248
+
249
+ #### WASM
250
+
251
+ Measured with Wasmtime v42 (Cranelift backend).
252
+
253
+ | Kernel | ↓ 256 | ↓ 1K | ↓ 4K | ↑ 256 | ↑ 1K | ↑ 4K |
254
+ | :--------------- | -----------: | -----------: | -----------: | -----------: | -----------: | -----------: |
255
+ | __f32 ↔ bf16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
256
+ | `nk_cast_serial` | 3.39 gb/s | 7.29 gb/s | 11.2 gb/s | 3.08 gb/s | 6.20 gb/s | 8.81 gb/s |
257
+ | __f32 ↔ f16__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
258
+ | `nk_cast_serial` | 0.605 gb/s | 0.952 gb/s | 1.22 gb/s | 2.36 gb/s | 4.71 gb/s | 7.31 gb/s |
259
+ | __f32 ↔ e5m2__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
260
+ | `nk_cast_serial` | 0.752 gb/s | 1.84 gb/s | 1.80 gb/s | 2.24 gb/s | 6.32 gb/s | 6.31 gb/s |
261
+ | __f32 ↔ e4m3__ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ | ░░░░░░░░░░░░ |
262
+ | `nk_cast_serial` | 0.623 gb/s | 1.61 gb/s | 1.50 gb/s | 1.68 gb/s | 4.35 gb/s | 4.28 gb/s |